diff options
Diffstat (limited to 'bolt/lib')
| -rw-r--r-- | bolt/lib/Core/BinaryContext.cpp | 19 | ||||
| -rw-r--r-- | bolt/lib/Core/BinaryEmitter.cpp | 8 | ||||
| -rw-r--r-- | bolt/lib/Core/BinaryFunction.cpp | 12 | ||||
| -rw-r--r-- | bolt/lib/Core/DIEBuilder.cpp | 22 | ||||
| -rw-r--r-- | bolt/lib/Core/DebugNames.cpp | 7 | ||||
| -rw-r--r-- | bolt/lib/Core/FunctionLayout.cpp | 4 | ||||
| -rw-r--r-- | bolt/lib/Core/GDBIndex.cpp | 7 | ||||
| -rw-r--r-- | bolt/lib/Passes/ADRRelaxationPass.cpp | 1 | ||||
| -rw-r--r-- | bolt/lib/Passes/BinaryPasses.cpp | 2 | ||||
| -rw-r--r-- | bolt/lib/Passes/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | bolt/lib/Passes/ContinuityStats.cpp | 250 | ||||
| -rw-r--r-- | bolt/lib/Passes/IdenticalCodeFolding.cpp | 12 | ||||
| -rw-r--r-- | bolt/lib/Passes/LongJmp.cpp | 440 | ||||
| -rw-r--r-- | bolt/lib/Passes/PatchEntries.cpp | 15 | ||||
| -rw-r--r-- | bolt/lib/Passes/VeneerElimination.cpp | 43 | ||||
| -rw-r--r-- | bolt/lib/Profile/BoltAddressTranslation.cpp | 22 | ||||
| -rw-r--r-- | bolt/lib/Profile/YAMLProfileReader.cpp | 6 | ||||
| -rw-r--r-- | bolt/lib/Rewrite/BinaryPassManager.cpp | 9 | ||||
| -rw-r--r-- | bolt/lib/Rewrite/DWARFRewriter.cpp | 36 | ||||
| -rw-r--r-- | bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 20 | ||||
| -rw-r--r-- | bolt/lib/Rewrite/RewriteInstance.cpp | 42 |
21 files changed, 310 insertions, 668 deletions
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index ba2de6ce2b28..1347047e1b70 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1609,18 +1609,10 @@ std::vector<BinaryFunction *> BinaryContext::getSortedFunctions() { llvm::stable_sort(SortedFunctions, [](const BinaryFunction *A, const BinaryFunction *B) { - // Place hot text movers at the start. - if (A->isHotTextMover() && !B->isHotTextMover()) - return true; - if (!A->isHotTextMover() && B->isHotTextMover()) - return false; if (A->hasValidIndex() && B->hasValidIndex()) { return A->getIndex() < B->getIndex(); } - if (opts::HotFunctionsAtEnd) - return B->hasValidIndex(); - else - return A->hasValidIndex(); + return A->hasValidIndex(); }); return SortedFunctions; } @@ -2362,15 +2354,6 @@ BinaryContext::createInjectedBinaryFunction(const std::string &Name, return BF; } -BinaryFunction * -BinaryContext::createThunkBinaryFunction(const std::string &Name) { - ThunkBinaryFunctions.push_back(new BinaryFunction(Name, *this, true)); - BinaryFunction *BF = ThunkBinaryFunctions.back(); - setSymbolToFunctionMap(BF->getSymbol(), BF); - BF->CurrentState = BinaryFunction::State::CFG; - return BF; -} - std::pair<size_t, size_t> BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) { // Adjust branch instruction to match the current layout. diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 89043db03102..f6dfa249f9a9 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -258,14 +258,6 @@ void BinaryEmitter::emitFunctions() { if (Emitted) Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics); - - // Emit thunks. - if (BC.getThunkLocation() != Function) - continue; - - for (BinaryFunction *Thunk : BC.getThunkBinaryFunctions()) { - emitFunction(*Thunk, Thunk->getLayout().getMainFragment()); - } } }; diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 27c8ccefedee..36c42fced93d 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -111,10 +111,6 @@ cl::opt<bool> cl::desc("try to preserve basic block alignment"), cl::cat(BoltOptCategory)); -static cl::opt<bool> PrintOffsets("print-offsets", - cl::desc("print basic block offsets"), - cl::Hidden, cl::cat(BoltOptCategory)); - static cl::opt<bool> PrintOutputAddressRange( "print-output-address-range", cl::desc( @@ -545,11 +541,6 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { if (BB->isLandingPad()) OS << " Landing Pad\n"; - if (opts::PrintOffsets && BB->getOutputStartAddress()) { - OS << " OutputOffset: 0x" - << Twine::utohexstr(BB->getOutputStartAddress()) << '\n'; - } - uint64_t BBExecCount = BB->getExecutionCount(); if (hasValidProfile()) { OS << " Exec Count : "; @@ -4562,9 +4553,6 @@ void BinaryFunction::printLoopInfo(raw_ostream &OS) const { } bool BinaryFunction::isAArch64Veneer() const { - if (hasNameRegex("__AArch64.*Thunk.*")) - return true; - if (empty() || hasIslandsInfo()) return false; diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index fa79d60ac01e..69cfd58a1df0 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -24,7 +24,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/Timer.h" #include <algorithm> #include <cstdint> @@ -39,8 +38,7 @@ #define DEBUG_TYPE "bolt" namespace opts { extern cl::opt<unsigned> Verbosity; -extern cl::opt<bool> TimeDebug; -} // namespace opts +} namespace llvm { namespace bolt { @@ -351,8 +349,6 @@ void DIEBuilder::buildCompileUnits(const bool Init) { } } void DIEBuilder::buildCompileUnits(const std::vector<DWARFUnit *> &CUs) { - NamedRegionTimer T("buildcompileunits", "build compile units", "debug", - "update debug info", opts::TimeDebug); BuilderState.reset(new State()); // Allocating enough for current batch being processed. // In real use cases we either processing a batch of CUs with no cross @@ -560,8 +556,6 @@ void DIEBuilder::populateDebugNamesTable( } void DIEBuilder::updateDebugNamesTable() { - NamedRegionTimer T("updatedebugnames", "update debug_names table", - "debug", "update debug info", opts::TimeDebug); auto finalizeDebugNamesTableForCU = [&](DWARFUnit &CU, uint64_t &UnitStartOffset) -> void { DIE *UnitDIE = getUnitDIEbyUnit(CU); @@ -572,14 +566,18 @@ void DIEBuilder::updateDebugNamesTable() { UnitStartOffset += CurUnitInfo.UnitLength; }; - auto It = llvm::partition_point(getState().DUList, [](DWARFUnit *CU) { - return CU->getVersion() < 5 && CU->isTypeUnit(); - }); uint64_t TypeUnitStartOffset = 0; - for (DWARFUnit *CU : llvm::make_range(getState().DUList.begin(), It)) + for (DWARFUnit *CU : getState().DUList) { + if (!(CU->getVersion() < 5 && CU->isTypeUnit())) + break; finalizeDebugNamesTableForCU(*CU, TypeUnitStartOffset); - for (DWARFUnit *CU : llvm::make_range(It, getState().DUList.end())) + } + + for (DWARFUnit *CU : getState().DUList) { + if (CU->getVersion() < 5 && CU->isTypeUnit()) + continue; finalizeDebugNamesTableForCU(*CU, DebugNamesUnitSize); + } updateReferences(); } diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp index d014c2c2fbcf..640b29ec36d5 100644 --- a/bolt/lib/Core/DebugNames.cpp +++ b/bolt/lib/Core/DebugNames.cpp @@ -10,16 +10,11 @@ #include "bolt/Core/BinaryContext.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/Timer.h" #include <cstdint> #include <optional> -namespace opts { -extern llvm::cl::opt<bool> TimeDebug; -} // namespace opts namespace llvm { namespace bolt { DWARF5AcceleratorTable::DWARF5AcceleratorTable( @@ -745,8 +740,6 @@ void DWARF5AcceleratorTable::emitAugmentationString() const { void DWARF5AcceleratorTable::emitAccelTable() { if (!NeedToCreate) return; - NamedRegionTimer T("emitAccelTable", "Emit Accelerator Table", - "debug", "Update Debug Info", opts::TimeDebug); finalize(); populateAbbrevsMap(); writeEntries(); diff --git a/bolt/lib/Core/FunctionLayout.cpp b/bolt/lib/Core/FunctionLayout.cpp index 4498fc44da95..15e6127ad2e9 100644 --- a/bolt/lib/Core/FunctionLayout.cpp +++ b/bolt/lib/Core/FunctionLayout.cpp @@ -33,9 +33,7 @@ FunctionFragment::const_iterator FunctionFragment::end() const { return const_iterator(Layout->block_begin() + StartIndex + Size); } -BinaryBasicBlock *FunctionFragment::front() const { return *begin(); } - -BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); } +const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); } FunctionLayout::FunctionLayout() { addFragment(); } diff --git a/bolt/lib/Core/GDBIndex.cpp b/bolt/lib/Core/GDBIndex.cpp index a3aecf4651bc..c7fb4889646b 100644 --- a/bolt/lib/Core/GDBIndex.cpp +++ b/bolt/lib/Core/GDBIndex.cpp @@ -7,12 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Core/GDBIndex.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Timer.h" -namespace opts { -extern llvm::cl::opt<bool> TimeDebug; -} // namespace opts using namespace llvm::bolt; using namespace llvm::support::endian; @@ -28,8 +23,6 @@ void GDBIndex::updateGdbIndexSection( DebugARangesSectionWriter &ARangesSectionWriter) { if (!BC.getGdbIndexSection()) return; - NamedRegionTimer T("updateGdbIndex", "Update gdb_index Section", - "debug", "Update Debug Info", opts::TimeDebug); // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html // for .gdb_index section format. diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/ADRRelaxationPass.cpp index a7f99b6bb547..52811edcb827 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/ADRRelaxationPass.cpp @@ -63,6 +63,7 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); if (TargetBF == &BF && !BB.isSplit()) continue; + // No relaxation needed if ADR references a basic block in the same // fragment. if (BinaryBasicBlock *TargetBB = BF.getBasicBlockForLabel(Symbol)) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index ca69667f9c85..fa95ad7324ac 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1264,8 +1264,6 @@ Error AssignSections::runOnFunctions(BinaryContext &BC) { if (opts::isHotTextMover(Function)) { Function.setCodeSectionName(BC.getHotTextMoverSectionName()); Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); - // TODO: find a better place to mark a function as a mover. - Function.setHotTextMover(true); continue; } diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index 407d8b03f739..1c1273b3d242 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -26,6 +26,7 @@ add_llvm_library(LLVMBOLTPasses PatchEntries.cpp PettisAndHansen.cpp PLTCall.cpp + ContinuityStats.cpp RegAnalysis.cpp RegReAssign.cpp ReorderAlgorithm.cpp diff --git a/bolt/lib/Passes/ContinuityStats.cpp b/bolt/lib/Passes/ContinuityStats.cpp new file mode 100644 index 000000000000..b32365b59065 --- /dev/null +++ b/bolt/lib/Passes/ContinuityStats.cpp @@ -0,0 +1,250 @@ +//===- bolt/Passes/ContinuityStats.cpp --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the continuity stats calculation pass. +// +//===----------------------------------------------------------------------===// + +#include "bolt/Passes/ContinuityStats.h" +#include "bolt/Core/BinaryBasicBlock.h" +#include "bolt/Core/BinaryFunction.h" +#include "bolt/Utils/CommandLineOpts.h" +#include "llvm/Support/CommandLine.h" +#include <queue> +#include <unordered_map> +#include <unordered_set> + +#define DEBUG_TYPE "bolt-opts" + +using namespace llvm; +using namespace bolt; + +namespace opts { +extern cl::opt<unsigned> Verbosity; +cl::opt<unsigned> NumFunctionsForContinuityCheck( + "num-functions-for-continuity-check", + cl::desc("number of hottest functions to print aggregated " + "CFG discontinuity stats of."), + cl::init(1000), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); +} // namespace opts + +namespace { +using FunctionListType = std::vector<const BinaryFunction *>; +using function_iterator = FunctionListType::iterator; + +template <typename T> +void printDistribution(raw_ostream &OS, std::vector<T> &values, + bool Fraction = false) { + if (values.empty()) + return; + // Sort values from largest to smallest and print the MAX, TOP 1%, 5%, 10%, + // 20%, 50%, 80%, MIN. If Fraction is true, then values are printed as + // fractions instead of integers. + std::sort(values.begin(), values.end()); + + auto printLine = [&](std::string Text, double Percent) { + int Rank = int(values.size() * (1.0 - Percent / 100)); + if (Percent == 0) + Rank = values.size() - 1; + if (Fraction) + OS << " " << Text << std::string(9 - Text.length(), ' ') << ": " + << format("%.2lf%%", values[Rank] * 100) << "\n"; + else + OS << " " << Text << std::string(9 - Text.length(), ' ') << ": " + << values[Rank] << "\n"; + }; + + printLine("MAX", 0); + const int percentages[] = {1, 5, 10, 20, 50, 80}; + for (size_t i = 0; i < sizeof(percentages) / sizeof(percentages[0]); ++i) { + printLine("TOP " + std::to_string(percentages[i]) + "%", percentages[i]); + } + printLine("MIN", 100); +} + +void printCFGContinuityStats(raw_ostream &OS, + iterator_range<function_iterator> &Functions) { + // Given a perfect profile, every positive-execution-count BB should be + // connected to an entry of the function through a positive-execution-count + // directed path in the control flow graph. + std::vector<size_t> NumUnreachables; + std::vector<size_t> SumECUnreachables; + std::vector<double> FractionECUnreachables; + + for (auto it = Functions.begin(); it != Functions.end(); ++it) { + const BinaryFunction *Function = *it; + if (Function->size() <= 1) + continue; + + // Compute the sum of all BB execution counts (ECs). + size_t NumPosECBBs = 0; + size_t SumAllBBEC = 0; + for (const BinaryBasicBlock &BB : *Function) { + const size_t BBEC = BB.getKnownExecutionCount(); + NumPosECBBs += BBEC > 0 ? 1 : 0; + SumAllBBEC += BBEC; + } + + // Perform BFS on subgraph of CFG induced by positive weight edges. + // Compute the number of BBs reachable from the entry(s) of the function and + // the sum of their execution counts (ECs). + std::unordered_map<unsigned, const BinaryBasicBlock *> IndexToBB; + std::unordered_set<unsigned> Visited; + std::queue<unsigned> Queue; + for (const BinaryBasicBlock &BB : *Function) { + // Make sure BB.getIndex() is not already in IndexToBB. + assert(IndexToBB.find(BB.getIndex()) == IndexToBB.end()); + IndexToBB[BB.getIndex()] = &BB; + if (BB.isEntryPoint() && BB.getKnownExecutionCount() > 0) { + Queue.push(BB.getIndex()); + Visited.insert(BB.getIndex()); + } + } + while (!Queue.empty()) { + const unsigned BBIndex = Queue.front(); + const BinaryBasicBlock *BB = IndexToBB[BBIndex]; + Queue.pop(); + auto SuccBIIter = BB->branch_info_begin(); + for (const BinaryBasicBlock *Succ : BB->successors()) { + const uint64_t Count = SuccBIIter->Count; + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { + ++SuccBIIter; + continue; + } + if (!Visited.insert(Succ->getIndex()).second) { + ++SuccBIIter; + continue; + } + Queue.push(Succ->getIndex()); + ++SuccBIIter; + } + } + + const size_t NumReachableBBs = Visited.size(); + + // Loop through Visited, and sum the corresponding BBs' execution counts + // (ECs). + size_t SumReachableBBEC = 0; + for (const unsigned BBIndex : Visited) { + const BinaryBasicBlock *BB = IndexToBB[BBIndex]; + SumReachableBBEC += BB->getKnownExecutionCount(); + } + + const size_t NumPosECBBsUnreachableFromEntry = + NumPosECBBs - NumReachableBBs; + const size_t SumUnreachableBBEC = SumAllBBEC - SumReachableBBEC; + const double FractionECUnreachable = + (double)SumUnreachableBBEC / SumAllBBEC; + + if (opts::Verbosity >= 2 && FractionECUnreachable >= 0.05) { + OS << "Non-trivial CFG discontinuity observed in function " + << Function->getPrintName() << "\n"; + LLVM_DEBUG(Function->dump()); + } + + NumUnreachables.push_back(NumPosECBBsUnreachableFromEntry); + SumECUnreachables.push_back(SumUnreachableBBEC); + FractionECUnreachables.push_back(FractionECUnreachable); + } + + if (FractionECUnreachables.empty()) + return; + + std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end()); + const int Rank = int(FractionECUnreachables.size() * 0.95); + OS << format("top 5%% function CFG discontinuity is %.2lf%%\n", + FractionECUnreachables[Rank] * 100); + + if (opts::Verbosity >= 1) { + OS << "abbreviations: EC = execution count, POS BBs = positive EC BBs\n" + << "distribution of NUM(unreachable POS BBs) among all focal " + "functions\n"; + printDistribution(OS, NumUnreachables); + + OS << "distribution of SUM_EC(unreachable POS BBs) among all focal " + "functions\n"; + printDistribution(OS, SumECUnreachables); + + OS << "distribution of [(SUM_EC(unreachable POS BBs) / SUM_EC(all " + "POS BBs))] among all focal functions\n"; + printDistribution(OS, FractionECUnreachables, /*Fraction=*/true); + } +} + +void printAll(BinaryContext &BC, FunctionListType &ValidFunctions, + size_t NumTopFunctions) { + // Sort the list of functions by execution counts (reverse). + llvm::sort(ValidFunctions, + [&](const BinaryFunction *A, const BinaryFunction *B) { + return A->getKnownExecutionCount() > B->getKnownExecutionCount(); + }); + + const size_t RealNumTopFunctions = + std::min(NumTopFunctions, ValidFunctions.size()); + + iterator_range<function_iterator> Functions( + ValidFunctions.begin(), ValidFunctions.begin() + RealNumTopFunctions); + + BC.outs() << format("BOLT-INFO: among the hottest %zu functions ", + RealNumTopFunctions); + printCFGContinuityStats(BC.outs(), Functions); + + // Print more detailed bucketed stats if requested. + if (opts::Verbosity >= 1 && RealNumTopFunctions >= 5) { + const size_t PerBucketSize = RealNumTopFunctions / 5; + BC.outs() << format( + "Detailed stats for 5 buckets, each with %zu functions:\n", + PerBucketSize); + + // For each bucket, print the CFG continuity stats of the functions in the + // bucket. + for (size_t BucketIndex = 0; BucketIndex < 5; ++BucketIndex) { + const size_t StartIndex = BucketIndex * PerBucketSize; + const size_t EndIndex = StartIndex + PerBucketSize; + iterator_range<function_iterator> Functions( + ValidFunctions.begin() + StartIndex, + ValidFunctions.begin() + EndIndex); + const size_t MaxFunctionExecutionCount = + ValidFunctions[StartIndex]->getKnownExecutionCount(); + const size_t MinFunctionExecutionCount = + ValidFunctions[EndIndex - 1]->getKnownExecutionCount(); + BC.outs() << format("----------------\n| Bucket %zu: " + "|\n----------------\n", + BucketIndex + 1) + << format( + "execution counts of the %zu functions in the bucket: " + "%zu-%zu\n", + EndIndex - StartIndex, MinFunctionExecutionCount, + MaxFunctionExecutionCount); + printCFGContinuityStats(BC.outs(), Functions); + } + } +} +} // namespace + +bool PrintContinuityStats::shouldOptimize(const BinaryFunction &BF) const { + if (BF.empty() || !BF.hasValidProfile()) + return false; + + return BinaryFunctionPass::shouldOptimize(BF); +} + +Error PrintContinuityStats::runOnFunctions(BinaryContext &BC) { + // Create a list of functions with valid profiles. + FunctionListType ValidFunctions; + for (const auto &BFI : BC.getBinaryFunctions()) { + const BinaryFunction *Function = &BFI.second; + if (PrintContinuityStats::shouldOptimize(*Function)) + ValidFunctions.push_back(Function); + } + if (ValidFunctions.empty() || opts::NumFunctionsForContinuityCheck == 0) + return Error::success(); + + printAll(BC, ValidFunctions, opts::NumFunctionsForContinuityCheck); + return Error::success(); +} diff --git a/bolt/lib/Passes/IdenticalCodeFolding.cpp b/bolt/lib/Passes/IdenticalCodeFolding.cpp index 8a8fa0639ec2..38e080c9dd62 100644 --- a/bolt/lib/Passes/IdenticalCodeFolding.cpp +++ b/bolt/lib/Passes/IdenticalCodeFolding.cpp @@ -44,18 +44,6 @@ TimeICF("time-icf", cl::cat(BoltOptCategory)); } // namespace opts -bool IdenticalCodeFolding::shouldOptimize(const BinaryFunction &BF) const { - if (BF.hasUnknownControlFlow()) - return false; - if (BF.isFolded()) - return false; - if (BF.hasSDTMarker()) - return false; - if (BF.isPseudo()) - return false; - return BinaryFunctionPass::shouldOptimize(BF); -} - /// Compare two jump tables in 2 functions. The function relies on consistent /// ordering of basic blocks in both binary functions (e.g. DFS). static bool equalJumpTables(const JumpTable &JumpTableA, diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp index 932d4fd7508f..c483f70a836e 100644 --- a/bolt/lib/Passes/LongJmp.cpp +++ b/bolt/lib/Passes/LongJmp.cpp @@ -11,8 +11,6 @@ //===----------------------------------------------------------------------===// #include "bolt/Passes/LongJmp.h" -#include "bolt/Core/ParallelUtilities.h" -#include "llvm/Support/MathExtras.h" #define DEBUG_TYPE "longjmp" @@ -25,11 +23,6 @@ extern cl::opt<unsigned> AlignFunctions; extern cl::opt<bool> UseOldText; extern cl::opt<bool> HotFunctionsAtEnd; -static cl::opt<bool> - ExperimentalRelaxation("relax-exp", - cl::desc("run experimental relaxation pass"), - cl::init(false), cl::cat(BoltOptCategory)); - static cl::opt<bool> GroupStubs("group-stubs", cl::desc("share stubs across functions"), cl::init(true), cl::cat(BoltOptCategory)); @@ -68,10 +61,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) { if (Next != E && (*Next)->isCold()) return *I; } - llvm_unreachable("No hot-cold split point found"); + llvm_unreachable("No hot-colt split point found"); } -static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) { +static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) { return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) && !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst); } @@ -572,7 +565,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) { if (BC.MIB->isPseudo(Inst)) continue; - if (!mayNeedStub(BC, Inst)) { + if (!shouldInsertStub(BC, Inst)) { DotAddress += InsnSize; continue; } @@ -636,434 +629,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) { return Error::success(); } -// Relax internal branches with the assumption that they are not separated by -// more than 128MB after the function is split into fragments. -void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) { - BinaryContext &BC = BF.getBinaryContext(); - auto &MIB = BC.MIB; - - if (!BF.isSimple()) - return; - - // Quick path. - if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan) - return; - - auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) { - const unsigned Bits = MIB->getPCRelEncodingSize(Inst); - return isIntN(Bits, Offset); - }; - - auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress, - const BinaryBasicBlock &BB) { - const int64_t Offset = BB.getOutputStartAddress() - InstAddress; - return isBranchOffsetInRange(Inst, Offset); - }; - - // Keep track of all function trampolines that are going to be added to the - // function layout at the end of relaxation. - std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>> - FunctionTrampolines; - - // Fragments are relaxed independently. - for (FunctionFragment &FF : BF.getLayout().fragments()) { - // Fill out code size estimation for the fragment. Use output BB address - // range for offsets from the start of the function. - uint64_t CodeSize = 0; - for (BinaryBasicBlock *BB : FF) { - BB->setOutputStartAddress(CodeSize); - CodeSize += BB->estimateSize(); - BB->setOutputEndAddress(CodeSize); - } - - // Dynamically-updated size of the fragment. - uint64_t FragmentSize = CodeSize; - - // Trampolines created for the fragment. DestinationBB -> TrampolineBB. - DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines; - - // Create a trampoline code after \p BB or at the end of the fragment if BB - // is nullptr. - auto addTrampolineAfter = [&](BinaryBasicBlock *BB, - BinaryBasicBlock *TargetBB, uint64_t Count, - bool UpdateOffsets = true) { - std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock(); - MCInst Inst; - { - auto L = BC.scopeLock(); - MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get()); - } - TrampolineBB->addInstruction(Inst); - TrampolineBB->addSuccessor(TargetBB, Count); - TrampolineBB->setExecutionCount(Count); - const uint64_t TrampolineAddress = - BB ? BB->getOutputEndAddress() : FragmentSize; - TrampolineBB->setOutputStartAddress(TrampolineAddress); - TrampolineBB->setOutputEndAddress(TrampolineAddress + InstSize); - TrampolineBB->setFragmentNum(FF.getFragmentNum()); - - if (UpdateOffsets) { - FragmentSize += InstSize; - for (BinaryBasicBlock *TBB : FF) { - if (TBB->getOutputStartAddress() >= TrampolineAddress) { - TBB->setOutputStartAddress(TBB->getOutputStartAddress() + InstSize); - TBB->setOutputEndAddress(TBB->getOutputEndAddress() + InstSize); - } - } - for (auto &Pair : FunctionTrampolines) { - BinaryBasicBlock *TBB = Pair.second.get(); - if (TBB->getFragmentNum() != TrampolineBB->getFragmentNum()) - continue; - if (TBB == TrampolineBB.get()) - continue; - if (TBB->getOutputStartAddress() >= TrampolineAddress) { - TBB->setOutputStartAddress(TBB->getOutputStartAddress() + InstSize); - TBB->setOutputEndAddress(TBB->getOutputEndAddress() + InstSize); - } - } - } - - if (!FragmentTrampolines.lookup(TargetBB)) - FragmentTrampolines[TargetBB] = TrampolineBB.get(); - FunctionTrampolines.emplace_back(BB ? BB : FF.back(), - std::move(TrampolineBB)); - return FunctionTrampolines.back().second.get(); - }; - - // Pre-populate trampolines by splitting unconditional branches from the - // containing basic block. - for (BinaryBasicBlock *BB : FF) { - MCInst *Inst = BB->getLastNonPseudoInstr(); - if (!Inst || !MIB->isUnconditionalBranch(*Inst)) - continue; - - const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst); - BB->eraseInstruction(BB->findInstruction(Inst)); - BB->setOutputEndAddress(BB->getOutputEndAddress() - InstSize); - - BinaryBasicBlock::BinaryBranchInfo BI; - BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI); - - BinaryBasicBlock *TrampolineBB = - addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false); - BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count); - } - - /// Relax the branch \p Inst. Return true if basic block offsets need an - /// update after the trampoline insertion. - auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst, - uint64_t InstAddress, BinaryBasicBlock *TargetBB) { - BinaryFunction *BF = BB->getParent(); - - // Branch taken count for optimal relaxation. - const uint64_t Count = BB->getBranchInfo(*TargetBB).Count; - assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE && - "Expected valid branch execution count"); - - // Try to reuse an existing trampoline without introducing any new code. - BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB); - if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) { - BB->replaceSuccessor(TargetBB, TrampolineBB, Count); - TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() + - Count); - auto L = BC.scopeLock(); - MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get()); - return; - } - - // For cold branches, check if we can introduce a trampoline at the end - // of the fragment that is within the branch reach. Note that such - // trampoline may become unreachable and may need further relaxation. - const int64_t OffsetToEnd = FragmentSize - InstAddress; - if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) { - TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count); - BB->replaceSuccessor(TargetBB, TrampolineBB, Count); - auto L = BC.scopeLock(); - MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get()); - - return; - } - - // Insert a new block after the current one and use it as a trampoline. - // If the other successor is a fallthrough invert the condition code. - TrampolineBB = addTrampolineAfter(BB, TargetBB, Count); - - // Check if there's a fallthrough block. - const BinaryBasicBlock *const NextBB = - BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false); - if (BB->getConditionalSuccessor(false) == NextBB) { - BB->swapConditionalSuccessors(); - auto L = BC.scopeLock(); - MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get()); - } else { - auto L = BC.scopeLock(); - MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get()); - } - BB->replaceSuccessor(TargetBB, TrampolineBB, Count); - }; - - bool MayNeedRelaxation; - uint64_t NumIterations = 0; - do { - MayNeedRelaxation = false; - ++NumIterations; - for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) { - BinaryBasicBlock *BB = *BBI; - uint64_t NextInstOffset = BB->getOutputStartAddress(); - for (MCInst &Inst : *BB) { - const size_t InstAddress = NextInstOffset; - if (!MIB->isPseudo(Inst)) - NextInstOffset += 4; - - if (!mayNeedStub(BF.getBinaryContext(), Inst)) - continue; - - const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst); - - // Span of +/-128MB. - if (BitsAvailable == LongestJumpBits) - continue; - - const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst); - BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol); - assert(TargetBB && - "Basic block target expected for conditional branch."); - - // Check if the relaxation is needed. - if (TargetBB->getFragmentNum() == FF.getFragmentNum() && - isBlockInRange(Inst, InstAddress, *TargetBB)) - continue; - - relaxBranch(BB, Inst, InstAddress, TargetBB); - - MayNeedRelaxation = true; - } - } - - // We may have added new instructions, but the whole fragment is less than - // the minimum branch span. - if (FragmentSize < ShortestJumpSpan) - MayNeedRelaxation = false; - - } while (MayNeedRelaxation); - - LLVM_DEBUG({ - if (NumIterations > 2) { - dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get() - << " in " << NumIterations << " iterations in " << BF << '\n'; - } - }); - } - - // Add trampoline blocks from all fragments to the layout. - DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>> - Insertions; - for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair : - FunctionTrampolines) { - if (!Pair.second) - continue; - Insertions[Pair.first].emplace_back(std::move(Pair.second)); - } - - for (auto &Pair : Insertions) { - BF.insertBasicBlocks(Pair.first, std::move(Pair.second), - /*UpdateLayout*/ true, /*UpdateCFI*/ true, - /*RecomputeLPs*/ false); - } -} - -void LongJmpPass::relaxCalls(BinaryContext &BC) { - // Map every function to its direct callees. Note that this is different from - // a typical call graph as we completely ignore indirect calls. - uint64_t EstimatedSize = 0; - // Conservatively estimate emitted function size. - auto estimateFunctionSize = [&](const BinaryFunction &BF) -> uint64_t { - if (!BC.shouldEmit(BF)) - return 0; - uint64_t Size = BF.estimateSize(); - if (BF.hasValidIndex()) - Size += BF.getAlignment(); - if (BF.hasIslandsInfo()) { - Size += BF.estimateConstantIslandSize(); - Size += BF.getConstantIslandAlignment(); - } - - return Size; - }; - - std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>> CallMap; - for (BinaryFunction &BF : llvm::make_second_range(BC.getBinaryFunctions())) { - if (!BC.shouldEmit(BF)) - continue; - - EstimatedSize += estimateFunctionSize(BF); - - for (const BinaryBasicBlock &BB : BF) { - for (const MCInst &Inst : BB) { - if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) || - BC.MIB->isIndirectBranch(Inst)) - continue; - const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst); - assert(TargetSymbol); - - BinaryFunction *Callee = BC.getFunctionForSymbol(TargetSymbol); - if (!Callee) { - /* Ignore internall calls */ - continue; - } - - CallMap[&BF].insert(Callee); - } - } - } - - LLVM_DEBUG(dbgs() << "LongJmp: estimated code size : " << EstimatedSize - << '\n'); - - // Build clusters in the order the functions will appear in the output. - std::vector<FunctionCluster> Clusters; - Clusters.emplace_back(FunctionCluster()); - - for (BinaryFunction *BF : BC.getSortedFunctions()) { - if (!BC.shouldEmit(*BF)) - continue; - - const uint64_t BFSize = estimateFunctionSize(*BF); - if (Clusters.empty() || Clusters.back().Size + BFSize > MaxClusterSize) { - Clusters.emplace_back(FunctionCluster()); - } - - FunctionCluster &FC = Clusters.back(); - FC.Functions.insert(BF); - auto It = FC.Callees.find(BF); - if (It != FC.Callees.end()) { - FC.Callees.erase(It); - } - FC.Size += BFSize; - FC.LastBF = BF; - - for (BinaryFunction *Callee : CallMap[BF]) - if (!FC.Functions.count(Callee)) - FC.Callees.insert(Callee); - } - - // Print cluster stats. - dbgs() << "Built " << Clusters.size() << " clusters\n"; - uint64_t Index = 0; - for (const FunctionCluster &FC : Clusters) { - dbgs() << " Cluster: " << Index++ << '\n'; - dbgs() << " " << FC.Functions.size() << " functions\n"; - dbgs() << " " << FC.Callees.size() << " callees\n"; - dbgs() << " " << FC.Size << " bytes\n"; - } - - if (Clusters.size() > 2) { - BC.errs() << "Large code model is unsupported\n"; - exit(1); - } - - if (Clusters.size() == 1) - return; - - // Populate one of the clusters with PLT functions based on the proximity of - // the PLT section to avoid unneeded thunk redirection. - // FIXME: this part is extremely fragile as it depends on the placement - // of PLT section and its proximity to old or new .text. - // FIXME: a slightly better approach will be to always use thunks for PLT and - // eliminate redirection later using final addresses in address maps. - const size_t PLTClusterNum = opts::UseOldText ? 1 : 0; - for (BinaryFunction &BF : llvm::make_second_range(BC.getBinaryFunctions())) { - if (BF.isPLTFunction()) { - auto &PLTCluster = Clusters[PLTClusterNum]; - PLTCluster.Functions.insert(&BF); - auto It = PLTCluster.Callees.find(&BF); - if (It != PLTCluster.Callees.end()) - PLTCluster.Callees.erase(It); - } - } - - // FIXME: section name to use for thunks. - std::string SectionName = - Clusters[0].LastBF->getCodeSectionName().str().str(); - - // Build thunk functions. - auto createSmallThunk = [&](BinaryFunction &Callee) { - BinaryFunction *ThunkBF = - BC.createThunkBinaryFunction("__BThunk__" + Callee.getOneName().str()); - MCInst Inst; - BC.MIB->createTailCall(Inst, Callee.getSymbol(), BC.Ctx.get()); - ThunkBF->addBasicBlock()->addInstruction(Inst); - ThunkBF->setCodeSectionName(SectionName); - - return ThunkBF; - }; - - DenseMap<BinaryFunction *, BinaryFunction *> Thunks; - for (const FunctionCluster &FC : Clusters) { - for (BinaryFunction *Callee : FC.Callees) { - Thunks[Callee] = createSmallThunk(*Callee); - } - } - - BC.outs() << "BOLT-INFO: " << Thunks.size() << " thunks created\n"; - - // Replace callees with thunks. - for (FunctionCluster &FC : Clusters) { - for (BinaryFunction *BF : FC.Functions) { - if (!CallMap.count(BF)) - continue; - - for (BinaryBasicBlock &BB : *BF) { - for (MCInst &Inst : BB) { - if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) || - BC.MIB->isIndirectBranch(Inst)) - continue; - const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst); - assert(TargetSymbol); - - BinaryFunction *Callee = BC.getFunctionForSymbol(TargetSymbol); - if (!Callee) { - /* Ignore internal calls */ - continue; - } - - // Check if the callee is in the same cluster. - if (!FC.Callees.count(Callee)) - continue; - - // Use thunk as the call destination. - BC.MIB->replaceBranchTarget(Inst, Thunks[Callee]->getSymbol(), - BC.Ctx.get()); - } - } - } - } - - BC.setThunkLocation(Clusters[0].LastBF); -} - Error LongJmpPass::runOnFunctions(BinaryContext &BC) { - // TODO: set correct code model based on the total size of split-code. - if (opts::ExperimentalRelaxation) { - BC.outs() << "BOLT-INFO: starting experimental relaxation pass\n"; - ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { - relaxLocalBranches(BF); - }; - - ParallelUtilities::PredicateTy SkipPredicate = - [&](const BinaryFunction &BF) { - return !BC.shouldEmit(BF) || !BF.isSimple(); - }; - - ParallelUtilities::runOnEachFunction( - BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, - SkipPredicate, "ExpLongJump"); - - relaxCalls(BC); - - return Error::success(); - } - BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n"; std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions(); bool Modified; diff --git a/bolt/lib/Passes/PatchEntries.cpp b/bolt/lib/Passes/PatchEntries.cpp index 1530d1076bb0..981d1b70af90 100644 --- a/bolt/lib/Passes/PatchEntries.cpp +++ b/bolt/lib/Passes/PatchEntries.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "bolt/Passes/PatchEntries.h" -#include "bolt/Utils/CommandLineOpts.h" #include "bolt/Utils/NameResolver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" @@ -36,20 +35,16 @@ Error PatchEntries::runOnFunctions(BinaryContext &BC) { if (!opts::ForcePatch) { // Mark the binary for patching if we did not create external references // for original code in any of functions we are not going to emit. - bool NeedsPatching = - llvm::any_of(llvm::make_second_range(BC.getBinaryFunctions()), - [&](BinaryFunction &BF) { - return !BF.isPseudo() && !BC.shouldEmit(BF) && - !BF.hasExternalRefRelocations(); - }); + bool NeedsPatching = llvm::any_of( + llvm::make_second_range(BC.getBinaryFunctions()), + [&](BinaryFunction &BF) { + return !BC.shouldEmit(BF) && !BF.hasExternalRefRelocations(); + }); if (!NeedsPatching) return Error::success(); } - assert(!opts::UseOldText && - "Cannot patch entries while overwriting original .text"); - if (opts::Verbosity >= 1) BC.outs() << "BOLT-INFO: patching entries in original code\n"; diff --git a/bolt/lib/Passes/VeneerElimination.cpp b/bolt/lib/Passes/VeneerElimination.cpp index 738538ef3c38..87fe625e8c3b 100644 --- a/bolt/lib/Passes/VeneerElimination.cpp +++ b/bolt/lib/Passes/VeneerElimination.cpp @@ -33,40 +33,26 @@ Error VeneerElimination::runOnFunctions(BinaryContext &BC) { if (!opts::EliminateVeneers || !BC.isAArch64()) return Error::success(); + std::map<uint64_t, BinaryFunction> &BFs = BC.getBinaryFunctions(); std::unordered_map<const MCSymbol *, const MCSymbol *> VeneerDestinations; uint64_t VeneersCount = 0; - uint64_t NumAllVeneers = 0; - for (BinaryFunction &BF : llvm::make_second_range(BC.getBinaryFunctions())) { - if (!BF.isAArch64Veneer()) + for (auto &It : BFs) { + BinaryFunction &VeneerFunction = It.second; + if (!VeneerFunction.isAArch64Veneer()) continue; - ++NumAllVeneers; - - if (BF.isIgnored()) - continue; - - MCInst &FirstInstruction = *(BF.begin()->begin()); - const MCSymbol *VeneerTargetSymbol; - if (BC.MIB->isTailCall(FirstInstruction)) { - VeneerTargetSymbol = BC.MIB->getTargetSymbol(FirstInstruction); - } else { - if (!BC.MIB->hasAnnotation(FirstInstruction, "AArch64Veneer")) - continue; - VeneerTargetSymbol = BC.MIB->getTargetSymbol(FirstInstruction, 1); - } - - if (!VeneerTargetSymbol) - continue; - - for (const MCSymbol *Symbol : BF.getSymbols()) - VeneerDestinations[Symbol] = VeneerTargetSymbol; - VeneersCount++; - BF.setPseudo(true); + VeneerFunction.setPseudo(true); + MCInst &FirstInstruction = *(VeneerFunction.begin()->begin()); + const MCSymbol *VeneerTargetSymbol = + BC.MIB->getTargetSymbol(FirstInstruction, 1); + assert(VeneerTargetSymbol && "Expecting target symbol for instruction"); + for (const MCSymbol *Symbol : VeneerFunction.getSymbols()) + VeneerDestinations[Symbol] = VeneerTargetSymbol; } BC.outs() << "BOLT-INFO: number of removed linker-inserted veneers: " - << VeneersCount << ". Total veneers: " << NumAllVeneers << '\n'; + << VeneersCount << "\n"; // Handle veneers to veneers in case they occur for (auto &Entry : VeneerDestinations) { @@ -79,8 +65,9 @@ Error VeneerElimination::runOnFunctions(BinaryContext &BC) { } uint64_t VeneerCallers = 0; - for (BinaryFunction &BF : llvm::make_second_range(BC.getBinaryFunctions())) { - for (BinaryBasicBlock &BB : BF) { + for (auto &It : BFs) { + BinaryFunction &Function = It.second; + for (BinaryBasicBlock &BB : Function) { for (MCInst &Instr : BB) { if (!BC.MIB->isCall(Instr) || BC.MIB->isIndirectCall(Instr)) continue; diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 334252cbd360..ec7e303c0f52 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -9,22 +9,12 @@ #include "bolt/Profile/BoltAddressTranslation.h" #include "bolt/Core/BinaryFunction.h" #include "llvm/ADT/APInt.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/Timer.h" #define DEBUG_TYPE "bolt-bat" -namespace opts { -extern llvm::cl::OptionCategory BoltCategory; -llvm::cl::opt<bool> - TimeBAT("time-bat", - llvm::cl::desc("print time spent processing BAT tables"), - llvm::cl::Hidden, llvm::cl::cat(BoltCategory)); -} // namespace opts - namespace llvm { namespace bolt { @@ -85,9 +75,8 @@ void BoltAddressTranslation::writeEntriesForBB( } } -void BoltAddressTranslation::constructMaps(const BinaryContext &BC) { - NamedRegionTimer T("constuctmaps", "construct translation maps", "bat", - "process BAT", opts::TimeBAT); +void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n"); for (auto &BFI : BC.getBinaryFunctions()) { const BinaryFunction &Function = BFI.second; const uint64_t InputAddress = Function.getAddress(); @@ -151,11 +140,6 @@ void BoltAddressTranslation::constructMaps(const BinaryContext &BC) { Maps.emplace(FF.getAddress(), std::move(Map)); } } -} - -void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { - LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n"); - constructMaps(BC); // Output addresses are delta-encoded uint64_t PrevAddress = 0; @@ -200,8 +184,6 @@ size_t BoltAddressTranslation::getNumEqualOffsets(const MapTy &Map, template <bool Cold> void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps, uint64_t &PrevAddress, raw_ostream &OS) { - NamedRegionTimer T("writemaps", "write translation maps", "bat", - "process BAT", opts::TimeBAT); const uint32_t NumFuncs = llvm::count_if(llvm::make_first_range(Maps), [&](const uint64_t Address) { return Cold == ColdPartSource.count(Address); diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 67ed32017667..fe0fcfdcd42f 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -643,11 +643,7 @@ size_t YAMLProfileReader::matchWithNameSimilarity(BinaryContext &BC) { // equal number of blocks. if (NamespaceToProfiledBFSizesIt->second.count(BF->size()) == 0) continue; - auto NamespaceToBFsIt = NamespaceToBFs.find(Namespace); - if (NamespaceToBFsIt == NamespaceToBFs.end()) - NamespaceToBFs[Namespace] = {BF}; - else - NamespaceToBFsIt->second.push_back(BF); + NamespaceToBFs[Namespace].push_back(BF); } // Iterates through all profiled functions and binary functions belonging to diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 17e08324024d..b09060418334 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -12,6 +12,7 @@ #include "bolt/Passes/AllocCombiner.h" #include "bolt/Passes/AsmDump.h" #include "bolt/Passes/CMOVConversion.h" +#include "bolt/Passes/ContinuityStats.h" #include "bolt/Passes/FixRISCVCallsPass.h" #include "bolt/Passes/FixRelaxationPass.h" #include "bolt/Passes/FrameOptimizer.h" @@ -373,6 +374,8 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { if (opts::PrintProfileStats) Manager.registerPass(std::make_unique<PrintProfileStats>(NeverPrint)); + Manager.registerPass(std::make_unique<PrintContinuityStats>(NeverPrint)); + Manager.registerPass(std::make_unique<ValidateInternalCalls>(NeverPrint)); Manager.registerPass(std::make_unique<ValidateMemRefs>(NeverPrint)); @@ -489,9 +492,6 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { // memory profiling data. Manager.registerPass(std::make_unique<ReorderData>()); - // Assign each function an output section. - Manager.registerPass(std::make_unique<AssignSections>()); - if (BC.isAArch64()) { Manager.registerPass(std::make_unique<ADRRelaxationPass>()); @@ -515,6 +515,9 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { Manager.registerPass( std::make_unique<RetpolineInsertion>(PrintRetpolineInsertion)); + // Assign each function an output section. + Manager.registerPass(std::make_unique<AssignSections>()); + // Patch original function entries if (BC.HasRelocations) Manager.registerPass(std::make_unique<PatchEntries>()); diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 849928910996..f9cb1b3895e7 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -42,7 +42,6 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/ThreadPool.h" -#include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cstdint> @@ -368,11 +367,6 @@ static cl::opt<bool> AlwaysConvertToRanges( cl::ReallyHidden, cl::init(false), cl::cat(BoltCategory)); extern cl::opt<std::string> CompDirOverride; - -cl::opt<bool> - TimeDebug("time-debug", - cl::desc("print time spent processing debug information"), - cl::Hidden, cl::cat(BoltCategory)); } // namespace opts /// If DW_AT_low_pc exists sets LowPC and returns true. @@ -553,8 +547,6 @@ using CUPartitionVector = std::vector<DWARFUnitVec>; /// cu-processing-batch-size. All the CUs that have cross CU reference reference /// as a source are put in to the same initial bucket. static CUPartitionVector partitionCUs(DWARFContext &DwCtx) { - NamedRegionTimer T("partitioncus", "partition cus", "debug", - "update debug info", opts::TimeDebug); CUPartitionVector Vec(2); unsigned Counter = 0; const DWARFDebugAbbrev *Abbr = DwCtx.getDebugAbbrev(); @@ -615,9 +607,11 @@ void DWARFRewriter::updateDebugInfo() { } uint32_t CUIndex = 0; + std::mutex AccessMutex; // Needs to be invoked in the same order as CUs are processed. llvm::DenseMap<uint64_t, uint64_t> LocListWritersIndexByCU; auto createRangeLocListAddressWriters = [&](DWARFUnit &CU) { + std::lock_guard<std::mutex> Lock(AccessMutex); const uint16_t DwarfVersion = CU.getVersion(); if (DwarfVersion >= 5) { auto AddrW = std::make_unique<DebugAddrWriterDwarf5>( @@ -683,8 +677,6 @@ void DWARFRewriter::updateDebugInfo() { GDBIndexSection, TempRangesSectionWriter); }; auto processMainBinaryCU = [&](DWARFUnit &Unit, DIEBuilder &DIEBlder) { - NamedRegionTimer T("processmainbinarycu", "process main binary CU", - "debug", "update debug info", opts::TimeDebug); std::optional<DWARFUnit *> SplitCU; std::optional<uint64_t> RangesBase; std::optional<uint64_t> DWOId = Unit.getDWOId(); @@ -717,11 +709,7 @@ void DWARFRewriter::updateDebugInfo() { }; DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable); - { - NamedRegionTimer T("buildtypeunits", "build type units", "debug", - "update debug info", opts::TimeDebug); - DIEBlder.buildTypeUnits(StrOffstsWriter.get()); - } + DIEBlder.buildTypeUnits(StrOffstsWriter.get()); SmallVector<char, 20> OutBuffer; std::unique_ptr<raw_svector_ostream> ObjOS = std::make_unique<raw_svector_ostream>(OutBuffer); @@ -735,9 +723,8 @@ void DWARFRewriter::updateDebugInfo() { CUPartitionVector PartVec = partitionCUs(*BC.DwCtx); const unsigned int ThreadCount = std::min(opts::DebugThreadCount, opts::ThreadCount); - auto updateSplitCUs = [&]() { - NamedRegionTimer T("updatesplitcus", "update split CUs", "debug", - "update debug info", opts::TimeDebug); + for (std::vector<DWARFUnit *> &Vec : PartVec) { + DIEBlder.buildCompileUnits(Vec); llvm::SmallVector<std::unique_ptr<DIEBuilder>, 72> DWODIEBuildersByCU; ThreadPoolInterface &ThreadPool = ParallelUtilities::getThreadPool(ThreadCount); @@ -777,13 +764,6 @@ void DWARFRewriter::updateDebugInfo() { }); } ThreadPool.wait(); - return DWODIEBuildersByCU; - }; - - for (std::vector<DWARFUnit *> &Vec : PartVec) { - DIEBlder.buildCompileUnits(Vec); - llvm::SmallVector<std::unique_ptr<DIEBuilder>, 72> DWODIEBuildersByCU = - updateSplitCUs(); for (std::unique_ptr<DIEBuilder> &DWODIEBuilderPtr : DWODIEBuildersByCU) DWODIEBuilderPtr->updateDebugNamesTable(); for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) @@ -1471,8 +1451,6 @@ void DWARFRewriter::updateLineTableOffsets(const MCAssembler &Asm) { CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer, GDBIndex &GDBIndexSection) { - NamedRegionTimer T("finalizetypesections", "finalize type sections", - "debug", "update debug info", opts::TimeDebug); // update TypeUnit DW_AT_stmt_list with new .debug_line information. auto updateLineTable = [&](const DWARFUnit &Unit) -> void { DIE *UnitDIE = DIEBlder.getUnitDIEbyUnit(Unit); @@ -1538,8 +1516,6 @@ void DWARFRewriter::finalizeDebugSections( DIEBuilder &DIEBlder, DWARF5AcceleratorTable &DebugNamesTable, DIEStreamer &Streamer, raw_svector_ostream &ObjOS, CUOffsetMap &CUMap, DebugAddrWriter &FinalAddrWriter) { - NamedRegionTimer T("finalizedebugsections", "finalize debug sections", - "debug", "update debug info", opts::TimeDebug); if (StrWriter->isInitialized()) { RewriteInstance::addToDebugSectionsToOverwrite(".debug_str"); std::unique_ptr<DebugStrBufferVector> DebugStrSectionContents = @@ -1652,8 +1628,6 @@ void DWARFRewriter::finalizeCompileUnits(DIEBuilder &DIEBlder, CUOffsetMap &CUMap, const std::list<DWARFUnit *> &CUs, DebugAddrWriter &FinalAddrWriter) { - NamedRegionTimer T("finalizecompileunits", "finalize compile units", - "debug", "update debug info", opts::TimeDebug); for (DWARFUnit *CU : CUs) { auto AddressWriterIterator = AddressWritersByCU.find(CU->getOffset()); assert(AddressWriterIterator != AddressWritersByCU.end() && diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 8532ec85ebbf..8647df4b0edf 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -20,7 +20,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/Timer.h" #include <memory> #undef DEBUG_TYPE @@ -31,10 +30,6 @@ using namespace bolt; namespace opts { -cl::opt<bool> TimeProbes("time-probes", - cl::desc("print time spent processing pseudo probes"), - cl::Hidden, cl::cat(BoltCategory)); - enum PrintPseudoProbesOptions { PPP_None = 0, PPP_Probes_Section_Decode = 0x1, @@ -108,9 +103,6 @@ Error PseudoProbeRewriter::postEmitFinalizer() { parsePseudoProbe(); updatePseudoProbes(); - // encode pseudo probes with updated addresses - encodePseudoProbes(); - return Error::success(); } @@ -124,8 +116,6 @@ void PseudoProbeRewriter::parsePseudoProbe(bool ProfiledOnly) { return; } - NamedRegionTimer T("parseprobes", "parse pseudo probes", "probes", - "process pseudo probes", opts::TimeProbes); // If only one section is found, it might mean the ELF is corrupted. if (!PseudoProbeDescSection) { errs() << "BOLT-WARNING: fail in reading .pseudo_probe_desc binary\n"; @@ -206,8 +196,6 @@ void PseudoProbeRewriter::updatePseudoProbes() { // check if there is pseudo probe section decoded if (ProbeDecoder.getAddress2ProbesMap().empty()) return; - NamedRegionTimer T("updateprobes", "update pseudo probes", "probes", - "process pseudo probes", opts::TimeProbes); // input address converted to output AddressProbesMap &Address2ProbesMap = ProbeDecoder.getAddress2ProbesMap(); const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap(); @@ -284,15 +272,13 @@ void PseudoProbeRewriter::updatePseudoProbes() { } outs() << "=======================================\n"; } + + // encode pseudo probes with updated addresses + encodePseudoProbes(); } void PseudoProbeRewriter::encodePseudoProbes() { MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr); - // check if there is pseudo probe section decoded - if (ProbeDecoder.getAddress2ProbesMap().empty()) - return; - NamedRegionTimer T("encodeprobes", "encode pseudo probes", "probes", - "process pseudo probes", opts::TimeProbes); // Buffer for new pseudo probes section SmallString<8> Contents; MCDecodedPseudoProbe *LastProbe = nullptr; diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index ba27aa817e1e..32ec7abe8b66 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -3557,11 +3557,9 @@ void RewriteInstance::finalizeMetadataPreEmit() { } void RewriteInstance::updateMetadata() { - { - NamedRegionTimer T("updatemetadata-postemit", "update metadata post-emit", - TimerGroupName, TimerGroupDesc, opts::TimeRewrite); - MetadataManager.runFinalizersAfterEmit(); - } + NamedRegionTimer T("updatemetadata-postemit", "update metadata post-emit", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); + MetadataManager.runFinalizersAfterEmit(); if (opts::UpdateDebugSections) { NamedRegionTimer T("updateDebugInfo", "update debug info", TimerGroupName, @@ -3723,41 +3721,15 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) { return Address; }; - // Try to allocate sections before the \p Address and return an address for - // the allocation of the first section or 0 if \p is not big enough. - auto allocateBefore = [&](uint64_t Address) -> uint64_t { - for (auto SI = CodeSections.rbegin(), SE = CodeSections.rend(); SI != SE; - ++SI) { - BinarySection *Section = *SI; - if (Section->getOutputSize() > Address) - return 0; - Address -= Section->getOutputSize(); - Address = alignDown(Address, Section->getAlignment()); - Section->setOutputAddress(Address); - } - return Address; - }; - // Check if we can fit code in the original .text bool AllocationDone = false; if (opts::UseOldText) { - uint64_t StartAddress; - uint64_t EndAddress; - if (opts::HotFunctionsAtEnd) { - EndAddress = BC->OldTextSectionAddress + BC->OldTextSectionSize; - StartAddress = allocateBefore(EndAddress); - } else { - StartAddress = BC->OldTextSectionAddress; - EndAddress = allocateAt(BC->OldTextSectionAddress); - } + const uint64_t CodeSize = + allocateAt(BC->OldTextSectionAddress) - BC->OldTextSectionAddress; - const uint64_t CodeSize = EndAddress - StartAddress; if (CodeSize <= BC->OldTextSectionSize) { BC->outs() << "BOLT-INFO: using original .text for new code with 0x" - << Twine::utohexstr(opts::AlignText) << " alignment"; - if (StartAddress != BC->OldTextSectionAddress) - BC->outs() << " at 0x" << Twine::utohexstr(StartAddress); - BC->outs() << '\n'; + << Twine::utohexstr(opts::AlignText) << " alignment\n"; AllocationDone = true; } else { BC->errs() @@ -5563,8 +5535,6 @@ uint64_t RewriteInstance::getNewFunctionOrDataAddress(uint64_t OldAddress) { } void RewriteInstance::rewriteFile() { - NamedRegionTimer T("rewrite", "rewrite file", TimerGroupName, - TimerGroupDesc, opts::TimeRewrite); std::error_code EC; Out = std::make_unique<ToolOutputFile>(opts::OutputFilename, EC, sys::fs::OF_None); |
