diff options
| author | Amir Ayupov <aaupov@fb.com> | 2024-09-30 13:34:45 -0700 |
|---|---|---|
| committer | Amir Ayupov <aaupov@fb.com> | 2024-09-30 13:34:45 -0700 |
| commit | 7e450342bfbf28139bd26ce65c8be19c3c8860b2 (patch) | |
| tree | 67df516c958d7edbab8a3228c3a500cc100644a1 | |
| parent | 70ef5eb6f087524dc952a8f5249b79f4a4000e04 (diff) | |
| parent | dc5ee08f8f05b3936ec014c1b67e40192bd89976 (diff) | |
[𝘀𝗽𝗿] initial version (users/aaupov/spr/bolt-support-show-density-for-fdata-and-yaml-profiles)
Created using spr 1.3.4
| -rw-r--r-- | bolt/include/bolt/Core/BinaryFunction.h | 6 | ||||
| -rw-r--r-- | bolt/lib/Passes/BinaryPasses.cpp | 81 | ||||
| -rw-r--r-- | bolt/lib/Profile/DataAggregator.cpp | 15 | ||||
| -rw-r--r-- | bolt/test/X86/pre-aggregated-perf.test | 7 |
4 files changed, 105 insertions, 4 deletions
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 6ebbaf94754e..d0a746d6805f 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -387,6 +387,9 @@ private: /// Raw branch count for this function in the profile. uint64_t RawBranchCount{0}; + /// Dynamically executed function bytes, used for density computation. + uint64_t SampleCountInBytes{0}; + /// Indicates the type of profile the function is using. uint16_t ProfileFlags{PF_NONE}; @@ -1845,6 +1848,9 @@ public: /// to this function. void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; } + /// Return the number of dynamically executed bytes, from raw perf data. + uint64_t getSampleCountInBytes() const { return SampleCountInBytes; } + /// Return the execution count for functions with known profile. /// Return 0 if the function has no profile. uint64_t getKnownExecutionCount() const { diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index fa95ad7324ac..e9602b0e88f3 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -223,6 +223,22 @@ static cl::opt<unsigned> TopCalledLimit( "functions section"), cl::init(100), cl::Hidden, cl::cat(BoltCategory)); +// Profile density options, synced with llvm-profgen/ProfileGenerator.cpp +static cl::opt<bool> ShowDensity("show-density", cl::init(false), + cl::desc("show profile density details"), + cl::Optional); + +static cl::opt<int> ProfileDensityCutOffHot( + "profile-density-cutoff-hot", cl::init(990000), + cl::desc("Total samples cutoff for functions used to calculate " + "profile density.")); + +static cl::opt<double> ProfileDensityThreshold( + "profile-density-threshold", cl::init(0), + cl::desc("If the profile density is below the given threshold, it " + "will be suggested to increase the sampling rate."), + cl::Optional); + } // namespace opts namespace llvm { @@ -1383,6 +1399,7 @@ Error 
PrintProgramStats::runOnFunctions(BinaryContext &BC) { uint64_t StaleSampleCount = 0; uint64_t InferredSampleCount = 0; std::vector<const BinaryFunction *> ProfiledFunctions; + std::vector<std::pair<double, uint64_t>> FuncDensityList; const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; for (auto &BFI : BC.getBinaryFunctions()) { const BinaryFunction &Function = BFI.second; @@ -1441,6 +1458,26 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { StaleSampleCount += SampleCount; ++NumAllStaleFunctions; } + + if (opts::ShowDensity) { + uint64_t Size = Function.getSize(); + // In case of BOLT split functions registered in BAT, executed traces are + // automatically attributed to the main fragment. Add up function sizes + // for all fragments. + if (IsHotParentOfBOLTSplitFunction) + for (const BinaryFunction *Fragment : Function.getFragments()) + Size += Fragment->getSize(); + uint64_t ExecutedBytes = Function.getSampleCountInBytes(); + if (!ExecutedBytes && Function.hasCFG()) + for (const BinaryBasicBlock &BB : Function) + ExecutedBytes += BB.getOriginalSize() * BB.getKnownExecutionCount(); + double Density = (double)1.0 * ExecutedBytes / Size; + FuncDensityList.emplace_back(Density, SampleCount); + LLVM_DEBUG(BC.outs() << Function << ": executed bytes " + << Function.getSampleCountInBytes() << ", size (b) " + << Size << ", density " << Density + << ", sample count " << SampleCount << '\n'); + } } BC.NumProfiledFuncs = ProfiledFunctions.size(); BC.NumStaleProfileFuncs = NumStaleProfileFunctions; @@ -1684,6 +1721,50 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { BC.outs() << ". Use -print-unknown to see the list."; BC.outs() << '\n'; } + + if (opts::ShowDensity) { + double Density = 0.0; + // Sorted by the density in descending order. 
+ llvm::stable_sort(FuncDensityList, + [&](const std::pair<double, uint64_t> &A, + const std::pair<double, uint64_t> &B) { + if (A.first != B.first) + return A.first > B.first; + return A.second < B.second; + }); + + uint64_t AccumulatedSamples = 0; + uint32_t I = 0; + assert(opts::ProfileDensityCutOffHot <= 1000000 && + "The cutoff value is greater than 1000000(100%)"); + while (AccumulatedSamples < + TotalSampleCount * + static_cast<float>(opts::ProfileDensityCutOffHot) / + 1000000 && + I < FuncDensityList.size()) { + AccumulatedSamples += FuncDensityList[I].second; + Density = FuncDensityList[I].first; + I++; + } + if (Density == 0.0) { + BC.errs() << "BOLT-WARNING: the output profile is empty or the " + "--profile-density-cutoff-hot option is " + "set too low. Please check your command.\n"; + } else if (Density < opts::ProfileDensityThreshold) { + BC.errs() + << "BOLT-WARNING: BOLT is estimated to optimize better with " + << format("%.1f", opts::ProfileDensityThreshold / Density) + << "x more samples. 
Please consider increasing sampling rate or " + "profiling for longer duration to get more samples.\n"; + } + + BC.outs() << "BOLT-INFO: Functions with density >= " + << format("%.1f", Density) << " account for " + << format("%.2f", + static_cast<double>(opts::ProfileDensityCutOffHot) / + 10000) + << "% total sample counts.\n"; + } return Error::success(); } diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index fcde6f5f4642..4c716ce65ac6 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -638,8 +638,12 @@ void DataAggregator::processProfile(BinaryContext &BC) { : BinaryFunction::PF_LBR; for (auto &BFI : BC.getBinaryFunctions()) { BinaryFunction &BF = BFI.second; - if (getBranchData(BF) || getFuncSampleData(BF.getNames())) + FuncBranchData *FBD = getBranchData(BF); + if (FBD || getFuncSampleData(BF.getNames())) { BF.markProfiled(Flags); + if (FBD) + BF.RawBranchCount = FBD->getNumExecutedBranches(); + } } for (auto &FuncBranches : NamesToBranches) @@ -845,6 +849,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, return false; } + // Set ParentFunc to BAT parent function or FromFunc itself. + BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc); + if (!ParentFunc) + ParentFunc = FromFunc; + ParentFunc->SampleCountInBytes += Count * (Second.From - First.To); + std::optional<BoltAddressTranslation::FallthroughListTy> FTs = BAT ? 
BAT->getFallthroughsInTrace(FromFunc->getAddress(), First.To, Second.From) @@ -864,13 +874,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, << FromFunc->getPrintName() << ":" << Twine::utohexstr(First.To) << " to " << Twine::utohexstr(Second.From) << ".\n"); - BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc); for (auto [From, To] : *FTs) { if (BAT) { From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true); To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false); } - doIntraBranch(ParentFunc ? *ParentFunc : *FromFunc, From, To, Count, false); + doIntraBranch(*ParentFunc, From, To, Count, false); } return true; diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test index 90252f9ff68d..ecdc613a8d76 100644 --- a/bolt/test/X86/pre-aggregated-perf.test +++ b/bolt/test/X86/pre-aggregated-perf.test @@ -11,7 +11,12 @@ REQUIRES: system-linux RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \ -RUN: --profile-use-dfs | FileCheck %s +RUN: --show-density --profile-density-threshold=9 \ +RUN: --profile-density-cutoff-hot=970000 \ +RUN: --profile-use-dfs | FileCheck %s --check-prefix=CHECK-P2B + +CHECK-P2B: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile +CHECK-P2B: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts. RUN: llvm-bolt %t.exe -data %t -o %t.null | FileCheck %s RUN: llvm-bolt %t.exe -data %t.new -o %t.null | FileCheck %s |
