summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Amir Ayupov <aaupov@fb.com> 2024-09-30 13:34:45 -0700
committer Amir Ayupov <aaupov@fb.com> 2024-09-30 13:34:45 -0700
commit dc5ee08f8f05b3936ec014c1b67e40192bd89976 (patch)
tree 00786e055ffca8b98277fa48dd774b80e06fd0c6
parent 70ef5eb6f087524dc952a8f5249b79f4a4000e04 (diff)
[𝘀𝗽𝗿] changes to main this commit is based on
users/aaupov/spr/main.bolt-support-show-density-for-fdata-and-yaml-profiles
Created using spr 1.3.4 [skip ci]
-rw-r--r-- bolt/include/bolt/Core/BinaryFunction.h 6
-rw-r--r-- bolt/lib/Passes/BinaryPasses.cpp 77
-rw-r--r-- bolt/lib/Profile/DataAggregator.cpp 15
-rw-r--r-- bolt/test/X86/pre-aggregated-perf.test 7
4 files changed, 101 insertions, 4 deletions
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 6ebbaf94754e..d0a746d6805f 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -387,6 +387,9 @@ private:
/// Raw branch count for this function in the profile.
uint64_t RawBranchCount{0};
+ /// Dynamically executed function bytes, used for density computation.
+ uint64_t SampleCountInBytes{0};
+
/// Indicates the type of profile the function is using.
uint16_t ProfileFlags{PF_NONE};
@@ -1845,6 +1848,9 @@ public:
/// to this function.
void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; }
+ /// Return the number of dynamically executed bytes, from raw perf data.
+ uint64_t getSampleCountInBytes() const { return SampleCountInBytes; }
+
/// Return the execution count for functions with known profile.
/// Return 0 if the function has no profile.
uint64_t getKnownExecutionCount() const {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index fa95ad7324ac..1b6c450c0b48 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -223,6 +223,22 @@ static cl::opt<unsigned> TopCalledLimit(
"functions section"),
cl::init(100), cl::Hidden, cl::cat(BoltCategory));
+// Profile density options, synced with llvm-profgen/ProfileGenerator.cpp
+// Opt-in switch: the density computation and report in PrintProgramStats
+// only run when this flag is set.
+static cl::opt<bool> ShowDensity("show-density", cl::init(false),
+ cl::desc("show profile density details"),
+ cl::Optional);
+
+// Expressed in parts per million: PrintProgramStats asserts that the value is
+// at most 1000000 (100%) and divides it by 10000 to print it as a percentage.
+static cl::opt<int> ProfileDensityCutOffHot(
+ "profile-density-cutoff-hot", cl::init(990000),
+ cl::desc("Total samples cutoff for functions used to calculate "
+ "profile density."));
+
+// A zero density is reported separately as an empty profile, so with the
+// default of 0 the "more samples" suggestion is never printed; it only fires
+// once a positive threshold is passed.
+static cl::opt<double> ProfileDensityThreshold(
+ "profile-density-threshold", cl::init(0),
+ cl::desc("If the profile density is below the given threshold, it "
+ "will be suggested to increase the sampling rate."),
+ cl::Optional);
+
} // namespace opts
namespace llvm {
@@ -1383,6 +1399,7 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
uint64_t StaleSampleCount = 0;
uint64_t InferredSampleCount = 0;
std::vector<const BinaryFunction *> ProfiledFunctions;
+ std::vector<std::pair<double, uint64_t>> FuncDensityList;
const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
for (auto &BFI : BC.getBinaryFunctions()) {
const BinaryFunction &Function = BFI.second;
@@ -1441,6 +1458,22 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
StaleSampleCount += SampleCount;
++NumAllStaleFunctions;
}
+
+    if (opts::ShowDensity) {
+      // Executed traces of BOLT split functions registered in BAT are all
+      // attributed to the main fragment, so the size used as the denominator
+      // has to cover every fragment of such a function.
+      uint64_t TotalSize = Function.getSize();
+      if (IsHotParentOfBOLTSplitFunction) {
+        for (const BinaryFunction *Fragment : Function.getFragments())
+          TotalSize += Fragment->getSize();
+      }
+      // Density is the number of executed bytes per byte of function size.
+      const uint64_t ExecutedBytes = Function.getSampleCountInBytes();
+      const double Density = static_cast<double>(ExecutedBytes) / TotalSize;
+      FuncDensityList.emplace_back(Density, SampleCount);
+      LLVM_DEBUG(BC.outs() << Function << ": executed bytes " << ExecutedBytes
+                           << ", size (b) " << TotalSize << ", density "
+                           << Density << ", sample count " << SampleCount
+                           << '\n');
+    }
}
BC.NumProfiledFuncs = ProfiledFunctions.size();
BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
@@ -1684,6 +1717,50 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
BC.outs() << ". Use -print-unknown to see the list.";
BC.outs() << '\n';
}
+
+  // Report the profile density: the lowest per-function density among the
+  // densest functions that together cover the requested share of all samples.
+  if (opts::ShowDensity) {
+    // Order functions from the densest to the sparsest; equally dense
+    // functions are ordered by ascending sample count.
+    llvm::stable_sort(FuncDensityList,
+                      [](const std::pair<double, uint64_t> &A,
+                         const std::pair<double, uint64_t> &B) {
+                        if (A.first != B.first)
+                          return A.first > B.first;
+                        return A.second < B.second;
+                      });
+
+    assert(opts::ProfileDensityCutOffHot <= 1000000 &&
+           "The cutoff value is greater than 1000000(100%)");
+    // The cutoff is given in parts per million of the total sample count.
+    // Compute the target once, in double precision: float represents integers
+    // exactly only up to 2^24, which is too coarse for 64-bit sample counts.
+    const double CutoffSamples =
+        static_cast<double>(TotalSampleCount) *
+        static_cast<double>(opts::ProfileDensityCutOffHot) / 1000000;
+
+    // Walk the densest functions until they cover the requested share of
+    // samples; the density of the last function visited is the result.
+    double Density = 0.0;
+    uint64_t AccumulatedSamples = 0;
+    for (const auto &[FuncDensity, FuncSamples] : FuncDensityList) {
+      if (AccumulatedSamples >= CutoffSamples)
+        break;
+      AccumulatedSamples += FuncSamples;
+      Density = FuncDensity;
+    }
+
+    if (Density == 0.0) {
+      BC.errs() << "BOLT-WARNING: the output profile is empty or the "
+                   "--profile-density-cutoff-hot option is "
+                   "set too low. Please check your command.\n";
+    } else if (Density < opts::ProfileDensityThreshold) {
+      // The ratio threshold / density estimates how many times more samples
+      // would be needed to reach the requested density.
+      BC.errs()
+          << "BOLT-WARNING: BOLT is estimated to optimize better with "
+          << format("%.1f", opts::ProfileDensityThreshold / Density)
+          << "x more samples. Please consider increasing sampling rate or "
+             "profiling for longer duration to get more samples.\n";
+    }
+
+    // The cutoff is in parts per million, so dividing by 10000 yields percent.
+    BC.outs() << "BOLT-INFO: Functions with density >= "
+              << format("%.1f", Density) << " account for "
+              << format("%.2f",
+                        static_cast<double>(opts::ProfileDensityCutOffHot) /
+                            10000)
+              << "% total sample counts.\n";
+  }
return Error::success();
}
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index fcde6f5f4642..4c716ce65ac6 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -638,8 +638,12 @@ void DataAggregator::processProfile(BinaryContext &BC) {
: BinaryFunction::PF_LBR;
for (auto &BFI : BC.getBinaryFunctions()) {
BinaryFunction &BF = BFI.second;
- if (getBranchData(BF) || getFuncSampleData(BF.getNames()))
+ FuncBranchData *FBD = getBranchData(BF);
+ if (FBD || getFuncSampleData(BF.getNames())) {
BF.markProfiled(Flags);
+ if (FBD)
+ BF.RawBranchCount = FBD->getNumExecutedBranches();
+ }
}
for (auto &FuncBranches : NamesToBranches)
@@ -845,6 +849,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
return false;
}
+ // Set ParentFunc to BAT parent function or FromFunc itself.
+ BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
+ if (!ParentFunc)
+ ParentFunc = FromFunc;
+ ParentFunc->SampleCountInBytes += Count * (Second.From - First.To);
+
std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
BAT ? BAT->getFallthroughsInTrace(FromFunc->getAddress(), First.To,
Second.From)
@@ -864,13 +874,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
<< FromFunc->getPrintName() << ":"
<< Twine::utohexstr(First.To) << " to "
<< Twine::utohexstr(Second.From) << ".\n");
- BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
for (auto [From, To] : *FTs) {
if (BAT) {
From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true);
To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false);
}
- doIntraBranch(ParentFunc ? *ParentFunc : *FromFunc, From, To, Count, false);
+ doIntraBranch(*ParentFunc, From, To, Count, false);
}
return true;
diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test
index 90252f9ff68d..ecdc613a8d76 100644
--- a/bolt/test/X86/pre-aggregated-perf.test
+++ b/bolt/test/X86/pre-aggregated-perf.test
@@ -11,7 +11,12 @@ REQUIRES: system-linux
RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe
RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \
-RUN: --profile-use-dfs | FileCheck %s
+RUN: --show-density --profile-density-threshold=9 \
+RUN: --profile-density-cutoff-hot=970000 \
+RUN: --profile-use-dfs | FileCheck %s --check-prefix=CHECK-P2B
+
+CHECK-P2B: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
+CHECK-P2B: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts.
RUN: llvm-bolt %t.exe -data %t -o %t.null | FileCheck %s
RUN: llvm-bolt %t.exe -data %t.new -o %t.null | FileCheck %s