summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp274
1 files changed, 158 insertions, 116 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 2449fa581842..3e5d83b8e3fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -15,10 +15,9 @@
/// SplitModule: load-balance the module's functions across a set of N
/// partitions to allow parallel codegen. However, it does it very
/// differently than the target-agnostic variant:
-/// - Kernels are used as the module's "roots".
-/// They're known entry points on AMDGPU, and everything else is often
-/// internal only.
-/// - Each kernel has a set of dependencies, and when a kernel and its
+/// - The module has "split roots", which are kernels in the vast
+// majority of cases.
+/// - Each root has a set of dependencies, and when a root and its
/// dependencies is considered "big", we try to put it in a partition where
/// most dependencies are already imported, to avoid duplicating large
/// amounts of code.
@@ -67,20 +66,22 @@ using namespace llvm;
namespace {
-static cl::opt<float> LargeKernelFactor(
- "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
+static cl::opt<float> LargeFnFactor(
+ "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
cl::Hidden,
cl::desc(
- "consider a kernel as large and needing special treatment when it "
+ "consider a function as large and needing special treatment when the "
+ "cost of importing it into a partition"
"exceeds the average cost of a partition by this factor; e;g. 2.0 "
- "means if the kernel and its dependencies is 2 times bigger than "
- "an average partition; 0 disables large kernels handling entirely"));
+ "means if the function and its dependencies is 2 times bigger than "
+ "an average partition; 0 disables large functions handling entirely"));
-static cl::opt<float> LargeKernelOverlapForMerge(
- "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
+static cl::opt<float> LargeFnOverlapForMerge(
+ "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
cl::Hidden,
- cl::desc("defines how much overlap between two large kernel's dependencies "
- "is needed to put them in the same partition"));
+ cl::desc(
+ "defines how much overlap between two large function's dependencies "
+ "is needed to put them in the same partition"));
static cl::opt<bool> NoExternalizeGlobals(
"amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
@@ -98,6 +99,7 @@ static cl::opt<bool>
using CostType = InstructionCost::CostType;
using PartitionID = unsigned;
+using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static bool isEntryPoint(const Function *F) {
return AMDGPU::isEntryFunctionCC(F->getCallingConv());
@@ -214,13 +216,12 @@ static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
/// Calculate the cost of each function in \p M
/// \param SML Log Helper
-/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
+/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType
-calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
- Module &M,
+calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
DenseMap<const Function *, CostType> &CostMap) {
CostType ModuleCost = 0;
CostType KernelCost = 0;
@@ -230,8 +231,7 @@ calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
continue;
CostType FnCost = 0;
- TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);
-
+ const auto &TTI = GetTTI(Fn);
for (const auto &BB : Fn) {
for (const auto &I : BB) {
auto Cost =
@@ -277,9 +277,9 @@ static bool canBeIndirectlyCalled(const Function &F) {
/*IgnoreCastedDirectCall=*/true);
}
-/// When a kernel or any of its callees performs an indirect call, this function
+/// When a function or any of its callees performs an indirect call, this
/// takes over \ref addAllDependencies and adds all potentially callable
-/// functions to \p Fns so they can be counted as dependencies of the kernel.
+/// functions to \p Fns so they can be counted as dependencies of the function.
///
/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
/// presence of an indirect call, the function's resource usage is the same as
@@ -301,13 +301,14 @@ static void addAllIndirectCallDependencies(const Module &M,
/// \param CG Call graph for \p Fn's module.
/// \param Fn Current function to look at.
/// \param Fns[out] Resulting list of functions.
+/// \param OnlyDirect Whether to only consider direct callees.
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
/// point, either in \p Fn or in one of the function it calls. When that
/// happens, we fall back to adding all callable functions inside \p Fn's module
/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
const Function &Fn,
- DenseSet<const Function *> &Fns,
+ DenseSet<const Function *> &Fns, bool OnlyDirect,
bool &HadIndirectCall) {
assert(!Fn.isDeclaration());
@@ -325,6 +326,9 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
auto *CGNode = CGEntry.second;
auto *Callee = CGNode->getFunction();
if (!Callee) {
+ if (OnlyDirect)
+ continue;
+
// Functions have an edge towards CallsExternalNode if they're external
// declarations, or if they do an indirect call. As we only process
// definitions here, we know this means the function has an indirect
@@ -353,13 +357,19 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
}
}
-/// Contains information about a kernel and its dependencies.
-struct KernelWithDependencies {
- KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
- const DenseMap<const Function *, CostType> &FnCosts,
- const Function *Fn)
+/// Contains information about a function and its dependencies.
+/// This is a splitting root. The splitting algorithm works by
+/// assigning these to partitions.
+struct FunctionWithDependencies {
+ FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
+ const DenseMap<const Function *, CostType> &FnCosts,
+ const Function *Fn)
: Fn(Fn) {
- addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
+ // When Fn is not a kernel, we don't need to collect indirect callees.
+ // Resource usage analysis is only performed on kernels, and we collect
+ // indirect callees for resource usage analysis.
+ addAllDependencies(SML, CG, *Fn, Dependencies,
+ /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
TotalCost = FnCosts.at(Fn);
for (const auto *Dep : Dependencies) {
TotalCost += FnCosts.at(Dep);
@@ -380,8 +390,8 @@ struct KernelWithDependencies {
CostType TotalCost = 0;
- /// \returns true if this kernel and its dependencies can be considered large
- /// according to \p Threshold.
+ /// \returns true if this function and its dependencies can be considered
+ /// large according to \p Threshold.
bool isLarge(CostType Threshold) const {
return TotalCost > Threshold && !Dependencies.empty();
}
@@ -420,39 +430,39 @@ static float calculateOverlap(const DenseSet<const Function *> &A,
/// \param NumParts Number of partitions to create.
/// \param ModuleCost Total cost of all functions in \p M.
/// \param FnCosts Map of Function -> Cost
-/// \param WorkList Kernels and their dependencies to process in order.
+/// \param WorkList Functions and their dependencies to process in order.
/// \returns The created partitions (a vector of size \p NumParts )
static std::vector<DenseSet<const Function *>>
doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
CostType ModuleCost,
const DenseMap<const Function *, CostType> &FnCosts,
- const SmallVector<KernelWithDependencies> &WorkList) {
+ const SmallVector<FunctionWithDependencies> &WorkList) {
SML << "\n--Partitioning Starts--\n";
- // Calculate a "large kernel threshold". When more than one kernel's total
- // import cost exceeds this value, we will try to merge it with other,
- // similarly large kernels.
+ // Calculate a "large function threshold". When more than one function's total
+ // import cost exceeds this value, we will try to assign it to an existing
+ // partition to reduce the amount of duplication needed.
//
- // e.g. let two kernels X and Y have a import cost of ~10% of the module, we
+ // e.g. let two functions X and Y have a import cost of ~10% of the module, we
// assign X to a partition as usual, but when we get to Y, we check if it's
// worth also putting it in Y's partition.
- const CostType LargeKernelThreshold =
- LargeKernelFactor ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
- : std::numeric_limits<CostType>::max();
+ const CostType LargeFnThreshold =
+ LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
+ : std::numeric_limits<CostType>::max();
std::vector<DenseSet<const Function *>> Partitions;
Partitions.resize(NumParts);
- // Assign a partition to each kernel, and try to keep the partitions more or
+ // Assign functions to partitions, and try to keep the partitions more or
// less balanced. We do that through a priority queue sorted in reverse, so we
// can always look at the partition with the least content.
//
// There are some cases where we will be deliberately unbalanced though.
- // - Large kernels: we try to merge with existing partitions to reduce code
+ // - Large functions: we try to merge with existing partitions to reduce code
// duplication.
- // - Kernels with indirect or external calls always go in the first partition
- // (P0).
+ // - Functions with indirect or external calls always go in the first
+ // partition (P0).
auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
const std::pair<PartitionID, CostType> &b) {
// When two partitions have the same cost, assign to the one with the
@@ -471,17 +481,17 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
for (unsigned I = 0; I < NumParts; ++I)
BalancingQueue.push_back(std::make_pair(I, 0));
- // Helper function to handle assigning a kernel to a partition. This takes
+ // Helper function to handle assigning a function to a partition. This takes
// care of updating the balancing queue.
const auto AssignToPartition = [&](PartitionID PID,
- const KernelWithDependencies &KWD) {
+ const FunctionWithDependencies &FWD) {
auto &FnsInPart = Partitions[PID];
- FnsInPart.insert(KWD.Fn);
- FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());
+ FnsInPart.insert(FWD.Fn);
+ FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
- SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> ";
- if (!KWD.Dependencies.empty()) {
- SML << KWD.Dependencies.size() << " dependencies added\n";
+ SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> ";
+ if (!FWD.Dependencies.empty()) {
+ SML << FWD.Dependencies.size() << " dependencies added\n";
};
// Update the balancing queue. we scan backwards because in the common case
@@ -506,44 +516,43 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
sort(BalancingQueue, ComparePartitions);
};
- for (auto &CurKernel : WorkList) {
- // When a kernel has indirect calls, it must stay in the first partition
+ for (auto &CurFn : WorkList) {
+ // When a function has indirect calls, it must stay in the first partition
// alongside every reachable non-entry function. This is a nightmare case
// for splitting as it severely limits what we can do.
- if (CurKernel.HasIndirectCall) {
- SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
+ if (CurFn.HasIndirectCall) {
+ SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
<< " defaulting to P0\n";
- AssignToPartition(0, CurKernel);
+ AssignToPartition(0, CurFn);
continue;
}
- // When a kernel has non duplicatable dependencies, we have to keep it in
+ // When a function has non duplicatable dependencies, we have to keep it in
// the first partition as well. This is a conservative approach, a
// finer-grained approach could keep track of which dependencies are
// non-duplicatable exactly and just make sure they're grouped together.
- if (CurKernel.HasNonDuplicatableDependecy) {
- SML << "Kernel with externally visible dependency "
- << getName(*CurKernel.Fn) << " defaulting to P0\n";
- AssignToPartition(0, CurKernel);
+ if (CurFn.HasNonDuplicatableDependecy) {
+ SML << "Function with externally visible dependency "
+ << getName(*CurFn.Fn) << " defaulting to P0\n";
+ AssignToPartition(0, CurFn);
continue;
}
- // Be smart with large kernels to avoid duplicating their dependencies.
- if (CurKernel.isLarge(LargeKernelThreshold)) {
- assert(LargeKernelOverlapForMerge >= 0.0f &&
- LargeKernelOverlapForMerge <= 1.0f);
- SML << "Large Kernel: " << getName(*CurKernel.Fn)
+ // Be smart with large functions to avoid duplicating their dependencies.
+ if (CurFn.isLarge(LargeFnThreshold)) {
+ assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
+ SML << "Large Function: " << getName(*CurFn.Fn)
<< " - looking for partition with at least "
- << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";
+ << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
bool Assigned = false;
for (const auto &[PID, Fns] : enumerate(Partitions)) {
- float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
+ float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P"
<< PID << '\n';
- if (Overlap > LargeKernelOverlapForMerge) {
+ if (Overlap > LargeFnOverlapForMerge) {
SML << " selecting P" << PID << '\n';
- AssignToPartition(PID, CurKernel);
+ AssignToPartition(PID, CurFn);
Assigned = true;
}
}
@@ -554,41 +563,34 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
// Normal "load-balancing", assign to partition with least pressure.
auto [PID, CurCost] = BalancingQueue.back();
- AssignToPartition(PID, CurKernel);
+ AssignToPartition(PID, CurFn);
}
- // Work is mostly done now, verify the partioning and add all functions we may
- // have missed (= unreachable, or we don't understand how they're reached) to
- // P0.
- DenseSet<const Function *> AllFunctions;
- for (const auto &[Idx, Part] : enumerate(Partitions)) {
- CostType Cost = 0;
- for (auto *Fn : Part) {
- // external linkage functions should exclusively be in the first partition
- // at this stage. In theory, we should only ever see external linkage
- // functions here if they're kernels, or if they've been added due to a
- // kernel using indirect calls somewhere in its CallGraph.
- assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
- Cost += FnCosts.at(Fn);
+ if (SML) {
+ for (const auto &[Idx, Part] : enumerate(Partitions)) {
+ CostType Cost = 0;
+ for (auto *Fn : Part)
+ Cost += FnCosts.at(Fn);
+ SML << "P" << Idx << " has a total cost of " << Cost << " ("
+ << format("%0.2f", (float(Cost) / ModuleCost) * 100)
+ << "% of source module)\n";
}
- SML << "P" << Idx << " has a total cost of " << Cost << " ("
- << format("%0.2f", (float(Cost) / ModuleCost) * 100)
- << "% of source module)\n";
- AllFunctions.insert(Part.begin(), Part.end());
+
+ SML << "--Partitioning Done--\n\n";
}
- // Add missed functions to P0. This will take care of adding things like
- // external functions with no callers in the module to P0. This should be
- // fairly rare as AMDGPU internalizes everything in most cases, so unused
- // internal functions would get removed.
+ // Check no functions were missed.
+#ifndef NDEBUG
+ DenseSet<const Function *> AllFunctions;
+ for (const auto &Part : Partitions)
+ AllFunctions.insert(Part.begin(), Part.end());
+
for (auto &Fn : M) {
if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
- SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
- Partitions[0].insert(&Fn);
+ assert(AllFunctions.contains(&Fn) && "Missed a function?!");
}
}
-
- SML << "--Partitioning Done--\n\n";
+#endif
return Partitions;
}
@@ -604,10 +606,17 @@ static void externalize(GlobalValue &GV) {
if (!GV.hasName())
GV.setName("__llvmsplit_unnamed");
}
-} // end anonymous namespace
-void llvm::splitAMDGPUModule(
- const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+static bool hasDirectCaller(const Function &Fn) {
+ for (auto &U : Fn.uses()) {
+ if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
+ return true;
+ }
+ return false;
+}
+
+static void splitAMDGPUModule(
+ GetTTIFn GetTTI, Module &M, unsigned N,
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
SplitModuleLogger SML(M);
@@ -648,15 +657,36 @@ void llvm::splitAMDGPUModule(
// Start by calculating the cost of every function in the module, as well as
// the module's overall cost.
DenseMap<const Function *, CostType> FnCosts;
- const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);
+ const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
- // Gather every kernel into a WorkList, then sort it by descending total cost
- // of the kernel so the biggest kernels are seen first.
- SmallVector<KernelWithDependencies> WorkList;
+ // First, gather ever kernel into the worklist.
+ SmallVector<FunctionWithDependencies> WorkList;
for (auto &Fn : M) {
if (isEntryPoint(&Fn) && !Fn.isDeclaration())
WorkList.emplace_back(SML, CG, FnCosts, &Fn);
}
+
+ // Then, find missing functions that need to be considered as additional
+ // roots. These can't be called in theory, but in practice we still have to
+ // handle them to avoid linker errors.
+ {
+ DenseSet<const Function *> SeenFunctions;
+ for (const auto &FWD : WorkList) {
+ SeenFunctions.insert(FWD.Fn);
+ SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+ }
+
+ for (auto &Fn : M) {
+ // If this function is not part of any kernel's dependencies and isn't
+ // directly called, consider it as a root.
+ if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
+ !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
+ WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+ }
+ }
+ }
+
+ // Sort the worklist so the most expensive roots are seen first.
sort(WorkList, [&](auto &A, auto &B) {
// Sort by total cost, and if the total cost is identical, sort
// alphabetically.
@@ -667,13 +697,20 @@ void llvm::splitAMDGPUModule(
if (SML) {
SML << "Worklist\n";
- for (const auto &KWD : WorkList) {
- SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
- << " indirect:" << KWD.HasIndirectCall
- << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
+ for (const auto &FWD : WorkList) {
+ SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
+ << " indirect:" << FWD.HasIndirectCall
+ << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
<< ")\n";
- for (const auto *Dep : KWD.Dependencies)
- SML << " [Dep] " << getName(*Dep) << '\n';
+ // Sort function names before printing to ensure determinism.
+ SmallVector<std::string> SortedDepNames;
+ SortedDepNames.reserve(FWD.Dependencies.size());
+ for (const auto *Dep : FWD.Dependencies)
+ SortedDepNames.push_back(getName(*Dep));
+ sort(SortedDepNames);
+
+ for (const auto &Name : SortedDepNames)
+ SML << " [dependency] " << Name << '\n';
}
}
@@ -700,16 +737,8 @@ void llvm::splitAMDGPUModule(
std::unique_ptr<Module> MPart(
CloneModule(M, VMap, [&](const GlobalValue *GV) {
// Functions go in their assigned partition.
- if (const auto *Fn = dyn_cast<Function>(GV)) {
-// Check we don't import an external linkage function in any
-// partition other than P0.
-#ifndef NDEBUG
- if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
- assert((I == 0) == FnsInPart.contains(Fn));
- }
-#endif
+ if (const auto *Fn = dyn_cast<Function>(GV))
return FnsInPart.contains(Fn);
- }
if (NeedsConservativeImport(GV))
return true;
@@ -742,3 +771,16 @@ void llvm::splitAMDGPUModule(
<< format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
<< "% of original module)\n";
}
+} // namespace
+
+PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+ splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
+ // We don't change the original module.
+ return PreservedAnalyses::all();
+}