diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 57 |
1 files changed, 25 insertions, 32 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 92a587b5771b..4958a200de4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption( cl::desc("Whether has closed-world assumption at link time"), cl::init(false), cl::Hidden); +static cl::opt<bool> EnableUniformIntrinsicCombine( + "amdgpu-enable-uniform-intrinsic-combine", + cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheR600Target()); @@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); + + if (EnableUniformIntrinsicCombine) + PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -929,8 +937,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { ThinOrFullLTOPhase Phase) { if (Level != OptimizationLevel::O0) { if (!isLTOPreLink(Phase)) { - AMDGPUAttributorOptions Opts; - MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { + AMDGPUAttributorOptions Opts; + MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); + } } } }); @@ -964,7 +974,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(InternalizePass(mustPreserveGV)); PM.addPass(GlobalDCEPass()); } - if (EnableAMDGPUAttributor) { + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { AMDGPUAttributorOptions Opt; if (HasClosedWorldAssumption) Opt.IsClosedWorld = true; @@ -1296,7 +1306,8 @@ void AMDGPUPassConfig::addIRPasses() { if (LowerCtorDtor) addPass(createAMDGPUCtorDtorLoweringLegacyPass()); - if (isPassEnabled(EnableImageIntrinsicOptimizer)) + if (TM.getTargetTriple().isAMDGCN() && + isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); // This can be disabled by passing ::Disable here or on the command line @@ -1384,6 +1395,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(createLoadStoreVectorizerPass()); + if (TM->getTargetTriple().isAMDGCN()) { // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -1392,15 +1408,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the @@ -1408,11 +1415,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(new DummyCGSCCPass()); } - TargetPassConfig::addCodeGenPrepare(); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(createLoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by @@ -2125,6 +2127,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { if (EnableLowerKernelArguments) addPass(AMDGPULowerKernelArgumentsPass(TM)); + Base::addCodeGenPrepare(addPass); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(LoadStoreVectorizerPass()); + // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). // It could be placed anywhere before uniformity annotations (an analysis @@ -2132,25 +2139,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(AMDGPULowerBufferFatPointersPass(TM)); addPass.requireCGSCCOrder(); addPass(AMDGPULowerIntrinsicsPass(TM)); - Base::addCodeGenPrepare(addPass); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the |
