summaryrefslogtreecommitdiff
path: root/llvm/lib/Target
diff options
context:
space:
mode:
authorChristudasan Devadasan <Christudasan.Devadasan@amd.com>2024-07-30 14:46:36 +0530
committerChristudasan Devadasan <Christudasan.Devadasan@amd.com>2024-08-02 11:33:49 +0530
commitf3633f18518dfd08e3c5dbc1392412d5d2cb6514 (patch)
tree486fcdbc0e1c5e2ea81e79699c435b4a45170da2 /llvm/lib/Target
parentc937051492a77490d32be60e00135e227840cebf (diff)
[AMDGPU][SILoadStoreOptimizer] Include constrained buffer load variantsusers/cdevadas/combine-constrained-buffer-loads
Use the constrained buffer load opcodes while combining under-aligned load for XNACK enabled subtargets.
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp75
1 files changed, 63 insertions, 12 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae537b194f50..7553c370f694 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return S_BUFFER_LOAD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,6 +727,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
Result.SOffset = true;
[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
@@ -710,6 +738,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
return New;
}
+static bool needsConstraintedOpcode(const GCNSubtarget &STM,
+ const MachineMemOperand *MMO,
+ unsigned Width) {
+ return STM.isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+}
+
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
const CombineInfo &Paired) {
const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1734,51 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
- case S_BUFFER_LOAD_IMM:
+ case S_BUFFER_LOAD_IMM: {
+ const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
- case S_BUFFER_LOAD_SGPR_IMM:
+ }
+ case S_BUFFER_LOAD_SGPR_IMM: {
+ const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
+ }
case S_LOAD_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
- bool NeedsConstrainedOpc =
- STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+ bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
switch (Width) {
default:
return 0;