summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp179
1 files changed, 102 insertions, 77 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 962c276bc212..5297816ec1f2 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,7 @@ struct FoldCandidate {
class SIFoldOperandsImpl {
public:
+ MachineFunction *MF;
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
@@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
MachineOperand *New = Fold.Def.OpToFold;
+
+ // Verify the register is compatible with the operand.
+ if (const TargetRegisterClass *OpRC =
+ TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
+ const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
+ const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+ unsigned NewSubReg = New->getSubReg();
+ unsigned OldSubReg = Old.getSubReg();
+
+ const TargetRegisterClass *ConstrainRC = OpRC;
+ if (NewSubReg && OldSubReg) {
+ unsigned PreA, PreB;
+ ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
+ NewSubReg, PreA, PreB);
+ } else if (OldSubReg) {
+ ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
+ } else if (NewSubReg) {
+ ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
+ }
+
+ if (!ConstrainRC)
+ return false;
+
+ if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
+ LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
+ << TRI->getRegClassName(ConstrainRC) << '\n');
+ return false;
+ }
+ }
+
// Rework once the VS_16 register class is updated to include proper
// 16-bit SGPRs instead of 32-bit ones.
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -1248,6 +1279,7 @@ void SIFoldOperandsImpl::foldOperand(
if (FoldingImmLike && UseMI->isCopy()) {
Register DestReg = UseMI->getOperand(0).getReg();
Register SrcReg = UseMI->getOperand(1).getReg();
+ unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
assert(SrcReg.isVirtual());
const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
@@ -1259,63 +1291,74 @@ void SIFoldOperandsImpl::foldOperand(
return;
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
- if (!DestReg.isPhysical() && DestRC == &AMDGPU::AGPR_32RegClass) {
- std::optional<int64_t> UseImmVal = OpToFold.getEffectiveImmVal();
- if (UseImmVal && TII->isInlineConstant(
- *UseImmVal, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
- UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
- UseMI->getOperand(1).ChangeToImmediate(*UseImmVal);
- CopiesToReplace.push_back(UseMI);
- return;
+ // In order to fold immediates into copies, we need to change the copy to a
+ // MOV. Find a compatible mov instruction with the value.
+ for (unsigned MovOp :
+ {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
+ AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
+ AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
+ AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
+ const MCInstrDesc &MovDesc = TII->get(MovOp);
+ assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
+
+ const TargetRegisterClass *MovDstRC =
+ TRI->getRegClass(MovDesc.operands()[0].RegClass);
+
+ // Fold only if the MOV's destination register class (MovDstRC) is a
+ // superclass of (or equal to) the destination register class of the
+ // COPY (DestRC). If this condition fails, folding would be illegal.
+ if (!DestRC->hasSuperClassEq(MovDstRC))
+ continue;
+
+ const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
+ const TargetRegisterClass *MovSrcRC =
+ TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);
+ if (MovSrcRC) {
+ if (UseSubReg)
+ MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
+ if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
+ break;
+
+ // FIXME: This is mutating the instruction only and deferring the actual
+ // fold of the immediate
+ } else {
+ // For the _IMM_PSEUDO cases, there can be value restrictions on the
+ // immediate to verify. Technically we should always verify this, but it
+ // only matters for these concrete cases.
+ // TODO: Handle non-imm case if it's useful.
+ if (!OpToFold.isImm() ||
+ !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
+ break;
}
- }
- // Allow immediates COPYd into sgpr_lo16 to be further folded while
- // still being legal if not further folded
- if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
- assert(ST->useRealTrue16Insts());
- MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
- DestRC = &AMDGPU::SGPR_32RegClass;
+ MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
+ MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
+ while (ImpOpI != ImpOpE) {
+ MachineInstr::mop_iterator Tmp = ImpOpI;
+ ImpOpI++;
+ UseMI->removeOperand(UseMI->getOperandNo(Tmp));
+ }
+ UseMI->setDesc(MovDesc);
+
+ if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
+ const auto &SrcOp = UseMI->getOperand(UseOpIdx);
+ MachineOperand NewSrcOp(SrcOp);
+ MachineFunction *MF = UseMI->getParent()->getParent();
+ UseMI->removeOperand(1);
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
+ UseMI->addOperand(NewSrcOp); // src0
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
+ UseOpIdx = SrcIdx;
+ UseOp = &UseMI->getOperand(UseOpIdx);
+ }
+ CopiesToReplace.push_back(UseMI);
+ break;
}
- // In order to fold immediates into copies, we need to change the
- // copy to a MOV.
-
- unsigned MovOp = TII->getMovOpcode(DestRC);
- if (MovOp == AMDGPU::COPY)
- return;
-
- // Fold if the destination register class of the MOV instruction (ResRC)
- // is a superclass of (or equal to) the destination register class of the
- // COPY (DestRC). If this condition fails, folding would be illegal.
- const MCInstrDesc &MovDesc = TII->get(MovOp);
- assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
- const TargetRegisterClass *ResRC =
- TRI->getRegClass(MovDesc.operands()[0].RegClass);
- if (!DestRC->hasSuperClassEq(ResRC))
+ // We failed to replace the copy, so give up.
+ if (UseMI->getOpcode() == AMDGPU::COPY)
return;
- MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
- MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
- while (ImpOpI != ImpOpE) {
- MachineInstr::mop_iterator Tmp = ImpOpI;
- ImpOpI++;
- UseMI->removeOperand(UseMI->getOperandNo(Tmp));
- }
- UseMI->setDesc(TII->get(MovOp));
-
- if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
- const auto &SrcOp = UseMI->getOperand(UseOpIdx);
- MachineOperand NewSrcOp(SrcOp);
- MachineFunction *MF = UseMI->getParent()->getParent();
- UseMI->removeOperand(1);
- UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
- UseMI->addOperand(NewSrcOp); // src0
- UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
- UseOpIdx = 2;
- UseOp = &UseMI->getOperand(UseOpIdx);
- }
- CopiesToReplace.push_back(UseMI);
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
UseMI->getOperand(0).getReg().isVirtual() &&
@@ -1430,30 +1473,9 @@ void SIFoldOperandsImpl::foldOperand(
return;
}
- if (!FoldingImmLike) {
- if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
- // Don't fold if OpToFold doesn't hold an aligned register.
- const TargetRegisterClass *RC =
- TRI->getRegClassForReg(*MRI, OpToFold.getReg());
- assert(RC);
- if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
- unsigned SubReg = OpToFold.getSubReg();
- if (const TargetRegisterClass *SubRC =
- TRI->getSubRegisterClass(RC, SubReg))
- RC = SubRC;
- }
-
- if (!RC || !TRI->isProperlyAlignedRC(*RC))
- return;
- }
-
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
-
- // FIXME: We could try to change the instruction from 64-bit to 32-bit
- // to enable more folding opportunities. The shrink operands pass
- // already does this.
- return;
- }
+ // FIXME: We could try to change the instruction from 64-bit to 32-bit
+ // to enable more folding opportunities. The shrink operands pass
+ // already does this.
tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
}
@@ -1931,8 +1953,10 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
// Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
// creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
// later, create a copy here and track if we already have such a copy.
- if (TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg) !=
- VGPRUseSubRC) {
+ const TargetRegisterClass *SubRC =
+ TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
+ if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
+ // TODO: Try to reconstrain class
VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
B.addReg(VGPRCopy);
@@ -2748,6 +2772,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
}
bool SIFoldOperandsImpl::run(MachineFunction &MF) {
+ this->MF = &MF;
MRI = &MF.getRegInfo();
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();