summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp677
1 files changed, 521 insertions, 156 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 66c1dfc71c2f..2a977247bc2c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1263,22 +1263,61 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
static unsigned getIntrMemWidth(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
return 8;
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
return 32;
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
return 64;
case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
return 128;
default:
llvm_unreachable("Unknown width");
}
}
+static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
+ TargetLoweringBase::IntrinsicInfo &Info) {
+ Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
+ unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
+ switch (AtomicOrderingCABI(Ord)) {
+ case AtomicOrderingCABI::acquire:
+ Info.order = AtomicOrdering::Acquire;
+ break;
+ case AtomicOrderingCABI::release:
+ Info.order = AtomicOrdering::Release;
+ break;
+ case AtomicOrderingCABI::seq_cst:
+ Info.order = AtomicOrdering::SequentiallyConsistent;
+ break;
+ default:
+ Info.order = AtomicOrdering::Monotonic;
+ break;
+ }
+
+ Info.flags =
+ (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore);
+ Info.flags |= MOCooperative;
+
+ MDNode *ScopeMD = cast<MDNode>(
+ cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
+ StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
+ Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1506,6 +1545,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_load_monitor_b32:
case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_monitor_b128:
+ case Intrinsic::amdgcn_cluster_load_b32:
+ case Intrinsic::amdgcn_cluster_load_b64:
+ case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1525,6 +1567,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad;
return true;
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
+ return true;
+ }
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.align.reset();
+ getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1553,7 +1615,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
- case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
Info.ptrVal = CI.getArgOperand(1);
@@ -1636,6 +1702,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Value *Ptr = nullptr;
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_cluster_load_b128:
+ case Intrinsic::amdgcn_cluster_load_b64:
+ case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1678,6 +1747,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
Ptr = II->getArgOperand(1);
break;
default:
@@ -4260,6 +4333,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
break;
}
+ // If the caller is a whole wave function, we need to use a special opcode
+ // so we can patch up EXEC.
+ if (Info->isWholeWaveFunction())
+ OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
+
return DAG.getNode(OPC, DL, MVT::Other, Ops);
}
@@ -5192,7 +5270,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
-static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ // For targets older than GFX12, we emit a sequence of 32-bit operations.
+ // For GFX12, we emit s_add_u64 and s_sub_u64.
+ MachineFunction *MF = BB->getParent();
+ const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+ if (ST.hasScalarAddSub64()) {
+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+ // clang-format off
+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+ .add(Src0)
+ .add(Src1);
+ // clang-format on
+ } else {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ }
+ MI.eraseFromParent();
+ return BB;
+}
+
+static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
switch (Opc) {
case AMDGPU::S_MIN_U32:
return std::numeric_limits<uint32_t>::max();
@@ -5210,10 +5339,42 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
default:
- llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+ llvm_unreachable(
+ "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
}
}
+static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
+ return std::numeric_limits<uint64_t>::max();
+ case AMDGPU::V_CMP_LT_I64_e64: // min.i64
+ return std::numeric_limits<int64_t>::max();
+ case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
+ return std::numeric_limits<uint64_t>::min();
+ case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+ return std::numeric_limits<int64_t>::min();
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_XOR_B64:
+ return std::numeric_limits<uint64_t>::min();
+ case AMDGPU::S_AND_B64:
+ return std::numeric_limits<uint64_t>::max();
+ default:
+ llvm_unreachable(
+ "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
+ }
+}
+
+static bool is32bitWaveReduceOperation(unsigned Opc) {
+ return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
+ Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
+ Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
+ Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
+ Opc == AMDGPU::S_XOR_B32;
+}
+
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock &BB,
const GCNSubtarget &ST,
@@ -5241,53 +5402,99 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
RetBB = &BB;
break;
}
+ case AMDGPU::V_CMP_LT_U64_e64: // umin
+ case AMDGPU::V_CMP_LT_I64_e64: // min
+ case AMDGPU::V_CMP_GT_U64_e64: // umax
+ case AMDGPU::V_CMP_GT_I64_e64: // max
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64: {
+ // Idempotent operations.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
+ RetBB = &BB;
+ break;
+ }
case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
- case AMDGPU::S_SUB_I32: {
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
- Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+ Register NumActiveLanes =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- unsigned CountReg =
+ unsigned BitCountOpc =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
- auto Exec =
- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
- auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
- .addReg(Exec->getOperand(0).getReg());
+ auto NewAccumulator =
+ BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
+ .addReg(ExecMask);
switch (Opc) {
- case AMDGPU::S_XOR_B32: {
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_XOR_B64: {
// Performing an XOR operation on a uniform value
// depends on the parity of the number of active lanes.
// For even parity, the result will be 0, for odd
// parity the result will be the same as the input value.
- Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
- auto ParityReg =
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
- .addReg(NewAccumulator->getOperand(0).getReg())
- .addImm(1);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(SrcReg)
- .addReg(ParityReg->getOperand(0).getReg());
+ Register ParityRegister =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .addImm(1)
+ .setOperandDead(3); // Dead scc
+ if (Opc == AMDGPU::S_XOR_B32) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(ParityRegister);
+ } else {
+ Register DestSub0 =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *SrcSubRC =
+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+ .add(Op1L)
+ .addReg(ParityRegister);
+
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
+ .add(Op1H)
+ .addReg(ParityRegister);
+
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ }
break;
}
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
// Take the negation of the source operand.
- auto InvertedValReg =
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
- .addImm(-1)
- .addReg(SrcReg);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+ .addImm(0)
+ .addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(InvertedValReg->getOperand(0).getReg())
+ .addReg(NegatedVal)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
@@ -5297,6 +5504,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Op1H_Op0L_Reg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Op1L_Op0H_Reg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NegatedValLo =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NegatedValHi =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+ .addImm(0)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .setOperandDead(3); // Dead scc
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+ .addReg(NegatedValLo)
+ .addImm(31)
+ .setOperandDead(3); // Dead scc
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+ .add(Op1L)
+ .addReg(NegatedValHi);
+ }
+ Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+ ? NegatedValLo
+ : NewAccumulator->getOperand(0).getReg();
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+ .add(Op1L)
+ .addReg(LowOpcode);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+ .add(Op1L)
+ .addReg(LowOpcode);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+ .add(Op1H)
+ .addReg(LowOpcode);
+
+ Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+ .addReg(CarryReg)
+ .addReg(Op1H_Op0L_Reg)
+ .setOperandDead(3); // Dead scc
+
+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+ .addReg(HiVal)
+ .addReg(Op1L_Op0H_Reg)
+ .setOperandDead(3); // Dead scc
+ }
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ break;
+ }
}
RetBB = &BB;
}
@@ -5313,6 +5589,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// so that we will get the next active lane for next iteration.
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
+ bool is32BitOpc = is32bitWaveReduceOperation(Opc);
// Create Control flow for loop
// Split MI's Machine Basic block into For loop
@@ -5322,73 +5599,160 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+ Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
- Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
- Register LaneValueReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
bool IsWave32 = ST.isWave32();
- unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Create initial values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
- uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
- auto TmpSReg =
- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
- .addImm(InitalValue);
+ BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+ if (is32BitOpc) {
+ uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+ .addImm(IdentityValue);
+ } else {
+ uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
+ .addImm(IdentityValue);
+ }
// clang-format off
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
// clang-format on
// Start constructing ComputeLoop
- I = ComputeLoop->end();
+ I = ComputeLoop->begin();
auto Accumulator =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
- .addReg(InitalValReg)
+ .addReg(IdentityValReg)
.addMBB(&BB);
auto ActiveBits =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
- .addReg(TmpSReg->getOperand(0).getReg())
+ .addReg(LoopIterator)
.addMBB(&BB);
+ I = ComputeLoop->end();
+ MachineInstr *NewAccumulator;
// Perform the computations
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
- auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBits->getOperand(0).getReg());
- auto LaneValue = BuildMI(*ComputeLoop, I, DL,
- TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
- .addReg(SrcReg)
- .addReg(FF1->getOperand(0).getReg());
- auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValue->getOperand(0).getReg());
-
+ BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+ .addReg(ActiveBitsReg);
+ if (is32BitOpc) {
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueReg)
+ .addReg(SrcReg)
+ .addReg(FF1Reg);
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValueReg);
+ } else {
+ Register LaneValueLoReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValueHiReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *SrcSubRC =
+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+ // lane value input should be in an sgpr
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueLoReg)
+ .add(Op1L)
+ .addReg(FF1Reg);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueHiReg)
+ .add(Op1H)
+ .addReg(FF1Reg);
+ auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+ TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+ .addReg(LaneValueLoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(LaneValueHiReg)
+ .addImm(AMDGPU::sub1);
+ switch (Opc) {
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_XOR_B64: {
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValue->getOperand(0).getReg())
+ .setOperandDead(3); // Dead scc
+ break;
+ }
+ case AMDGPU::V_CMP_GT_I64_e64:
+ case AMDGPU::V_CMP_GT_U64_e64:
+ case AMDGPU::V_CMP_LT_I64_e64:
+ case AMDGPU::V_CMP_LT_U64_e64: {
+ Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register ComparisonResultReg =
+ MRI.createVirtualRegister(WaveMaskRegClass);
+ const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
+ const TargetRegisterClass *VSubRegClass =
+ TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+ Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+ MachineOperand SrcReg0Sub0 =
+ TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+ VregClass, AMDGPU::sub0, VSubRegClass);
+ MachineOperand SrcReg0Sub1 =
+ TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+ VregClass, AMDGPU::sub1, VSubRegClass);
+ BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+ AccumulatorVReg)
+ .add(SrcReg0Sub0)
+ .addImm(AMDGPU::sub0)
+ .add(SrcReg0Sub1)
+ .addImm(AMDGPU::sub1);
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+ .addReg(LaneValue->getOperand(0).getReg())
+ .addReg(AccumulatorVReg);
+
+ unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+ .addReg(LaneMaskReg)
+ .addReg(ActiveBitsReg);
+
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+ .addReg(LaneValue->getOperand(0).getReg())
+ .addReg(Accumulator->getOperand(0).getReg());
+ break;
+ }
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValue->getOperand(0).getReg());
+ ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
+ break;
+ }
+ }
+ }
// Manipulate the iterator to get the next active lane
unsigned BITSETOpc =
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
- auto NewActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
- .addReg(FF1->getOperand(0).getReg())
- .addReg(ActiveBits->getOperand(0).getReg());
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+ .addReg(FF1Reg)
+ .addReg(ActiveBitsReg);
// Add phi nodes
- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
- .addMBB(ComputeLoop);
- ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
- .addMBB(ComputeLoop);
+ Accumulator.addReg(DstReg).addMBB(ComputeLoop);
+ ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
// Creating branching
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
- .addReg(NewActiveBits->getOperand(0).getReg())
+ .addReg(NewActiveBitsReg)
.addImm(0);
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
.addMBB(ComputeLoop);
@@ -5410,22 +5774,40 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+ case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+ case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
@@ -5452,55 +5834,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
- // For targets older than GFX12, we emit a sequence of 32-bit operations.
- // For GFX12, we emit s_add_u64 and s_sub_u64.
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const DebugLoc &DL = MI.getDebugLoc();
- MachineOperand &Dest = MI.getOperand(0);
- MachineOperand &Src0 = MI.getOperand(1);
- MachineOperand &Src1 = MI.getOperand(2);
- bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
- if (Subtarget->hasScalarAddSub64()) {
- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
- // clang-format off
- BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
- .add(Src0)
- .add(Src1);
- // clang-format on
- } else {
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
-
- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
- .add(Src0Sub0)
- .add(Src1Sub0);
- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
- .add(Src0Sub1)
- .add(Src1Sub1);
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
- }
- MI.eraseFromParent();
- return BB;
+ return Expand64BitScalarArithmetic(MI, BB);
}
case AMDGPU::V_ADD_U64_PSEUDO:
case AMDGPU::V_SUB_U64_PSEUDO: {
@@ -6023,14 +6357,15 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return SplitBB;
}
+ case AMDGPU::SI_TCRETURN_GFX_WholeWave:
case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
assert(MFI->isWholeWaveFunction());
// During ISel, it's difficult to propagate the original EXEC mask to use as
// an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
- Register OriginalExec = Setup->getOperand(0).getReg();
assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
+ Register OriginalExec = Setup->getOperand(0).getReg();
MF->getRegInfo().clearKillFlags(OriginalExec);
MI.getOperand(0).setReg(OriginalExec);
return BB;
@@ -10246,6 +10581,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+ MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Ptr = Op->getOperand(2);
+ EVT VT = Op->getValueType(0);
+ return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
+ Chain, Ptr, MII->getMemOperand());
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -10421,41 +10766,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
- case Intrinsic::amdgcn_s_barrier:
- case Intrinsic::amdgcn_s_barrier_signal:
- case Intrinsic::amdgcn_s_barrier_wait: {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
- unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
- if (WGSize <= ST.getWavefrontSize()) {
- // If the workgroup fits in a wave, remove s_barrier_signal and lower
- // s_barrier/s_barrier_wait to wave_barrier.
- if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
- return Op.getOperand(0);
- else
- return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
- MVT::Other, Op.getOperand(0)),
- 0);
- }
- }
-
- if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- SDValue K =
- DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
- SDValue BarSignal =
- SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
- MVT::Other, K, Op.getOperand(0)),
- 0);
- SDValue BarWait =
- SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
- BarSignal.getValue(0)),
- 0);
- return BarWait;
- }
-
- return SDValue();
- };
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
@@ -10913,6 +11223,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+ MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Ptr = Op->getOperand(2);
+ SDValue Val = Op->getOperand(3);
+ return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
+ Ptr, MII->getMemOperand());
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -16933,10 +17253,12 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
switch (BitWidth) {
case 16:
RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
- : &AMDGPU::VGPR_32RegClass;
+ : &AMDGPU::VGPR_32_Lo256RegClass;
break;
default:
- RC = TRI->getVGPRClassForBitWidth(BitWidth);
+ RC = Subtarget->has1024AddressableVGPRs()
+ ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
+ : TRI->getVGPRClassForBitWidth(BitWidth);
if (!RC)
return std::pair(0U, nullptr);
break;
@@ -16980,7 +17302,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
if (Kind != '\0') {
if (Kind == 'v') {
- RC = &AMDGPU::VGPR_32RegClass;
+ RC = &AMDGPU::VGPR_32_Lo256RegClass;
} else if (Kind == 's') {
RC = &AMDGPU::SGPR_32RegClass;
} else if (Kind == 'a') {
@@ -17022,6 +17344,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
return std::pair(0U, nullptr);
if (Idx < RC->getNumRegs())
return std::pair(RC->getRegister(Idx), RC);
+ return std::pair(0U, nullptr);
}
}
@@ -17808,11 +18131,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
!AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+ // For GAS, lower to flat atomic.
+ return STI.hasGloballyAddressableScratch()
+ ? TargetLowering::AtomicExpansionKind::CustomExpand
+ : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
// 64-bit flat atomics that dynamically reside in private memory will silently
// be dropped.
@@ -17823,7 +18154,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (AS == AMDGPUAS::FLAT_ADDRESS &&
DL.getTypeSizeInBits(RMW->getType()) == 64 &&
flatInstrMayAccessPrivate(RMW))
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
OptimizationRemarkEmitter ORE(RMW->getFunction());
@@ -17898,7 +18229,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// does. InstCombine transforms these with 0 to or, so undo that.
if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
ConstVal && ConstVal->isNullValue())
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
}
// If the allocation could be in remote, fine-grained memory, the rmw
@@ -18027,9 +18358,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// fadd.
if (Subtarget->hasLDSFPAtomicAddF32()) {
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
- return AtomicExpansionKind::Expand;
+ return AtomicExpansionKind::CustomExpand;
}
}
}
@@ -18083,14 +18414,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
@@ -18098,7 +18429,7 @@ TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
unsigned AddrSpace = CmpX->getPointerAddressSpace();
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
return AtomicExpansionKind::None;
@@ -18109,7 +18440,7 @@ SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
// If a 64-bit flat atomic may alias private, we need to avoid using the
// atomic in the private case.
- return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
: AtomicExpansionKind::None;
}
@@ -18468,9 +18799,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
Builder.CreateBr(ExitBB);
}
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+ unsigned PtrOpIdx) {
+ Value *PtrOp = I->getOperand(PtrOpIdx);
+ assert(PtrOp->getType()->getPointerAddressSpace() ==
+ AMDGPUAS::PRIVATE_ADDRESS);
+
+ Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+ Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+ I->getIterator());
+ I->setOperand(PtrOpIdx, ASCast);
+}
+
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AtomicRMWInst::BinOp Op = AI->getOperation();
+ if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
Op == AtomicRMWInst::Xor) {
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18493,9 +18839,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
}
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
emitExpandAtomicAddrSpacePredicate(CI);
}
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+ if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+ if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);