summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp186
1 files changed, 114 insertions, 72 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b8b419d93021..6c36f8ad9b6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
+SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
+ SelectionDAG &DAG) const {
+ // TODO: Handle undef as zero
+
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ SDLoc SL(N);
+ uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
+ return DAG.getMachineNode(
+ isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
+ N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
+ }
+
+ return nullptr;
+}
+
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
@@ -708,10 +726,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
}
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
assert(VT.getVectorElementType().bitsEq(MVT::i32));
- unsigned RegClassID =
- SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
- SelectBuildVector(N, RegClassID);
+ const TargetRegisterClass *RegClass =
+ N->isDivergent()
+ ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
+ : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
+
+ SelectBuildVector(N, RegClass->getID());
return;
}
case ISD::VECTOR_SHUFFLE:
@@ -1828,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
isFlatScratchBaseLegal(Addr))) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- } else {
- // If the offset doesn't fit, put the low bits into the offset field and
- // add the rest.
- //
- // For a FLAT instruction the hardware decides whether to access
- // global/scratch/shared memory based on the high bits of vaddr,
- // ignoring the offset field, so we have to ensure that when we add
- // remainder to vaddr it still points into the same underlying object.
- // The easiest way to do that is to make sure that we split the offset
- // into two pieces that are both >= 0 or both <= 0.
-
- SDLoc DL(N);
- uint64_t RemainderOffset;
-
- std::tie(OffsetVal, RemainderOffset) =
- TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
-
- SDValue AddOffsetLo =
- getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
- if (Addr.getValueType().getSizeInBits() == 32) {
- SmallVector<SDValue, 3> Opnds;
- Opnds.push_back(N0);
- Opnds.push_back(AddOffsetLo);
- unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
- if (Subtarget->hasAddNoCarry()) {
- AddOp = AMDGPU::V_ADD_U32_e64;
- Opnds.push_back(Clamp);
- }
- Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ // Adding the offset to the base address in a FLAT instruction must not
+ // change the memory aperture in which the address falls. Therefore we can
+ // only fold offsets from inbounds GEPs into FLAT instructions.
+ bool IsInBounds =
+ Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
+ if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
} else {
- // TODO: Should this try to use a scalar add pseudo if the base address
- // is uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub1);
-
- SDValue AddOffsetHi =
- getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-
- SDNode *Add =
- CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs),
- 0);
+ // If the offset doesn't fit, put the low bits into the offset field
+ // and add the rest.
+ //
+ // For a FLAT instruction the hardware decides whether to access
+ // global/scratch/shared memory based on the high bits of vaddr,
+ // ignoring the offset field, so we have to ensure that when we add
+ // remainder to vaddr it still points into the same underlying object.
+ // The easiest way to do that is to make sure that we split the offset
+ // into two pieces that are both >= 0 or both <= 0.
+
+ SDLoc DL(N);
+ uint64_t RemainderOffset;
+
+ std::tie(OffsetVal, RemainderOffset) =
+ TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
+
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ if (Addr.getValueType().getSizeInBits() == 32) {
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(N0);
+ Opnds.push_back(AddOffsetLo);
+ unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ AddOp = AMDGPU::V_ADD_U32_e64;
+ Opnds.push_back(Clamp);
+ }
+ Addr =
+ SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ } else {
+ // TODO: Should this try to use a scalar add pseudo if the base
+ // address is uniform and saddr is usable?
+ SDValue Sub0 =
+ CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 =
+ CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub1);
+
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
+ MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
}
@@ -4387,16 +4420,25 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
const auto *Ld = cast<LoadSDNode>(N);
-
const MachineMemOperand *MMO = Ld->getMemOperand();
- if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
- return false;
+
+ if (Ld->isDivergent()) {
+ // FIXME: We ought to able able to take the direct isDivergent result. We
+ // cannot rely on the MMO for a uniformity check, and should stop using
+ // it. This is a hack for 2 ways that the IR divergence analysis is superior
+ // to the DAG divergence: Recognizing shift-of-workitem-id as always
+ // uniform, and isSingleLaneExecution. These should be handled in the DAG
+ // version, and then this can be dropped.
+ if (!MMO->getValue() || !AMDGPU::isUniformMMO(MMO))
+ return false;
+ }
return MMO->getSize().hasValue() &&
Ld->getAlign() >=
Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
uint64_t(4))) &&
- ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ (MMO->isInvariant() ||
+ (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
(Subtarget->getScalarizeGlobalBehavior() &&
Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&