diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 186 |
1 files changed, 114 insertions, 72 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d93021..6c36f8ad9b6a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); } +SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N, + SelectionDAG &DAG) const { + // TODO: Handle undef as zero + + assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + SDLoc SL(N); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); + return DAG.getMachineNode( + isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL, + N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); + } + + return nullptr; +} + void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); @@ -708,10 +726,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); assert(VT.getVectorElementType().bitsEq(MVT::i32)); - unsigned RegClassID = - SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID(); - SelectBuildVector(N, RegClassID); + const TargetRegisterClass *RegClass = + N->isDivergent() + ? 
TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32) + : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32); + + SelectBuildVector(N, RegClass->getID()); return; } case ISD::VECTOR_SHUFFLE: @@ -1828,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. - // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0. - - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector<SDValue, 3> Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. 
+ bool IsInBounds = + Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. 
+ + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector<SDValue, 3> Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarry()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable? + SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } @@ -4387,16 +4420,25 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const { const auto *Ld = cast<LoadSDNode>(N); - const MachineMemOperand *MMO 
= Ld->getMemOperand(); - if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO)) - return false; + + if (Ld->isDivergent()) { + // FIXME: We ought to be able to take the direct isDivergent result. We + // cannot rely on the MMO for a uniformity check, and should stop using + // it. This is a hack for 2 ways that the IR divergence analysis is superior + // to the DAG divergence: Recognizing shift-of-workitem-id as always + // uniform, and isSingleLaneExecution. These should be handled in the DAG + // version, and then this can be dropped. + if (!MMO->getValue() || !AMDGPU::isUniformMMO(MMO)) + return false; + } return MMO->getSize().hasValue() && Ld->getAlign() >= Align(std::min(MMO->getSize().getValue().getKnownMinValue(), uint64_t(4))) && - ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + (MMO->isInvariant() || + (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && |
