diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 266 |
1 files changed, 233 insertions, 33 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e2a10be4c2c7..0c76ff2ec5ea 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include <optional> using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -938,6 +940,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasBF16TransInsts()) { + setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal); + } + if (Subtarget->hasCvtPkF16F32Inst()) { setOperationAction(ISD::FP_ROUND, {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16}, @@ -3893,7 +3899,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // arguments to begin at SP+0. Completely unused for non-tail calls. int32_t FPDiff = 0; MachineFrameInfo &MFI = MF.getFrameInfo(); - auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + auto *TRI = Subtarget->getRegisterInfo(); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass @@ -8162,6 +8168,14 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, // which is a 64-bit pc-relative offset from the encoding of the $symbol // operand to the global variable. + if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) { + assert(GAFlags != SIInstrInfo::MO_NONE); + + SDValue Ptr = + DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2); + return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr); + } + SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags); SDValue PtrHi; if (GAFlags == SIInstrInfo::MO_NONE) @@ -8211,6 +8225,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, } if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { + if (Subtarget->has64BitLiterals()) { + SDValue Addr = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr), + 0); + } + SDValue AddrLo = DAG.getTargetGlobalAddress( GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; @@ -9289,7 +9310,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { - Module *M = const_cast<Module *>(MF.getFunction().getParent()); + Module *M = MF.getFunction().getParent(); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); auto *RelocSymbol = cast<GlobalVariable>( @@ -9315,6 +9336,44 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), IndexKeyi32); } + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i64) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi64, Op.getOperand(5), + Op.getOperand(6)}); + } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: { + EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ? MVT::i64 + : MVT::i32; + if (Op.getOperand(6).getValueType() == IndexKeyTy) + return SDValue(); + + SDLoc SL(Op); + auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKey, Op.getOperand(7), + Op.getOperand(8)}); // No clamp operand + } case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { @@ -11074,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { assert(VT.getSizeInBits() == 64); SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); + SDValue Cond = DAG.getFreeze(Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); @@ -12155,6 +12214,11 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( if ((bitOpWithConstantIsReducible(Opc, ValLo) || bitOpWithConstantIsReducible(Opc, ValHi)) || (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { + // We have 64-bit scalar and/or/xor, but do not have vector forms. + if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() && + !CRHS->user_begin()->isDivergent()) + return SDValue(); + // If we need to materialize a 64-bit immediate, it will be split up later // anyway. Avoid creating the harder to understand 64-bit immediate // materialization. @@ -13660,6 +13724,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_fdot2: case Intrinsic::amdgcn_trig_preop: + case Intrinsic::amdgcn_tanh: return true; default: break; @@ -14498,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14531,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14643,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -15118,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, - // y is not, and (add y, z) is used only once. - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, - // z is not, and (add y, z) is used only once. - // The goal is to move constant offsets to the outermost ptradd, to create - // more opportunities to fold offsets into memory instructions. - // Together with the generic combines in DAGCombiner.cpp, this also - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). - // - // This transform is here instead of in the general DAGCombiner as it can - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for - // AArch64's CPA. - SDValue X = N0; - SDValue Y = N1.getOperand(0); - SDValue Z = N1.getOperand(1); - if (N1.hasOneUse()) { - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); - if (ZIsConstant != YIsConstant) { - // If both additions in the original were NUW, the new ones are as well. - SDNodeFlags Flags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; - if (YIsConstant) - std::swap(Y, Z); + // The following folds transform PTRADDs into regular arithmetic in cases + // where the PTRADD wouldn't be folded as an immediate offset into memory + // instructions anyway. They are target-specific in that other targets might + // prefer to not lose information about the pointer arithmetic. + + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). + // Adapted from DAGCombiner::visitADDLikeCommutative. + SDValue V, K; + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { + SDNodeFlags ShlFlags = N1->getFlags(); + // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0, + // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be + // preserved. + SDNodeFlags NewShlFlags = + ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap() + ? SDNodeFlags::NoSignedWrap + : SDNodeFlags(); + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); + } + + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in + // performAddCombine. + if (N1.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; + } + } + + // If the 32 low bits of the constant are all zero, there is nothing to fold + // into an immediate offset, so it's better to eliminate the unnecessary + // addition for the lower 32 bits than to preserve the PTRADD. + // Analogous to a fold in performAddCombine. + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with + // global address GA and constant c, such that c can be folded into GA. + SDValue GAValue = N0.getOperand(0); + if (const GlobalAddressSDNode *GA = + dyn_cast<GlobalAddressSDNode>(GAValue)) { + if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) { + // If both additions in the original were NUW, reassociation preserves + // that. + SDNodeFlags Flags = + (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); DCI.AddToWorklist(Inner.getNode()); - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); } } } + if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) + return SDValue(); + + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, + // y is not, and (add y, z) is used only once. + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, + // z is not, and (add y, z) is used only once. + // The goal is to move constant offsets to the outermost ptradd, to create + // more opportunities to fold offsets into memory instructions. + // Together with the generic combines in DAGCombiner.cpp, this also + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). + // + // This transform is here instead of in the general DAGCombiner as it can + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for + // AArch64's CPA. + SDValue X = N0; + SDValue Y = N1.getOperand(0); + SDValue Z = N1.getOperand(1); + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + + // If both additions in the original were NUW, reassociation preserves that. + SDNodeFlags ReassocFlags = + (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + + if (ZIsConstant != YIsConstant) { + if (YIsConstant) + std::swap(Y, Z); + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); + } + + // If one of Y and Z is constant, they have been handled above. If both were + // constant, the addition would have been folded in SelectionDAG::getNode + // already. This ensures that the generic DAG combines won't undo the + // following reassociation. + assert(!YIsConstant && !ZIsConstant); + + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and + // y are uniform and z isn't. + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and + // z are uniform and y isn't. + // The goal is to push uniform operands up in the computation, so that they + // can be handled with scalar operations. We can't use reassociateScalarOps + // for this since it requires two identical commutative operations to + // reassociate. + if (Y->isDivergent()) + std::swap(Y, Z); + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(UniformInner.getNode()); + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); + } + return SDValue(); } @@ -16847,12 +16996,63 @@ static void knownBitsForWorkitemID(const GCNSubtarget &ST, Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); } +static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, + KnownBits &Known, const APInt &DemandedElts, + unsigned BFEWidth, bool SExt, unsigned Depth) { + const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo(); + const MachineOperand &Src1 = MI.getOperand(2); + + unsigned Src1Cst = 0; + if (Src1.isImm()) { + Src1Cst = Src1.getImm(); + } else if (Src1.isReg()) { + auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI); + if (!Cst) + return; + Src1Cst = Cst->Value.getZExtValue(); + } else { + return; + } + + // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit. + // Width is always [22:16]. + const unsigned Offset = + Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6); + const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6); + + if (Width >= BFEWidth) // Ill-formed. + return; + + VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts, + Depth + 1); + + Known = Known.extractBits(Width, Offset); + + if (SExt) + Known = Known.sext(BFEWidth); + else + Known = Known.zext(BFEWidth); +} + void SITargetLowering::computeKnownBitsForTargetInstr( GISelValueTracking &VT, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth) const { + Known.resetAll(); const MachineInstr *MI = MRI.getVRegDef(R); switch (MI->getOpcode()) { + case AMDGPU::S_BFE_I32: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32, + /*SExt=*/true, Depth); + case AMDGPU::S_BFE_U32: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32, + /*SExt=*/false, Depth); + case AMDGPU::S_BFE_I64: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64, + /*SExt=*/true, Depth); + case AMDGPU::S_BFE_U64: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64, + /*SExt=*/false, Depth); case AMDGPU::G_INTRINSIC: case AMDGPU::G_INTRINSIC_CONVERGENT: { Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID(); |
