summary | refs | log | tree | commit | diff
path: root/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp')
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp 503
1 files changed, 288 insertions, 215 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb4bb1195f78..d3fb657851fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -198,6 +198,12 @@ static bool IsPTXVectorType(MVT VT) {
static std::optional<std::pair<unsigned int, MVT>>
getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
unsigned AddressSpace) {
+ const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
+
+ if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
+ VectorEVT.getSizeInBits() == 256)
+ return {{4, MVT::i64}};
+
if (!VectorEVT.isSimple())
return std::nullopt;
const MVT VectorVT = VectorEVT.getSimpleVT();
@@ -214,8 +220,6 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
// The size of the PTX virtual register that holds a packed type.
unsigned PackRegSize;
- bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
-
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 stores of <2 x double> here
// but I'm leaving that as a TODO for now.
@@ -539,6 +543,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
+ case ISD::FMAXIMUMNUM:
+ case ISD::FMINIMUMNUM:
IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
break;
case ISD::FEXP2:
@@ -702,57 +708,66 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- // Turn FP extload into load/fpextend
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
- // Turn FP truncstore into trunc + store.
- // FIXME: vector types should also be expanded
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
- setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
+ // FP extload/truncstore is not legal in PTX. We need to expand all these.
+ for (auto FloatVTs :
+ {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) {
+ for (MVT ValVT : FloatVTs) {
+ for (MVT MemVT : FloatVTs) {
+ setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
+ setTruncStoreAction(ValVT, MemVT, Expand);
+ }
+ }
+ }
- // PTX does not support load / store predicate registers
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
+ // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+ // how they'll be lowered in ISel anyway, and by doing this a little earlier
+ // we allow for more DAG combine opportunities.
+ for (auto IntVTs :
+ {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()})
+ for (MVT ValVT : IntVTs)
+ for (MVT MemVT : IntVTs)
+ if (isTypeLegal(ValVT))
+ setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
+ // PTX does not support load / store predicate registers
+ setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1,
+ Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
}
+ // Disable generation of extload/truncstore for v2i16/v2i8. The generic
+ // expansion for these nodes when they are unaligned is incorrect if the
+ // type is a vector.
+ //
+ // TODO: Fix the generic expansion for these nodes found in
+ // TargetLowering::expandUnalignedLoad/Store.
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
+ MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+
+ // Register custom handling for illegal type loads/stores. We'll try to custom
+ // lower almost all illegal types and logic in the lowering will discard cases
+ // we can't handle.
+ setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes())
+ if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
+ setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
+
+ // Custom legalization for LDU intrinsics.
+ // TODO: The logic to lower these is not very robust and we should rewrite it.
+ // Perhaps LDU should not be represented as an intrinsic at all.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes())
+ if (IsPTXVectorType(VT))
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+
setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE,
ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT,
ISD::SETGE, ISD::SETLE},
MVT::i1, Expand);
- // expand extload of vector of integers.
- setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
- MVT::v2i8, Expand);
- setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
-
// This is legal in NVPTX
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -767,24 +782,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// DEBUGTRAP can be lowered to PTX brkpt
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
- // Register custom handling for vector loads/stores
- for (MVT VT : MVT::fixedlen_vector_valuetypes())
- if (IsPTXVectorType(VT))
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT,
- Custom);
-
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN},
- {MVT::i128, MVT::f128}, Custom);
-
// Support varargs.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
- // Custom handling for i8 intrinsics
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
{MVT::i16, MVT::i32, MVT::i64}, Legal);
@@ -988,7 +991,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
- for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
+ for (const auto &Op :
+ {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
setFP16OperationAction(Op, MVT::f16, Legal, Promote);
@@ -1039,7 +1043,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
- // No FPOW or FREM in PTX.
+
+ // atom.b128 is legal in PTX but since we don't represent i128 as a legal
+ // type, we need to custom lower it.
+ setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
+ Custom);
// Now deduce the information based on the above mentioned
// actions
@@ -1047,7 +1055,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// PTX support for 16-bit CAS is emulated. Only use 32+
setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
- setMaxAtomicSizeInBitsSupported(64);
+ setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
setMaxDivRemBitWidthSupported(64);
// Custom lowering for tcgen05.ld vector operands
@@ -1080,6 +1088,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
case NVPTXISD::FIRST_NUMBER:
break;
+ MAKE_CASE(NVPTXISD::ATOMIC_CMP_SWAP_B128)
+ MAKE_CASE(NVPTXISD::ATOMIC_SWAP_B128)
MAKE_CASE(NVPTXISD::RET_GLUE)
MAKE_CASE(NVPTXISD::DeclareArrayParam)
MAKE_CASE(NVPTXISD::DeclareScalarParam)
@@ -3088,29 +3098,112 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV));
}
-static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results,
- const NVPTXSubtarget &STI);
+/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
+static std::optional<std::pair<SDValue, SDValue>>
+replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ const EVT ResVT = LD->getValueType(0);
+ const EVT MemVT = LD->getMemoryVT();
-SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- return LowerLOADi1(Op, DAG);
+ // If we're doing sign/zero extension as part of the load, avoid lowering to
+ // a LoadV node. TODO: consider relaxing this restriction.
+ if (ResVT != MemVT)
+ return std::nullopt;
- EVT VT = Op.getValueType();
+ const auto NumEltsAndEltVT =
+ getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
+ if (!NumEltsAndEltVT)
+ return std::nullopt;
+ const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
+
+ Align Alignment = LD->getAlign();
+ const auto &TD = DAG.getDataLayout();
+ Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
+ if (Alignment < PrefAlign) {
+ // This load is not sufficiently aligned, so bail out and let this vector
+ // load be scalarized. Note that we may still be able to emit smaller
+ // vector loads. For example, if we are loading a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return std::nullopt;
+ }
+
+ // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+ // loaded type to i16 and propagate the "real" type as the memory type.
+ const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
+
+ unsigned Opcode;
+ switch (NumElts) {
+ default:
+ return std::nullopt;
+ case 2:
+ Opcode = NVPTXISD::LoadV2;
+ break;
+ case 4:
+ Opcode = NVPTXISD::LoadV4;
+ break;
+ case 8:
+ Opcode = NVPTXISD::LoadV8;
+ break;
+ }
+ auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
+ ListVTs.push_back(MVT::Other);
+ SDVTList LdResVTs = DAG.getVTList(ListVTs);
- if (NVPTX::isPackedVectorTy(VT)) {
- // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
- // handle unaligned loads and have to handle it here.
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- MemVT, *Load->getMemOperand())) {
- SDValue Ops[2];
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
- return DAG.getMergeValues(Ops, SDLoc(Op));
+ SDLoc DL(LD);
+
+ // Copy regular operands
+ SmallVector<SDValue, 8> OtherOps(LD->ops());
+
+ // The select routine does not have access to the LoadSDNode instance, so
+ // pass along the extension information
+ OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+
+ SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
+ LD->getMemOperand());
+
+ SmallVector<SDValue> ScalarRes;
+ if (EltVT.isVector()) {
+ assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
+ assert(NumElts * EltVT.getVectorNumElements() ==
+ ResVT.getVectorNumElements());
+ // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+ // into individual elements.
+ for (const unsigned I : llvm::seq(NumElts)) {
+ SDValue SubVector = NewLD.getValue(I);
+ DAG.ExtractVectorElements(SubVector, ScalarRes);
+ }
+ } else {
+ for (const unsigned I : llvm::seq(NumElts)) {
+ SDValue Res = NewLD.getValue(I);
+ if (LoadEltVT != EltVT)
+ Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
+ ScalarRes.push_back(Res);
}
}
+ SDValue LoadChain = NewLD.getValue(NumElts);
+
+ const MVT BuildVecVT =
+ MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
+ SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
+ SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
+
+ return {{LoadValue, LoadChain}};
+}
+
+static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results,
+ const NVPTXSubtarget &STI) {
+ if (auto Res = replaceLoadVector(N, DAG, STI))
+ Results.append({Res->first, Res->second});
+}
+
+static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG,
+ const NVPTXSubtarget &STI) {
+ if (auto Res = replaceLoadVector(N, DAG, STI))
+ return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
return SDValue();
}
@@ -3118,13 +3211,10 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
-SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- LoadSDNode *LD = cast<LoadSDNode>(Node);
- SDLoc dl(Node);
+static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
+ SDLoc dl(LD);
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
- assert(Node->getValueType(0) == MVT::i1 &&
- "Custom lowering for i1 load only");
+ assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
LD->getBasePtr(), LD->getPointerInfo(),
MVT::i8, LD->getAlign(),
@@ -3133,35 +3223,31 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
// The legalizer (the caller) is expecting two values from the legalized
// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
// in LegalizeDAG.cpp which also uses MergeValues.
- SDValue Ops[] = { result, LD->getChain() };
- return DAG.getMergeValues(Ops, dl);
+ return DAG.getMergeValues({result, LD->getChain()}, dl);
}
-SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT VT = Store->getMemoryVT();
-
- if (VT == MVT::i1)
- return LowerSTOREi1(Op, DAG);
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
- // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
- // handle unaligned stores and have to handle it here.
- if (NVPTX::isPackedVectorTy(VT) &&
- !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand()))
- return expandUnalignedStore(Store, DAG);
+ if (Op.getValueType() == MVT::i1)
+ return lowerLOADi1(LD, DAG);
- // v2f16/v2bf16/v2i16 don't need special handling.
- if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector())
- return SDValue();
+ // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+ // how they'll be lowered in ISel anyway, and by doing this a little earlier
+ // we allow for more DAG combine opportunities.
+ if (LD->getExtensionType() == ISD::EXTLOAD) {
+ assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
+ "Unexpected fpext-load");
+ return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
+ LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
+ LD->getMemOperand());
+ }
- // Lower store of any other vector type, including v2f32 as we want to break
- // it apart since this is not a widely-supported type.
- return LowerSTOREVector(Op, DAG);
+ llvm_unreachable("Unexpected custom lowering for load");
}
-SDValue
-NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG,
+ const NVPTXSubtarget &STI) {
MemSDNode *N = cast<MemSDNode>(Op.getNode());
SDValue Val = N->getOperand(1);
SDLoc DL(N);
@@ -3253,6 +3339,18 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
return NewSt;
}
+SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ EVT VT = Store->getMemoryVT();
+
+ if (VT == MVT::i1)
+ return LowerSTOREi1(Op, DAG);
+
+ // Lower store of any other vector type, including v2f32 as we want to break
+ // it apart since this is not a widely-supported type.
+ return lowerSTOREVector(Op, DAG, STI);
+}
+
// st i1 v, addr
// =>
// v1 = zxt v to i16
@@ -4010,14 +4108,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
- auto &DL = I.getDataLayout();
Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
- Info.memVT = getValueType(DL, I.getType());
- else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
+ Info.memVT = getValueType(I.getDataLayout(), I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
@@ -5152,11 +5244,34 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
ST->getMemoryVT(), ST->getMemOperand());
}
-static SDValue PerformStoreCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &STI) {
+
+ if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
+ // Here is our chance to custom lower a store with a non-simple type.
+ // Unfortunately, we can't do this in the legalizer because there is no
+ // way to setOperationAction for a non-simple type.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (!ST->getValue().getValueType().isSimple())
+ return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
+ }
+
return combinePackingMovIntoStore(N, DCI, 1, 2);
}
+static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &STI) {
+ if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
+ // Here is our chance to custom lower a load with a non-simple type.
+ // Unfortunately, we can't do this in the legalizer because there is no
+ // way to setOperationAction for a non-simple type.
+ if (!N->getValueType(0).isSimple())
+ return lowerLoadVector(N, DCI.DAG, STI);
+ }
+
+ return combineUnpackingMovIntoLoad(N, DCI);
+}
+
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
@@ -5884,7 +5999,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::LOAD:
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
- return combineUnpackingMovIntoLoad(N, DCI);
+ return combineLOAD(N, DCI, STI);
case ISD::MUL:
return PerformMULCombine(N, DCI, OptLevel);
case NVPTXISD::PRMT:
@@ -5901,7 +6016,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::STORE:
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
- return PerformStoreCombine(N, DCI);
+ return combineSTORE(N, DCI, STI);
case ISD::VSELECT:
return PerformVSELECTCombine(N, DCI);
}
@@ -5930,103 +6045,6 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
}
-/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
-static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results,
- const NVPTXSubtarget &STI) {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- const EVT ResVT = LD->getValueType(0);
- const EVT MemVT = LD->getMemoryVT();
-
- // If we're doing sign/zero extension as part of the load, avoid lowering to
- // a LoadV node. TODO: consider relaxing this restriction.
- if (ResVT != MemVT)
- return;
-
- const auto NumEltsAndEltVT =
- getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
- if (!NumEltsAndEltVT)
- return;
- const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
-
- Align Alignment = LD->getAlign();
- const auto &TD = DAG.getDataLayout();
- Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
- if (Alignment < PrefAlign) {
- // This load is not sufficiently aligned, so bail out and let this vector
- // load be scalarized. Note that we may still be able to emit smaller
- // vector loads. For example, if we are loading a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return;
- }
-
- // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
-
- unsigned Opcode;
- switch (NumElts) {
- default:
- return;
- case 2:
- Opcode = NVPTXISD::LoadV2;
- break;
- case 4:
- Opcode = NVPTXISD::LoadV4;
- break;
- case 8:
- Opcode = NVPTXISD::LoadV8;
- break;
- }
- auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
- ListVTs.push_back(MVT::Other);
- SDVTList LdResVTs = DAG.getVTList(ListVTs);
-
- SDLoc DL(LD);
-
- // Copy regular operands
- SmallVector<SDValue, 8> OtherOps(LD->ops());
-
- // The select routine does not have access to the LoadSDNode instance, so
- // pass along the extension information
- OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
-
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- LD->getMemoryVT(),
- LD->getMemOperand());
-
- SmallVector<SDValue> ScalarRes;
- if (EltVT.isVector()) {
- assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
- assert(NumElts * EltVT.getVectorNumElements() ==
- ResVT.getVectorNumElements());
- // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
- // into individual elements.
- for (const unsigned I : llvm::seq(NumElts)) {
- SDValue SubVector = NewLD.getValue(I);
- DAG.ExtractVectorElements(SubVector, ScalarRes);
- }
- } else {
- for (const unsigned I : llvm::seq(NumElts)) {
- SDValue Res = NewLD.getValue(I);
- if (LoadEltVT != EltVT)
- Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
- ScalarRes.push_back(Res);
- }
- }
-
- SDValue LoadChain = NewLD.getValue(NumElts);
-
- const MVT BuildVecVT =
- MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
- SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
- SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
-
- Results.append({LoadValue, LoadChain});
-}
-
// Lower vector return type of tcgen05.ld intrinsics
static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results,
@@ -6262,6 +6280,49 @@ static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
Results.push_back(Res);
}
+static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
+ const NVPTXSubtarget &STI,
+ SmallVectorImpl<SDValue> &Results) {
+ assert(N->getValueType(0) == MVT::i128 &&
+ "Custom lowering for atomic128 only supports i128");
+
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ SDLoc dl(N);
+
+ if (!STI.hasAtomSwap128()) {
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ DAG.getMachineFunction().getFunction(),
+ "Support for b128 atomics introduced in PTX ISA version 8.3 and "
+ "requires target sm_90.",
+ dl.getDebugLoc()));
+
+ Results.push_back(DAG.getUNDEF(MVT::i128));
+ Results.push_back(AN->getOperand(0)); // Chain
+ return;
+ }
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(AN->getOperand(0)); // Chain
+ Ops.push_back(AN->getOperand(1)); // Ptr
+ for (const auto &Op : AN->ops().drop_front(2)) {
+ // Low part
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
+ DAG.getIntPtrConstant(0, dl)));
+ // High part
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
+ DAG.getIntPtrConstant(1, dl)));
+ }
+ unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
+ ? NVPTXISD::ATOMIC_SWAP_B128
+ : NVPTXISD::ATOMIC_CMP_SWAP_B128;
+ SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
+ AN->getMemOperand());
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
+ {Result.getValue(0), Result.getValue(1)}));
+ Results.push_back(Result.getValue(2));
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -6282,6 +6343,10 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case NVPTXISD::ProxyReg:
replaceProxyReg(N, DAG, *this, Results);
return;
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_SWAP:
+ replaceAtomicSwap128(N, DAG, STI, Results);
+ return;
}
}
@@ -6306,16 +6371,19 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
assert(Ty->isIntegerTy() && "Ty should be integer at this point");
- auto ITy = cast<llvm::IntegerType>(Ty);
+ const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
switch (AI->getOperation()) {
default:
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::BinOp::Xchg:
+ if (BitWidth == 128)
+ return AtomicExpansionKind::None;
+ LLVM_FALLTHROUGH;
case AtomicRMWInst::BinOp::And:
case AtomicRMWInst::BinOp::Or:
case AtomicRMWInst::BinOp::Xor:
- case AtomicRMWInst::BinOp::Xchg:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 8:
case 16:
return AtomicExpansionKind::CmpXChg;
@@ -6325,6 +6393,8 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (STI.hasAtomBitwise64())
return AtomicExpansionKind::None;
return AtomicExpansionKind::CmpXChg;
+ case 128:
+ return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");
}
@@ -6334,7 +6404,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::BinOp::Min:
case AtomicRMWInst::BinOp::UMax:
case AtomicRMWInst::BinOp::UMin:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 8:
case 16:
return AtomicExpansionKind::CmpXChg;
@@ -6344,17 +6414,20 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (STI.hasAtomMinMax64())
return AtomicExpansionKind::None;
return AtomicExpansionKind::CmpXChg;
+ case 128:
+ return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");
}
case AtomicRMWInst::BinOp::UIncWrap:
case AtomicRMWInst::BinOp::UDecWrap:
- switch (ITy->getBitWidth()) {
+ switch (BitWidth) {
case 32:
return AtomicExpansionKind::None;
case 8:
case 16:
case 64:
+ case 128:
return AtomicExpansionKind::CmpXChg;
default:
llvm_unreachable("unsupported width encountered");