diff options
Diffstat (limited to 'llvm/lib/Target')
64 files changed, 1951 insertions, 573 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index b51c056e9d53..040a47f2f6d7 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -113,6 +113,8 @@ public: void emitFunctionEntryLabel() override; + void emitXXStructor(const DataLayout &DL, const Constant *CV) override; + void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI); void LowerHardenedBRJumpTable(const MachineInstr &MI); @@ -1280,6 +1282,23 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { } } +void AArch64AsmPrinter::emitXXStructor(const DataLayout &DL, + const Constant *CV) { + if (const auto *CPA = dyn_cast<ConstantPtrAuth>(CV)) + if (CPA->hasAddressDiscriminator() && + !CPA->hasSpecialAddressDiscriminator( + ConstantPtrAuth::AddrDiscriminator_CtorsDtors)) + report_fatal_error( + "unexpected address discrimination value for ctors/dtors entry, only " + "'ptr inttoptr (i64 1 to ptr)' is allowed"); + // If we have signed pointers in xxstructors list, they'll be lowered to @AUTH + // MCExpr's via AArch64AsmPrinter::lowerConstantPtrAuth. It does not look at + // actual address discrimination value and only checks + // hasAddressDiscriminator(), so it's OK to leave special address + // discrimination value here. 
+ AsmPrinter::emitXXStructor(DL, CV); +} + void AArch64AsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) { if (auto F = dyn_cast_or_null<Function>(GA.getAliasee())) { @@ -2142,6 +2161,10 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) { }; const bool IsGOTLoad = MI.getOpcode() == AArch64::LOADgotPAC; + const bool IsELFSignedGOT = MI.getParent() + ->getParent() + ->getInfo<AArch64FunctionInfo>() + ->hasELFSignedGOT(); MachineOperand GAOp = MI.getOperand(0); const uint64_t KeyC = MI.getOperand(1).getImm(); assert(KeyC <= AArch64PACKey::LAST && @@ -2158,9 +2181,16 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) { // Emit: // target materialization: // - via GOT: - // adrp x16, :got:target - // ldr x16, [x16, :got_lo12:target] - // add offset to x16 if offset != 0 + // - unsigned GOT: + // adrp x16, :got:target + // ldr x16, [x16, :got_lo12:target] + // add offset to x16 if offset != 0 + // - ELF signed GOT: + // adrp x17, :got:target + // add x17, x17, :got_auth_lo12:target + // ldr x16, [x17] + // aut{i|d}a x16, x17 + // add offset to x16 if offset != 0 // // - direct: // adrp x16, target @@ -2203,13 +2233,40 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) { MCInstLowering.lowerOperand(GAMOLo, GAMCLo); EmitAndIncrement( - MCInstBuilder(AArch64::ADRP).addReg(AArch64::X16).addOperand(GAMCHi)); + MCInstBuilder(AArch64::ADRP) + .addReg(IsGOTLoad && IsELFSignedGOT ? 
AArch64::X17 : AArch64::X16) + .addOperand(GAMCHi)); if (IsGOTLoad) { - EmitAndIncrement(MCInstBuilder(AArch64::LDRXui) - .addReg(AArch64::X16) - .addReg(AArch64::X16) - .addOperand(GAMCLo)); + if (IsELFSignedGOT) { + EmitAndIncrement(MCInstBuilder(AArch64::ADDXri) + .addReg(AArch64::X17) + .addReg(AArch64::X17) + .addOperand(GAMCLo) + .addImm(0)); + + EmitAndIncrement(MCInstBuilder(AArch64::LDRXui) + .addReg(AArch64::X16) + .addReg(AArch64::X17) + .addImm(0)); + + assert(GAOp.isGlobal()); + assert(GAOp.getGlobal()->getValueType() != nullptr); + unsigned AuthOpcode = GAOp.getGlobal()->getValueType()->isFunctionTy() + ? AArch64::AUTIA + : AArch64::AUTDA; + + EmitAndIncrement(MCInstBuilder(AuthOpcode) + .addReg(AArch64::X16) + .addReg(AArch64::X16) + .addReg(AArch64::X17)); + + } else { + EmitAndIncrement(MCInstBuilder(AArch64::LDRXui) + .addReg(AArch64::X16) + .addReg(AArch64::X16) + .addOperand(GAMCLo)); + } } else { EmitAndIncrement(MCInstBuilder(AArch64::ADDXri) .addReg(AArch64::X16) diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 2bc14f9821e6..161cf24dd403 100644 --- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -108,6 +108,10 @@ static bool atomicReadDroppedOnZero(unsigned Opcode) { case AArch64::LDUMINW: case AArch64::LDUMINX: case AArch64::LDUMINLB: case AArch64::LDUMINLH: case AArch64::LDUMINLW: case AArch64::LDUMINLX: + case AArch64::SWPB: case AArch64::SWPH: + case AArch64::SWPW: case AArch64::SWPX: + case AArch64::SWPLB: case AArch64::SWPLH: + case AArch64::SWPLW: case AArch64::SWPLX: return true; } return false; diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 9b7fc228d5de..72c767200b38 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1291,7 +1291,40 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case AArch64::LOADgotAUTH: { + Register DstReg = MI.getOperand(0).getReg(); + const MachineOperand &MO1 = MI.getOperand(1); + + MachineOperand GAHiOp(MO1); + MachineOperand GALoOp(MO1); + GAHiOp.addTargetFlag(AArch64II::MO_PAGE); + GALoOp.addTargetFlag(AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + DebugLoc DL = MI.getDebugLoc(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), AArch64::X16) + .add(GAHiOp); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X16) + .addReg(AArch64::X16) + .add(GALoOp) + .addImm(0); + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui), DstReg) + .addReg(AArch64::X16) + .addImm(0); + + assert(MO1.isGlobal()); + assert(MO1.getGlobal()->getValueType() != nullptr); + unsigned AuthOpcode = MO1.getGlobal()->getValueType()->isFunctionTy() + ? AArch64::AUTIA + : AArch64::AUTDA; + BuildMI(MBB, MBBI, DL, TII->get(AuthOpcode), DstReg) + .addReg(DstReg) + .addReg(AArch64::X16); + + MI.eraseFromParent(); + return true; + } case AArch64::LOADgot: { MachineFunction *MF = MBB.getParent(); Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cbf38f2c57a3..4487d34a936c 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -453,6 +453,9 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO()) return 0; + if (FuncInfo.MF->getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()) + return 0; + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 
a1ae0873fc19..2aa74deb0e85 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -186,7 +186,7 @@ def FeatureJS : ExtensionWithMArch<"jsconv", "JS", "FEAT_JSCVT", [FeatureFPARMv8]>; def FeatureFPAC : Extension<"fpac", "FPAC", "FEAT_FPAC", - "Enable v8.3-A Pointer Authentication Faulting enhancement">; + "Enable Armv8.3-A Pointer Authentication Faulting enhancement">; def FeatureCCIDX : Extension<"ccidx", "CCIDX", "FEAT_CCIDX", "Enable Armv8.3-A Extend of the CCSIDR number of sets">; @@ -435,8 +435,11 @@ def FeatureMEC : Extension<"mec", "MEC", "FEAT_MEC", def FeatureSVE2p1: ExtensionWithMArch<"sve2p1", "SVE2p1", "FEAT_SVE2p1", "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>; -def FeatureB16B16 : ExtensionWithMArch<"b16b16", "B16B16", "FEAT_SVE_B16B16", - "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions", [FeatureBF16]>; +def FeatureB16B16 : ExtensionWithMArch<"b16b16", "B16B16", "FEAT_B16B16", + "Enable SME2.1 ZA-targeting non-widening BFloat16 to BFloat16 instructions", [FeatureBF16]>; + +def FeatureSVEB16B16: ExtensionWithMArch<"sve-b16b16", "SVEB16B16", "FEAT_SVE_B16B16", + "Enable SVE2.1 non-widening and SME2.1 Z-targeting non-widening BFloat16 to BFloat16 instructions">; def FeatureSMEF16F16 : ExtensionWithMArch<"sme-f16f16", "SMEF16F16", "FEAT_SME_F16F16", "Enable SME non-widening Float16 instructions", [FeatureSME2]>; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index f28511c74dcd..bf0eb1461e55 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -240,6 +240,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ 
-275,6 +276,10 @@ cl::opt<bool> EnableHomogeneousPrologEpilog( // Stack hazard padding size. 0 = disabled. static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size", cl::init(0), cl::Hidden); +// Stack hazard size for analysis remarks. StackHazardSize takes precedence. +static cl::opt<unsigned> + StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0), + cl::Hidden); // Whether to insert padding into non-streaming functions (for testing). static cl::opt<bool> StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming", @@ -2616,9 +2621,16 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); + StackOffset SVEStackSize = getSVEStackSize(MF); + + // For VLA-area objects, just emit an offset at the end of the stack frame. + // Whilst not quite correct, these objects do live at the end of the frame and + // so it is more useful for analysis for the offset to reflect this. + if (MFI.isVariableSizedObjectIndex(FI)) { + return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize; + } // This is correct in the absence of any SVE stack objects. - StackOffset SVEStackSize = getSVEStackSize(MF); if (!SVEStackSize) return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea()); @@ -3529,13 +3541,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } -// Return the FrameID for a Load/Store instruction by looking at the MMO. -static std::optional<int> getLdStFrameID(const MachineInstr &MI, - const MachineFrameInfo &MFI) { - if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) - return std::nullopt; - - MachineMemOperand *MMO = *MI.memoperands_begin(); +// Return the FrameID for a MMO. 
+static std::optional<int> getMMOFrameID(MachineMemOperand *MMO, + const MachineFrameInfo &MFI) { auto *PSV = dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue()); if (PSV) @@ -3553,6 +3561,15 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI, return std::nullopt; } +// Return the FrameID for a Load/Store instruction by looking at the first MMO. +static std::optional<int> getLdStFrameID(const MachineInstr &MI, + const MachineFrameInfo &MFI) { + if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) + return std::nullopt; + + return getMMOFrameID(*MI.memoperands_begin(), MFI); +} + // Check if a Hazard slot is needed for the current function, and if so create // one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, // which can be used to determine if any hazard padding is needed. @@ -5030,3 +5047,174 @@ void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF, MI->eraseFromParent(); } } + +struct StackAccess { + enum AccessType { + NotAccessed = 0, // Stack object not accessed by load/store instructions. + GPR = 1 << 0, // A general purpose register. + PPR = 1 << 1, // A predicate register. + FPR = 1 << 2, // A floating point/Neon/SVE register. + }; + + int Idx; + StackOffset Offset; + int64_t Size; + unsigned AccessTypes; + + StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {} + + bool operator<(const StackAccess &Rhs) const { + return std::make_tuple(start(), Idx) < + std::make_tuple(Rhs.start(), Rhs.Idx); + } + + bool isCPU() const { + // Predicate register load and store instructions execute on the CPU. 
+ return AccessTypes & (AccessType::GPR | AccessType::PPR); + } + bool isSME() const { return AccessTypes & AccessType::FPR; } + bool isMixed() const { return isCPU() && isSME(); } + + int64_t start() const { return Offset.getFixed() + Offset.getScalable(); } + int64_t end() const { return start() + Size; } + + std::string getTypeString() const { + switch (AccessTypes) { + case AccessType::FPR: + return "FPR"; + case AccessType::PPR: + return "PPR"; + case AccessType::GPR: + return "GPR"; + case AccessType::NotAccessed: + return "NA"; + default: + return "Mixed"; + } + } + + void print(raw_ostream &OS) const { + OS << getTypeString() << " stack object at [SP" + << (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed(); + if (Offset.getScalable()) + OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable() + << " * vscale"; + OS << "]"; + } +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) { + SA.print(OS); + return OS; +} + +void AArch64FrameLowering::emitRemarks( + const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const { + + SMEAttrs Attrs(MF.getFunction()); + if (Attrs.hasNonStreamingInterfaceAndBody()) + return; + + const uint64_t HazardSize = + (StackHazardSize) ? StackHazardSize : StackHazardRemarkSize; + + if (HazardSize == 0) + return; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + // Bail if function has no stack objects. + if (!MFI.hasStackObjects()) + return; + + std::vector<StackAccess> StackAccesses(MFI.getNumObjects()); + + size_t NumFPLdSt = 0; + size_t NumNonFPLdSt = 0; + + // Collect stack accesses via Load/Store instructions. 
+ for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) + continue; + for (MachineMemOperand *MMO : MI.memoperands()) { + std::optional<int> FI = getMMOFrameID(MMO, MFI); + if (FI && !MFI.isDeadObjectIndex(*FI)) { + int FrameIdx = *FI; + + size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects(); + if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) { + StackAccesses[ArrIdx].Idx = FrameIdx; + StackAccesses[ArrIdx].Offset = + getFrameIndexReferenceFromSP(MF, FrameIdx); + StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx); + } + + unsigned RegTy = StackAccess::AccessType::GPR; + if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) { + if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) + RegTy = StackAccess::PPR; + else + RegTy = StackAccess::FPR; + } else if (AArch64InstrInfo::isFpOrNEON(MI)) { + RegTy = StackAccess::FPR; + } + + StackAccesses[ArrIdx].AccessTypes |= RegTy; + + if (RegTy == StackAccess::FPR) + ++NumFPLdSt; + else + ++NumNonFPLdSt; + } + } + } + } + + if (NumFPLdSt == 0 || NumNonFPLdSt == 0) + return; + + llvm::sort(StackAccesses); + StackAccesses.erase(llvm::remove_if(StackAccesses, + [](const StackAccess &S) { + return S.AccessTypes == + StackAccess::NotAccessed; + }), + StackAccesses.end()); + + SmallVector<const StackAccess *> MixedObjects; + SmallVector<std::pair<const StackAccess *, const StackAccess *>> HazardPairs; + + if (StackAccesses.front().isMixed()) + MixedObjects.push_back(&StackAccesses.front()); + + for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end()); + It != End; ++It) { + const auto &First = *It; + const auto &Second = *(It + 1); + + if (Second.isMixed()) + MixedObjects.push_back(&Second); + + if ((First.isSME() && Second.isCPU()) || + (First.isCPU() && Second.isSME())) { + uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end()); + if (Distance < HazardSize) + 
HazardPairs.emplace_back(&First, &Second); + } + } + + auto EmitRemark = [&](llvm::StringRef Str) { + ORE->emit([&]() { + auto R = MachineOptimizationRemarkAnalysis( + "sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front()); + return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str; + }); + }; + + for (const auto &P : HazardPairs) + EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str()); + + for (const auto *Obj : MixedObjects) + EmitRemark( + formatv("{0} accessed by both GP and FP instructions", *Obj).str()); +} diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 0ebab1700e9c..c19731249620 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -13,8 +13,9 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H -#include "llvm/Support/TypeSize.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/Support/TypeSize.h" namespace llvm { @@ -178,6 +179,9 @@ private: inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, int64_t NegProbeSize, Register TargetReg) const; + + void emitRemarks(const MachineFunction &MF, + MachineOptimizationRemarkEmitter *ORE) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7704321a0fc3..94130736c398 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9225,6 +9225,11 @@ SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes instead of using a wrapper node. 
+ if (DAG.getMachineFunction() + .getInfo<AArch64FunctionInfo>() + ->hasELFSignedGOT()) + return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr), + 0); return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); } @@ -21769,6 +21774,7 @@ static SDValue performExtendCombine(SDNode *N, // helps the backend to decide that an sabdl2 would be useful, saving a real // extract_high operation. if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && + N->getOperand(0).getValueType().is64BitVector() && (N->getOperand(0).getOpcode() == ISD::ABDU || N->getOperand(0).getOpcode() == ISD::ABDS)) { SDNode *ABDNode = N->getOperand(0).getNode(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index e720c6b21a97..1e5c5e2657e6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -143,6 +143,8 @@ def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, "fuse-aes">; def HasSVE : Predicate<"Subtarget->isSVEAvailable()">, AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">; +def HasSVEB16B16 : Predicate<"Subtarget->hasSVEB16B16()">, + AssemblerPredicateWithAll<(all_of FeatureSVEB16B16), "sve-b16b16">; def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2()">, AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">; def HasSVE2p1 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">, @@ -1872,7 +1874,7 @@ let Predicates = [HasPAuth] in { Sched<[WriteI, ReadI]> { let isReMaterializable = 1; let isCodeGenOnly = 1; - let Size = 40; // 12 fixed + 28 variable, for pointer offset, and discriminator + let Size = 48; // 12 fixed + 36 variable, for pointer offset, and discriminator let Defs = [X16,X17]; } @@ -1911,6 +1913,11 @@ let Predicates = [HasPAuth] in { tcGPR64:$AddrDisc), (AUTH_TCRETURN_BTI tcGPRx16x17:$dst, imm:$FPDiff, imm:$Key, imm:$Disc, tcGPR64:$AddrDisc)>; + + def LOADgotAUTH : Pseudo<(outs 
GPR64common:$dst), (ins i64imm:$addr), []>, + Sched<[WriteI, ReadI]> { + let Defs = [X16]; + } } // v9.5-A pointer authentication extensions diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 48672241f905..9f234b0f9170 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AArch64MCInstLower.h" +#include "AArch64MachineFunctionInfo.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/CodeGen/AsmPrinter.h" @@ -185,9 +186,12 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, MCSymbol *Sym) const { uint32_t RefFlags = 0; - if (MO.getTargetFlags() & AArch64II::MO_GOT) - RefFlags |= AArch64MCExpr::VK_GOT; - else if (MO.getTargetFlags() & AArch64II::MO_TLS) { + if (MO.getTargetFlags() & AArch64II::MO_GOT) { + const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + RefFlags |= (MF->getInfo<AArch64FunctionInfo>()->hasELFSignedGOT() + ? 
AArch64MCExpr::VK_GOT_AUTH + : AArch64MCExpr::VK_GOT); + } else if (MO.getTargetFlags() & AArch64II::MO_TLS) { TLSModel::Model Model; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index e96c5a953ff2..a0f0a489816c 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -16,6 +16,7 @@ #include "AArch64MachineFunctionInfo.h" #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -72,6 +73,29 @@ static bool ShouldSignWithBKey(const Function &F, const AArch64Subtarget &STI) { return Key == "b_key"; } +// Determine if we need to treat pointers in GOT as signed (as described in +// https://github.com/ARM-software/abi-aa/blob/main/pauthabielf64/pauthabielf64.rst#appendix-signed-got) +// based on PAuth core info encoded as "aarch64-elf-pauthabi-platform" and +// "aarch64-elf-pauthabi-version" module flags. Currently, only +// AARCH64_PAUTH_PLATFORM_LLVM_LINUX platform supports signed GOT with +// AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_GOT bit in version value set. 
+static bool hasELFSignedGOTHelper(const Function &F, + const AArch64Subtarget *STI) { + if (!Triple(STI->getTargetTriple()).isOSBinFormatELF()) + return false; + const Module *M = F.getParent(); + const auto *PAP = mdconst::extract_or_null<ConstantInt>( + M->getModuleFlag("aarch64-elf-pauthabi-platform")); + if (!PAP || PAP->getZExtValue() != ELF::AARCH64_PAUTH_PLATFORM_LLVM_LINUX) + return false; + const auto *PAV = mdconst::extract_or_null<ConstantInt>( + M->getModuleFlag("aarch64-elf-pauthabi-version")); + if (!PAV) + return false; + return PAV->getZExtValue() & + (1 << ELF::AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_GOT); +} + AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI) { // If we already know that the function doesn't have a redzone, set @@ -80,6 +104,7 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, HasRedZone = false; std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F); SignWithBKey = ShouldSignWithBKey(F, *STI); + HasELFSignedGOT = hasELFSignedGOTHelper(F, STI); // TODO: skip functions that have no instrumented allocas for optimization IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag); diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 72f110cebbdc..9ae458488343 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -177,6 +177,14 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SignWithBKey modifies the default PAC-RET mode to signing with the B key. 
bool SignWithBKey = false; + /// HasELFSignedGOT is true if the target binary format is ELF and the IR + /// module containing the corresponding function has the following flags: + /// - aarch64-elf-pauthabi-platform flag equal to + /// AARCH64_PAUTH_PLATFORM_LLVM_LINUX; + /// - aarch64-elf-pauthabi-version flag with + /// AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_GOT bit set. + bool HasELFSignedGOT = false; + /// SigningInstrOffset captures the offset of the PAC-RET signing instruction /// within the prologue, so it can be re-used for authentication in the /// epilogue when using PC as a second salt (FEAT_PAuth_LR) @@ -509,6 +517,8 @@ public: bool shouldSignWithBKey() const { return SignWithBKey; } + bool hasELFSignedGOT() const { return HasELFSignedGOT; } + MCSymbol *getSigningInstrLabel() const { return SignInstrLabel; } void setSigningInstrLabel(MCSymbol *Label) { SignInstrLabel = Label; } diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 71384a23c49a..52b5c8a0903e 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -863,22 +863,25 @@ def ProcessorFeatures { list<SubtargetFeature> AppleA15 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, FeatureFullFP16, FeatureFP16FML, - FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE, - FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureRDM, + FeatureComplxNum, FeatureCRC, FeatureJS, + FeatureLSE, FeaturePAuth, FeatureFPAC, + FeatureRAS, FeatureRCPC, FeatureRDM, FeatureBF16, FeatureDotProd, FeatureMatMulInt8]; list<SubtargetFeature> AppleA16 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, FeatureFullFP16, FeatureFP16FML, FeatureHCX, - FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE, - FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureRDM, + FeatureComplxNum, FeatureCRC, FeatureJS, + FeatureLSE, FeaturePAuth, 
FeatureFPAC, + FeatureRAS, FeatureRCPC, FeatureRDM, FeatureBF16, FeatureDotProd, FeatureMatMulInt8]; list<SubtargetFeature> AppleA17 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, FeatureFullFP16, FeatureFP16FML, FeatureHCX, - FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE, - FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureRDM, + FeatureComplxNum, FeatureCRC, FeatureJS, + FeatureLSE, FeaturePAuth, FeatureFPAC, + FeatureRAS, FeatureRCPC, FeatureRDM, FeatureBF16, FeatureDotProd, FeatureMatMulInt8]; list<SubtargetFeature> AppleM4 = [HasV9_2aOps, FeatureSHA2, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, @@ -886,8 +889,9 @@ def ProcessorFeatures { FeatureAES, FeatureBF16, FeatureSME, FeatureSME2, FeatureSMEF64F64, FeatureSMEI16I64, - FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE, - FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureRDM, + FeatureComplxNum, FeatureCRC, FeatureJS, + FeatureLSE, FeaturePAuth, FeatureFPAC, + FeatureRAS, FeatureRCPC, FeatureRDM, FeatureDotProd, FeatureMatMulInt8]; list<SubtargetFeature> ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeaturePerfMon, FeatureNEON, FeatureFPARMv8]; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 709a98d3a8cb..22de9e1458b7 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -857,6 +857,7 @@ defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, nxv8f16, int_aarch defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, nxv8f16, int_aarch64_sme_mops>; } +// SME2 ZA-targeting non-widening BFloat16 instructions let Predicates = [HasSME2, HasB16B16] in { defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_add_za16_vg1x2>; defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, 
nxv8bf16, int_aarch64_sme_add_za16_vg1x4>; @@ -877,6 +878,12 @@ defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg4_single<"bfmls", 0b1111101, defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x2>; defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x4>; +defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, nxv8bf16, int_aarch64_sme_mopa>; +defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, nxv8bf16, int_aarch64_sme_mops>; +} + +// SME2 Z-targeting non-widening BFloat16 instructions +let Predicates = [HasSME2, HasSVEB16B16] in { defm BFMAX_VG2_2ZZ : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>; defm BFMAX_VG4_4ZZ : sme2p1_bf_max_min_vector_vg4_single<"bfmax", 0b0010000>; defm BFMAX_VG2_2Z2Z : sme2p1_bf_max_min_vector_vg2_multi<"bfmax", 0b0010000>; @@ -899,9 +906,6 @@ defm BFMINNM_VG4_4Z2Z : sme2p1_bf_max_min_vector_vg4_multi<"bfminnm", 0b0010011 defm BFCLAMP_VG2_2ZZZ: sme2p1_bfclamp_vector_vg2_multi<"bfclamp">; defm BFCLAMP_VG4_4ZZZ: sme2p1_bfclamp_vector_vg4_multi<"bfclamp">; - -defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, nxv8bf16, int_aarch64_sme_mopa>; -defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, nxv8bf16, int_aarch64_sme_mops>; } let Predicates = [HasSME2, HasFP8] in { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 19c03011e07b..d9a70b5ef02f 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -279,6 +279,10 @@ def AArch64fsub_m1 : PatFrags<(ops node:$pg, node:$op1, node:$op2), [ (int_aarch64_sve_fsub node:$pg, node:$op1, node:$op2), (vselect node:$pg, (AArch64fsub_p (SVEAllActive), node:$op1, node:$op2), node:$op1) ]>; +def AArch64fsubr_m1 : PatFrags<(ops node:$pg, node:$op1, 
node:$op2), [ + (int_aarch64_sve_fsubr node:$pg, node:$op1, node:$op2), + (vselect node:$pg, (AArch64fsub_p (SVEAllActive), node:$op2, node:$op1), node:$op1) +]>; def AArch64shadd : PatFrags<(ops node:$pg, node:$op1, node:$op2), [(int_aarch64_sve_shadd node:$pg, node:$op1, node:$op2), @@ -423,6 +427,11 @@ def AArch64bic : PatFrags<(ops node:$op1, node:$op2), def AArch64subr : PatFrag<(ops node:$op1, node:$op2), (sub node:$op2, node:$op1)>; + +def AArch64subr_m1 : PatFrags<(ops node:$pg, node:$op1, node:$op2), + [(int_aarch64_sve_subr node:$pg, node:$op1, node:$op2), + (vselect node:$pg, (sub node:$op2, node:$op1), node:$op1)]>; + def AArch64mla_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), [(int_aarch64_sve_mla node:$pred, node:$op1, node:$op2, node:$op3), (vselect node:$pred, (add node:$op1, (AArch64mul_p_oneuse (SVEAllActive), node:$op2, node:$op3)), node:$op1)]>; @@ -529,7 +538,7 @@ let Predicates = [HasSVEorSME] in { defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", AArch64add_m1, DestructiveBinaryComm>; defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", AArch64sub_m1, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; - defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; + defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", AArch64subr_m1, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", "ORR_ZPZZ", AArch64orr_m1, DestructiveBinaryComm>; defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", "EOR_ZPZZ", AArch64eor_m1, DestructiveBinaryComm>; @@ -685,7 +694,7 @@ let Predicates = [HasSVEorSME] in { defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>; defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">; defm FMUL_ZPmZ : 
sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>; - defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>; + defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", AArch64fsubr_m1, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>; defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", AArch64fmaxnm_m1, DestructiveBinaryComm>; defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", AArch64fminnm_m1, DestructiveBinaryComm>; defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", AArch64fmax_m1, DestructiveBinaryComm>; @@ -4101,7 +4110,7 @@ def : InstAlias<"pfalse\t$Pd", (PFALSE PPRorPNR8:$Pd), 0>; // Non-widening BFloat16 to BFloat16 instructions //===----------------------------------------------------------------------===// -let Predicates = [HasSVE2orSME2, HasB16B16, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVE2orSME2, HasSVEB16B16, UseExperimentalZeroingPseudos] in { defm BFADD_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fadd>; defm BFSUB_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fsub>; defm BFMUL_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmul>; @@ -4109,9 +4118,9 @@ defm BFMAXNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmaxnm>; defm BFMINNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fminnm>; defm BFMIN_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmin>; defm BFMAX_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmax>; -} // HasSVE2orSME2, HasB16B16, UseExperimentalZeroingPseudos +} // HasSVE2orSME2, HasSVEB16B16, UseExperimentalZeroingPseudos -let Predicates = [HasSVE2orSME2, HasB16B16] in { +let Predicates = [HasSVE2orSME2, HasSVEB16B16] in { defm BFMLA_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b00, "bfmla", "BFMLA_ZPZZZ", AArch64fmla_m1>; defm BFMLS_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b01, "bfmls", "BFMLS_ZPZZZ", 
AArch64fmls_m1>; @@ -4151,7 +4160,7 @@ defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>; defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>; defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64fclamp>; -} // End HasSVE2orSME2, HasB16B16 +} // End HasSVE2orSME2, HasSVEB16B16 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 5e17ed40df8a..8a93b7fc4c89 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -875,6 +875,7 @@ public: if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || ELFRefKind == AArch64MCExpr::VK_LO12 || ELFRefKind == AArch64MCExpr::VK_GOT_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || @@ -986,19 +987,20 @@ public: int64_t Addend; if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF - || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF - || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) - || ELFRefKind == AArch64MCExpr::VK_LO12 - || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 - || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 - || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC - || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 - || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 - || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC - || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 - || ELFRefKind == AArch64MCExpr::VK_SECREL_HI12 - || ELFRefKind == AArch64MCExpr::VK_SECREL_LO12; + return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF || + (DarwinRefKind == 
MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) || + ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 || + ELFRefKind == AArch64MCExpr::VK_SECREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_SECREL_LO12; } // If it's a constant, it should be a real immediate in range. @@ -3250,6 +3252,7 @@ ParseStatus AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && ELFRefKind != AArch64MCExpr::VK_ABS_PAGE_NC && ELFRefKind != AArch64MCExpr::VK_GOT_PAGE && + ELFRefKind != AArch64MCExpr::VK_GOT_AUTH_PAGE && ELFRefKind != AArch64MCExpr::VK_GOT_PAGE_LO15 && ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE && ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) { @@ -3674,6 +3677,7 @@ static const struct Extension { {"rcpc", {AArch64::FeatureRCPC}}, {"rng", {AArch64::FeatureRandGen}}, {"sve", {AArch64::FeatureSVE}}, + {"sve-b16b16", {AArch64::FeatureSVEB16B16}}, {"sve2", {AArch64::FeatureSVE2}}, {"sve2-aes", {AArch64::FeatureSVE2AES}}, {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, @@ -4334,6 +4338,8 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { .Case("got", AArch64MCExpr::VK_GOT_PAGE) .Case("gotpage_lo15", AArch64MCExpr::VK_GOT_PAGE_LO15) .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12) + .Case("got_auth", AArch64MCExpr::VK_GOT_AUTH_PAGE) + .Case("got_auth_lo12", AArch64MCExpr::VK_GOT_AUTH_LO12) .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE) .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC) .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1) @@ -5708,6 +5714,7 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc 
&IDLoc, // Only allow these with ADDXri/ADDWri if ((ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index e9e6b6cb68d0..ef8fcc074170 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -2845,7 +2845,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } if (OpFlags & AArch64II::MO_GOT) { - I.setDesc(TII.get(AArch64::LOADgot)); + I.setDesc(TII.get(MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT() + ? AArch64::LOADgotAUTH + : AArch64::LOADgot)); I.getOperand(1).setTargetFlags(OpFlags); } else if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 4a1977ba1a00..afea8bdec197 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -288,10 +288,16 @@ bool matchDupFromBuildVector(int Lane, MachineInstr &MI, MachineRegisterInfo &MRI, ShuffleVectorPseudo &MatchInfo) { assert(Lane >= 0 && "Expected positive lane?"); + int NumElements = MRI.getType(MI.getOperand(1).getReg()).getNumElements(); // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the // lane's definition directly. - auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR, - MI.getOperand(1).getReg(), MRI); + auto *BuildVecMI = + getOpcodeDef(TargetOpcode::G_BUILD_VECTOR, + MI.getOperand(Lane < NumElements ? 
1 : 2).getReg(), MRI); + // If Lane >= NumElements then it is point to RHS, just check from RHS + if (NumElements <= Lane) + Lane -= NumElements; + if (!BuildVecMI) return false; Register Reg = BuildVecMI->getOperand(Lane + 1).getReg(); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index b4c5cde5fd88..72671b0715f6 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -167,6 +167,15 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) return R_CLS(ADR_GOT_PAGE); + if (SymLoc == AArch64MCExpr::VK_GOT_AUTH && !IsNC) { + if (IsILP32) { + Ctx.reportError(Fixup.getLoc(), + "ILP32 ADRP AUTH relocation not supported " + "(LP64 eqv: AUTH_ADR_GOT_PAGE)"); + return ELF::R_AARCH64_NONE; + } + return ELF::R_AARCH64_AUTH_ADR_GOT_PAGE; + } if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) return R_CLS(TLSIE_ADR_GOTTPREL_PAGE21); if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) @@ -237,6 +246,15 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(TLSLE_ADD_TPREL_LO12); if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12) return R_CLS(TLSDESC_ADD_LO12); + if (RefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 && IsNC) { + if (IsILP32) { + Ctx.reportError(Fixup.getLoc(), + "ILP32 ADD AUTH relocation not supported " + "(LP64 eqv: AUTH_GOT_ADD_LO12_NC)"); + return ELF::R_AARCH64_NONE; + } + return ELF::R_AARCH64_AUTH_GOT_ADD_LO12_NC; + } if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return R_CLS(ADD_ABS_LO12_NC); @@ -329,17 +347,23 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, case AArch64::fixup_aarch64_ldst_imm12_scale8: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return R_CLS(LDST64_ABS_LO12_NC); - if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) { + if ((SymLoc == AArch64MCExpr::VK_GOT || + SymLoc == 
AArch64MCExpr::VK_GOT_AUTH) && + IsNC) { AArch64MCExpr::VariantKind AddressLoc = AArch64MCExpr::getAddressFrag(RefKind); + bool IsAuth = (SymLoc == AArch64MCExpr::VK_GOT_AUTH); if (!IsILP32) { if (AddressLoc == AArch64MCExpr::VK_LO15) return ELF::R_AARCH64_LD64_GOTPAGE_LO15; - return ELF::R_AARCH64_LD64_GOT_LO12_NC; + return (IsAuth ? ELF::R_AARCH64_AUTH_LD64_GOT_LO12_NC + : ELF::R_AARCH64_LD64_GOT_LO12_NC); } - Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " - "relocation not supported (LP64 eqv: " - "LD64_GOT_LO12_NC)"); + Ctx.reportError(Fixup.getLoc(), + Twine("ILP32 64-bit load/store " + "relocation not supported (LP64 eqv: ") + + (IsAuth ? "AUTH_GOT_LO12_NC" : "LD64_GOT_LO12_NC") + + Twine(')')); return ELF::R_AARCH64_NONE; } if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index fb8eb9f47da1..3430b9002894 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -30,6 +30,7 @@ const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, VariantKind Kind, } StringRef AArch64MCExpr::getVariantKindName() const { + // clang-format off switch (static_cast<uint32_t>(getKind())) { case VK_CALL: return ""; case VK_LO12: return ":lo12:"; @@ -82,9 +83,13 @@ StringRef AArch64MCExpr::getVariantKindName() const { case VK_TLSDESC_PAGE: return ":tlsdesc:"; case VK_SECREL_LO12: return ":secrel_lo12:"; case VK_SECREL_HI12: return ":secrel_hi12:"; + case VK_GOT_AUTH: return ":got_auth:"; + case VK_GOT_AUTH_PAGE: return ":got_auth:"; + case VK_GOT_AUTH_LO12: return ":got_auth_lo12:"; default: llvm_unreachable("Invalid ELF symbol kind"); } + // clang-format on } void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 
cf3a90f95a2c..699992782f67 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -24,6 +24,7 @@ namespace llvm { class AArch64MCExpr : public MCTargetExpr { public: enum VariantKind { + // clang-format off // Symbol locations specifying (roughly speaking) what calculation should be // performed to construct the final address for the relocated // symbol. E.g. direct, via the GOT, ... @@ -38,6 +39,7 @@ public: VK_SECREL = 0x009, VK_AUTH = 0x00a, VK_AUTHADDR = 0x00b, + VK_GOT_AUTH = 0x00c, VK_SymLocBits = 0x00f, // Variants specifying which part of the final address calculation is @@ -88,6 +90,8 @@ public: VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, VK_GOT_PAGE = VK_GOT | VK_PAGE, VK_GOT_PAGE_LO15 = VK_GOT | VK_LO15 | VK_NC, + VK_GOT_AUTH_LO12 = VK_GOT_AUTH | VK_PAGEOFF | VK_NC, + VK_GOT_AUTH_PAGE = VK_GOT_AUTH | VK_PAGE, VK_DTPREL_G2 = VK_DTPREL | VK_G2, VK_DTPREL_G1 = VK_DTPREL | VK_G1, VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, @@ -114,6 +118,7 @@ public: VK_SECREL_HI12 = VK_SECREL | VK_HI12, VK_INVALID = 0xfff + // clang-format on }; private: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index de1f3421cce4..39c52140dfbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1038,7 +1038,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) { &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, - &AAUnderlyingObjects::ID}); + &AAUnderlyingObjects::ID, &AAAddressSpace::ID}); AttributorConfig AC(CGUpdater); AC.Allowed = &Allowed; @@ -1064,6 +1064,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) { } else if (CC == CallingConv::AMDGPU_KERNEL) { addPreloadKernArgHint(F, TM); } + + for (auto &I : instructions(F)) { + if (auto 
*LI = dyn_cast<LoadInst>(&I)) { + A.getOrCreateAAFor<AAAddressSpace>( + IRPosition::value(*LI->getPointerOperand())); + } + if (auto *SI = dyn_cast<StoreInst>(&I)) { + A.getOrCreateAAFor<AAAddressSpace>( + IRPosition::value(*SI->getPointerOperand())); + } + } } ChangeStatus Change = A.run(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 73f3921b2ff4..f78699f88de5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { MachineInstrBuilder SelectedMI; MachineOperand &LHS = I.getOperand(2); MachineOperand &RHS = I.getOperand(3); - auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS); - auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS); + auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg()); + auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg()); Register Src0Reg = copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true); Register Src1Reg = @@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { return false; } +static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) { + return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg; +} + +static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) { + Register BitcastSrc; + if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc)))) + Reg = BitcastSrc; + return Reg; +} + static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out) { + Register Trunc; + if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc)))) + return false; + Register LShlSrc; - if (mi_match(In, MRI, - m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) { - Out = LShlSrc; + Register Cst; + if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) { + Cst = stripCopy(Cst, MRI); + if (mi_match(Cst, MRI, m_SpecificICst(16))) { + Out = 
stripBitCast(LShlSrc, MRI); + return true; + } + } + + MachineInstr *Shuffle = MRI.getVRegDef(Trunc); + if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR) + return false; + + assert(MRI.getType(Shuffle->getOperand(0).getReg()) == + LLT::fixed_vector(2, 16)); + + ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask(); + assert(Mask.size() == 2); + + if (Mask[0] == 1 && Mask[1] <= 1) { + Out = Shuffle->getOperand(0).getReg(); return true; } + return false; } @@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } -std::pair<Register, unsigned> -AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, - bool IsCanonicalizing, - bool AllowAbs, bool OpSel) const { - Register Src = Root.getReg(); +std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( + Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const { unsigned Mods = 0; MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); @@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { @@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, /*AllowAbs=*/false); @@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { @@ -3675,7 +3706,8 @@ 
AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing( MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false); + std::tie(Src, Mods) = + selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true, - /*AllowAbs=*/false); + std::tie(Src, Mods) = + selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, + /*AllowAbs=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); // FIXME: Handle op_sel return {{ @@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, /*AllowAbs=*/false, /*OpSel=*/false); @@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, /*AllowAbs=*/false, /*OpSel=*/true); @@ -5229,59 +5262,6 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}}; } -// Variant of stripBitCast that returns the instruction instead of a -// MachineOperand. 
-static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) { - if (MI->getOpcode() == AMDGPU::G_BITCAST) - return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI); - return MI; -} - -// Figure out if this is really an extract of the high 16-bits of a dword, -// returns nullptr if it isn't. -static MachineInstr *isExtractHiElt(MachineInstr *Inst, - MachineRegisterInfo &MRI) { - Inst = stripBitCast(Inst, MRI); - - if (Inst->getOpcode() != AMDGPU::G_TRUNC) - return nullptr; - - MachineInstr *TruncOp = - getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI); - TruncOp = stripBitCast(TruncOp, MRI); - - // G_LSHR x, (G_CONSTANT i32 16) - if (TruncOp->getOpcode() == AMDGPU::G_LSHR) { - auto SrlAmount = getIConstantVRegValWithLookThrough( - TruncOp->getOperand(2).getReg(), MRI); - if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) { - MachineInstr *SrlOp = - getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI); - return stripBitCast(SrlOp, MRI); - } - } - - // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0) - // 1, 0 swaps the low/high 16 bits. - // 1, 1 sets the high 16 bits to be the same as the low 16. - // in any case, it selects the high elts. 
- if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) { - assert(MRI.getType(TruncOp->getOperand(0).getReg()) == - LLT::fixed_vector(2, 16)); - - ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask(); - assert(Mask.size() == 2); - - if (Mask[0] == 1 && Mask[1] <= 1) { - MachineInstr *LHS = - getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI); - return stripBitCast(LHS, MRI); - } - } - - return nullptr; -} - std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, bool &Matched) const { @@ -5289,37 +5269,34 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); - - MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); - if (MI->getOpcode() == AMDGPU::G_FPEXT) { - MachineOperand *MO = &MI->getOperand(1); - Src = MO->getReg(); - MI = getDefIgnoringCopies(Src, *MRI); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) { assert(MRI->getType(Src) == LLT::scalar(16)); - // See through bitcasts. - // FIXME: Would be nice to use stripBitCast here. - if (MI->getOpcode() == AMDGPU::G_BITCAST) { - MO = &MI->getOperand(1); - Src = MO->getReg(); - MI = getDefIgnoringCopies(Src, *MRI); - } + // Only change Src if src modifier could be gained. In such cases new Src + // could be sgpr but this does not violate constant bus restriction for + // instruction that is being selected. + // Note: Src is not changed when there is only a simple sgpr to vgpr copy + // since this could violate constant bus restriction. + Register PeekSrc = stripCopy(Src, *MRI); const auto CheckAbsNeg = [&]() { // Be careful about folding modifiers if we already have an abs. fneg is // applied last, so we don't want to apply an earlier fneg. 
if ((Mods & SISrcMods::ABS) == 0) { unsigned ModsTmp; - std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO); - MI = getDefIgnoringCopies(Src, *MRI); + std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc); - if ((ModsTmp & SISrcMods::NEG) != 0) + if ((ModsTmp & SISrcMods::NEG) != 0) { Mods ^= SISrcMods::NEG; + Src = PeekSrc; + } - if ((ModsTmp & SISrcMods::ABS) != 0) + if ((ModsTmp & SISrcMods::ABS) != 0) { Mods |= SISrcMods::ABS; + Src = PeekSrc; + } } }; @@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, Mods |= SISrcMods::OP_SEL_1; - if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) { + if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) { + Src = PeekSrc; Mods |= SISrcMods::OP_SEL_0; - MI = ExtractHiEltMI; - MO = &MI->getOperand(0); - Src = MO->getReg(); - CheckAbsNeg(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 7fff7d2685e7..69806b240cf2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -150,7 +150,7 @@ private: bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierLeave(MachineInstr &I) const; - std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root, + std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, bool AllowAbs = true, bool OpSel = false) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 9a6ba5ac6808..17067ddd93ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3739,17 +3739,28 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { + Register DstReg = MI.getOperand(0).getReg(); + 
Register SrcReg = MI.getOperand(1).getReg(); + // The default logic bothers to analyze impossible alternative mappings. We // want the most straightforward mapping, so just directly handle this. - const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI, - *TRI); - const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI, - *TRI); + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); assert(SrcBank && "src bank should have been assigned already"); + + // For COPY between a physical reg and an s1, there is no type associated so + // we need to take the virtual register's type as a hint on how to interpret + // s1 values. + if (!SrcReg.isVirtual() && !DstBank && + MRI.getType(DstReg) == LLT::scalar(1)) + DstBank = &AMDGPU::VCCRegBank; + else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1)) + DstBank = &AMDGPU::VCCRegBank; + if (!DstBank) DstBank = SrcBank; - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned Size = getSizeInBits(DstReg, MRI, *TRI); if (MI.getOpcode() != AMDGPU::G_FREEZE && cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size))) return getInvalidInstructionMapping(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 80ca30829032..5c4d2b8d030e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -381,3 +381,7 @@ def : AlwaysUniform<int_amdgcn_if_break>; def : AlwaysUniform<int_amdgcn_workgroup_id_x>; def : AlwaysUniform<int_amdgcn_workgroup_id_y>; def : AlwaysUniform<int_amdgcn_workgroup_id_z>; +def : AlwaysUniform<int_amdgcn_s_getpc>; +def : AlwaysUniform<int_amdgcn_s_getreg>; +def : AlwaysUniform<int_amdgcn_s_memrealtime>; +def : AlwaysUniform<int_amdgcn_s_memtime>; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 26a839a95df9..c8b594ffbc64 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1618,6 +1618,14 @@ public: ParseStatus parseTH(OperandVector &Operands, int64_t &TH); ParseStatus parseStringWithPrefix(StringRef Prefix, StringRef &Value, SMLoc &StringLoc); + ParseStatus parseStringOrIntWithPrefix(OperandVector &Operands, + StringRef Name, + ArrayRef<const char *> Ids, + int64_t &IntVal); + ParseStatus parseStringOrIntWithPrefix(OperandVector &Operands, + StringRef Name, + ArrayRef<const char *> Ids, + AMDGPUOperand::ImmTy Type); bool isModifier(); bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; @@ -6633,27 +6641,17 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { ParseStatus AMDGPUAsmParser::parseScope(OperandVector &Operands, int64_t &Scope) { - Scope = AMDGPU::CPol::SCOPE_CU; // default; + static const unsigned Scopes[] = {CPol::SCOPE_CU, CPol::SCOPE_SE, + CPol::SCOPE_DEV, CPol::SCOPE_SYS}; - StringRef Value; - SMLoc StringLoc; - ParseStatus Res; - - Res = parseStringWithPrefix("scope", Value, StringLoc); - if (!Res.isSuccess()) - return Res; - - Scope = StringSwitch<int64_t>(Value) - .Case("SCOPE_CU", AMDGPU::CPol::SCOPE_CU) - .Case("SCOPE_SE", AMDGPU::CPol::SCOPE_SE) - .Case("SCOPE_DEV", AMDGPU::CPol::SCOPE_DEV) - .Case("SCOPE_SYS", AMDGPU::CPol::SCOPE_SYS) - .Default(0xffffffff); + ParseStatus Res = parseStringOrIntWithPrefix( + Operands, "scope", {"SCOPE_CU", "SCOPE_SE", "SCOPE_DEV", "SCOPE_SYS"}, + Scope); - if (Scope == 0xffffffff) - return Error(StringLoc, "invalid scope value"); + if (Res.isSuccess()) + Scope = Scopes[Scope]; - return ParseStatus::Success; + return Res; } ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) { @@ -6742,6 +6740,44 @@ ParseStatus AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, : ParseStatus::Failure; } 
+ParseStatus AMDGPUAsmParser::parseStringOrIntWithPrefix( + OperandVector &Operands, StringRef Name, ArrayRef<const char *> Ids, + int64_t &IntVal) { + if (!trySkipId(Name, AsmToken::Colon)) + return ParseStatus::NoMatch; + + SMLoc StringLoc = getLoc(); + + StringRef Value; + if (isToken(AsmToken::Identifier)) { + Value = getTokenStr(); + lex(); + + for (IntVal = 0; IntVal < (int64_t)Ids.size(); ++IntVal) + if (Value == Ids[IntVal]) + break; + } else if (!parseExpr(IntVal)) + return ParseStatus::Failure; + + if (IntVal < 0 || IntVal >= (int64_t)Ids.size()) + return Error(StringLoc, "invalid " + Twine(Name) + " value"); + + return ParseStatus::Success; +} + +ParseStatus AMDGPUAsmParser::parseStringOrIntWithPrefix( + OperandVector &Operands, StringRef Name, ArrayRef<const char *> Ids, + AMDGPUOperand::ImmTy Type) { + SMLoc S = getLoc(); + int64_t IntVal; + + ParseStatus Res = parseStringOrIntWithPrefix(Operands, Name, Ids, IntVal); + if (Res.isSuccess()) + Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S, Type)); + + return Res; +} + //===----------------------------------------------------------------------===// // MTBUF format //===----------------------------------------------------------------------===// @@ -9396,57 +9432,16 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I ParseStatus AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix, AMDGPUOperand::ImmTy Type) { - using namespace llvm::AMDGPU::SDWA; - - SMLoc S = getLoc(); - StringRef Value; - - SMLoc StringLoc; - ParseStatus Res = parseStringWithPrefix(Prefix, Value, StringLoc); - if (!Res.isSuccess()) - return Res; - - int64_t Int; - Int = StringSwitch<int64_t>(Value) - .Case("BYTE_0", SdwaSel::BYTE_0) - .Case("BYTE_1", SdwaSel::BYTE_1) - .Case("BYTE_2", SdwaSel::BYTE_2) - .Case("BYTE_3", SdwaSel::BYTE_3) - .Case("WORD_0", SdwaSel::WORD_0) - .Case("WORD_1", SdwaSel::WORD_1) - .Case("DWORD", SdwaSel::DWORD) - .Default(0xffffffff); - - if (Int 
== 0xffffffff) - return Error(StringLoc, "invalid " + Twine(Prefix) + " value"); - - Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, Type)); - return ParseStatus::Success; + return parseStringOrIntWithPrefix( + Operands, Prefix, + {"BYTE_0", "BYTE_1", "BYTE_2", "BYTE_3", "WORD_0", "WORD_1", "DWORD"}, + Type); } ParseStatus AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { - using namespace llvm::AMDGPU::SDWA; - - SMLoc S = getLoc(); - StringRef Value; - - SMLoc StringLoc; - ParseStatus Res = parseStringWithPrefix("dst_unused", Value, StringLoc); - if (!Res.isSuccess()) - return Res; - - int64_t Int; - Int = StringSwitch<int64_t>(Value) - .Case("UNUSED_PAD", DstUnused::UNUSED_PAD) - .Case("UNUSED_SEXT", DstUnused::UNUSED_SEXT) - .Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE) - .Default(0xffffffff); - - if (Int == 0xffffffff) - return Error(StringLoc, "invalid dst_unused value"); - - Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySDWADstUnused)); - return ParseStatus::Success; + return parseStringOrIntWithPrefix( + Operands, "dst_unused", {"UNUSED_PAD", "UNUSED_SEXT", "UNUSED_PRESERVE"}, + AMDGPUOperand::ImmTySDWADstUnused); } void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) { diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ae537b194f50..b39fbdc26795 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: case AMDGPU::S_LOAD_DWORDX2_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX2: @@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const 
MachineInstr &MI, const SIInstrInfo &TII) { return 2; case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX3_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX3: @@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX4: @@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: case AMDGPU::S_LOAD_DWORDX8_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return 8; @@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: return S_BUFFER_LOAD_IMM; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: return S_BUFFER_LOAD_SGPR_IMM; case 
AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: @@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: @@ -703,6 +727,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: Result.SOffset = true; [[fallthrough]]; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: @@ -710,6 +738,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: case 
AMDGPU::S_LOAD_DWORDX3_IMM: @@ -1679,6 +1711,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( return New; } +static bool needsConstrainedOpcode(const GCNSubtarget &STM, + ArrayRef<MachineMemOperand *> MMOs, + unsigned Width) { + // Conservatively returns true if not found the MMO. + return STM.isXNACKEnabled() && + (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4); +} + unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; @@ -1696,38 +1736,55 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case UNKNOWN: llvm_unreachable("Unknown instruction class"); - case S_BUFFER_LOAD_IMM: + case S_BUFFER_LOAD_IMM: { + // If XNACK is enabled, use the constrained opcodes when the first load is + // under-aligned. + bool NeedsConstrainedOpc = + needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); switch (Width) { default: return 0; case 2: - return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; case 3: - return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; case 4: - return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; case 8: - return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } - case S_BUFFER_LOAD_SGPR_IMM: + } + case S_BUFFER_LOAD_SGPR_IMM: { + // If XNACK is enabled, use the constrained opcodes when the first load is + // under-aligned. + bool NeedsConstrainedOpc = + needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); switch (Width) { default: return 0; case 2: - return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; + return NeedsConstrainedOpc ? 
AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; case 3: - return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; case 4: - return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; case 8: - return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; } + } case S_LOAD_IMM: { // If XNACK is enabled, use the constrained opcodes when the first load is // under-aligned. - const MachineMemOperand *MMO = *CI.I->memoperands_begin(); bool NeedsConstrainedOpc = - STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4; + needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); switch (Width) { default: return 0; diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index fb33308e491c..1312b44b49bd 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -414,6 +414,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. MF->RenumberBlocks(); + DT->updateBlockNumbers(); // Try to reorder and otherwise adjust the block layout to make good use // of the TB[BH] instructions. @@ -425,6 +426,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { T2JumpTables.clear(); // Blocks may have shifted around. Keep the numbering up to date. 
MF->RenumberBlocks(); + DT->updateBlockNumbers(); } // Align any non-fallthrough blocks @@ -670,8 +672,10 @@ void ARMConstantIslands::doInitialJumpTablePlacement( } // If we did anything then we need to renumber the subsequent blocks. - if (LastCorrectlyNumberedBB) + if (LastCorrectlyNumberedBB) { MF->RenumberBlocks(LastCorrectlyNumberedBB); + DT->updateBlockNumbers(); + } } /// BBHasFallthrough - Return true if the specified basic block can fallthrough @@ -972,6 +976,7 @@ static bool CompareMBBNumbers(const MachineBasicBlock *LHS, void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) { // Renumber the MBB's to keep them consecutive. NewBB->getParent()->RenumberBlocks(NewBB); + DT->updateBlockNumbers(); // Insert an entry into BBInfo to align it properly with the (newly // renumbered) block numbers. @@ -1034,6 +1039,7 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // This is almost the same as updateForInsertedWaterBlock, except that // the Water goes after OrigBB, not NewBB. MF->RenumberBlocks(NewBB); + DT->updateBlockNumbers(); // Insert an entry into BBInfo to align it properly with the (newly // renumbered) block numbers. @@ -2485,6 +2491,7 @@ MachineBasicBlock *ARMConstantIslands::adjustJTTargetBlockForward( BB->updateTerminator(OldNext != MF->end() ? &*OldNext : nullptr); // Update numbering to account for the block being moved. MF->RenumberBlocks(); + DT->updateBlockNumbers(); ++NumJTMoved; return nullptr; } @@ -2513,6 +2520,7 @@ MachineBasicBlock *ARMConstantIslands::adjustJTTargetBlockForward( // Update internal data structures to account for the newly inserted MBB. MF->RenumberBlocks(NewBB); + DT->updateBlockNumbers(); // Update the CFG. 
NewBB->addSuccessor(BB); diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index e133dbeba365..61635bd1629e 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5849,6 +5849,7 @@ def t2AUT : PACBTIHintSpaceUseInst<"aut", 0b00101101> { def ARMt2CallBTI : SDNode<"ARMISD::t2CALL_BTI", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +let Defs = [LR], Uses = [SP] in def t2CALL_BTI : PseudoInst<(outs), (ins pred:$p, thumb_bl_target:$func), IIC_Br, [(ARMt2CallBTI tglobaladdr:$func)]>, Requires<[IsThumb2]>, Sched<[WriteBrL]>; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 994b43f1abb4..4e4a19ddf558 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -587,6 +587,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, return 0; return 0xffffff & ((Value - 8) >> 2); case ARM::fixup_t2_uncondbranch: { + if (STI->getTargetTriple().isOSBinFormatCOFF() && !IsResolved && + Value != 4) { + // MSVC link.exe and lld do not support this relocation type + // with a non-zero offset. ("Value" is offset by 4 at this point.) + Ctx.reportError(Fixup.getLoc(), + "cannot perform a PC-relative fixup with a non-zero " + "symbol offset"); + } Value = Value - 4; if (!isInt<25>(Value)) { Ctx.reportError(Fixup.getLoc(), "Relocation out of range"); @@ -637,6 +645,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, Ctx.reportError(Fixup.getLoc(), "Relocation out of range"); return 0; } + if (STI->getTargetTriple().isOSBinFormatCOFF() && !IsResolved && + Value != 4) { + // MSVC link.exe and lld do not support this relocation type + // with a non-zero offset. ("Value" is offset by 4 at this point.) 
+ Ctx.reportError(Fixup.getLoc(), + "cannot perform a PC-relative fixup with a non-zero " + "symbol offset"); + } // The value doesn't encode the low bit (always zero) and is offset by // four. The 32-bit immediate value is encoded as @@ -666,6 +682,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, Endian == llvm::endianness::little); } case ARM::fixup_arm_thumb_blx: { + if (STI->getTargetTriple().isOSBinFormatCOFF() && !IsResolved && + Value != 4) { + // MSVC link.exe and lld do not support this relocation type + // with a non-zero offset. ("Value" is offset by 4 at this point.) + Ctx.reportError(Fixup.getLoc(), + "cannot perform a PC-relative fixup with a non-zero " + "symbol offset"); + } // The value doesn't encode the low two bits (always zero) and is offset by // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00) diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 50a59ce76763..ddc62b534598 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -41,8 +41,6 @@ namespace { bool needsRelocateWithSymbol(const MCValue &Val, const MCSymbol &Sym, unsigned Type) const override; - - void addTargetSectionFlags(MCContext &Ctx, MCSectionELF &Sec) override; }; } // end anonymous namespace @@ -319,25 +317,6 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, } } -void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx, - MCSectionELF &Sec) { - // The mix of execute-only and non-execute-only at link time is - // non-execute-only. To avoid the empty implicitly created .text - // section from making the whole .text section non-execute-only, we - // mark it execute-only if it is empty and there is at least one - // execute-only section in the object. 
- MCSectionELF *TextSection = - static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); - bool IsExecOnly = Sec.getFlags() & ELF::SHF_ARM_PURECODE; - if (IsExecOnly && !TextSection->hasInstructions()) { - for (auto &F : *TextSection) - if (auto *DF = dyn_cast<MCDataFragment>(&F)) - if (!DF->getContents().empty()) - return; - TextSection->setFlags(TextSection->getFlags() | ELF::SHF_ARM_PURECODE); - } -} - std::unique_ptr<MCObjectTargetWriter> llvm::createARMELFObjectWriter(uint8_t OSABI) { return std::make_unique<ARMELFObjectWriter>(OSABI); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 9df752f8eb68..59f29660a777 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -34,6 +34,7 @@ #include "llvm/MC/MCFragment.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSection.h" @@ -1113,6 +1114,25 @@ void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; } void ARMTargetELFStreamer::finish() { ARMTargetStreamer::finish(); finishAttributeSection(); + + // The mix of execute-only and non-execute-only at link time is + // non-execute-only. To avoid the empty implicitly created .text + // section from making the whole .text section non-execute-only, we + // mark it execute-only if it is empty and there is at least one + // execute-only section in the object. 
+ MCContext &Ctx = getStreamer().getContext(); + auto &Asm = getStreamer().getAssembler(); + if (any_of(Asm, [](const MCSection &Sec) { + return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_ARM_PURECODE; + })) { + auto *Text = + static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); + for (auto &F : *Text) + if (auto *DF = dyn_cast<MCDataFragment>(&F)) + if (!DF->getContents().empty()) + return; + Text->setFlags(Text->getFlags() | ELF::SHF_ARM_PURECODE); + } } void ARMELFStreamer::reset() { diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index 97bdd4c45a8c..d7f4d4b93f95 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -28,7 +28,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -217,11 +216,6 @@ public: bool runOnMachineFunction(MachineFunction &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTreeWrapperPass>(); - MachineFunctionPass::getAnalysisUsage(AU); - } - MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp index e9d95c6e89db..a2230289ae69 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp @@ -249,7 +249,8 @@ void HexagonCopyHoisting::moveCopyInstr(MachineBasicBlock *DestBB, DestBB->splice(FirstTI, MI->getParent(), MI); addMItoCopyList(MI); - for (auto I = ++(DestBB->succ_begin()), E = DestBB->succ_end(); I != E; ++I) { + for (auto I 
= std::next(DestBB->succ_begin()), E = DestBB->succ_end(); I != E; + ++I) { MachineBasicBlock *SuccBB = *I; auto &BBCopyInst = CopyMIList[SuccBB->getNumber()]; MachineInstr *SuccMI = BBCopyInst[Key]; diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index a004d64c21cc..5b568b0487b4 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -13,12 +13,14 @@ #include "MCTargetDesc/NVPTXInstPrinter.h" #include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" +#include "NVPTXUtilities.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/FormattedStream.h" #include <cctype> using namespace llvm; @@ -228,31 +230,29 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int) MO.getImm(); if (!strcmp(Modifier, "sem")) { - switch (Imm) { - case NVPTX::PTXLdStInstCode::NotAtomic: + auto Ordering = NVPTX::Ordering(Imm); + switch (Ordering) { + case NVPTX::Ordering::NotAtomic: break; - case NVPTX::PTXLdStInstCode::Volatile: + case NVPTX::Ordering::Volatile: O << ".volatile"; break; - case NVPTX::PTXLdStInstCode::Relaxed: + case NVPTX::Ordering::Relaxed: O << ".relaxed.sys"; break; - case NVPTX::PTXLdStInstCode::Acquire: + case NVPTX::Ordering::Acquire: O << ".acquire.sys"; break; - case NVPTX::PTXLdStInstCode::Release: + case NVPTX::Ordering::Release: O << ".release.sys"; break; - case NVPTX::PTXLdStInstCode::RelaxedMMIO: + case NVPTX::Ordering::RelaxedMMIO: O << ".mmio.relaxed.sys"; break; default: - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "NVPTX LdStCode Printer does not support \"" << Imm - << "\" sem modifier."; - 
report_fatal_error(OS.str()); - break; + report_fatal_error(formatv( + "NVPTX LdStCode Printer does not support \"{}\" sem modifier.", + OrderingToCString(Ordering))); } } else if (!strcmp(Modifier, "addsp")) { switch (Imm) { diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 3c7167b15702..f6f6acb9e13c 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -16,6 +16,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CodeGen.h" namespace llvm { @@ -106,15 +107,25 @@ enum LoadStore { isStoreShift = 6 }; -namespace PTXLdStInstCode { -enum MemorySemantic { - NotAtomic = 0, // PTX calls these: "Weak" - Volatile = 1, - Relaxed = 2, - Acquire = 3, - Release = 4, - RelaxedMMIO = 5 +// Extends LLVM AtomicOrdering with PTX Orderings: +using OrderingUnderlyingType = unsigned int; +enum Ordering : OrderingUnderlyingType { + NotAtomic = (OrderingUnderlyingType) + AtomicOrdering::NotAtomic, // PTX calls these: "Weak" + // Unordered = 1, // NVPTX maps LLVM Unorderd to Relaxed + Relaxed = (OrderingUnderlyingType)AtomicOrdering::Monotonic, + // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire" + Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire, + Release = (OrderingUnderlyingType)AtomicOrdering::Release, + // AcquireRelease = 6, // TODO + SequentiallyConsistent = + (OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent, + Volatile = SequentiallyConsistent + 1, + RelaxedMMIO = Volatile + 1, + LAST = RelaxedMMIO }; + +namespace PTXLdStInstCode { enum AddressSpace { GENERIC = 0, GLOBAL = 1, @@ -134,7 +145,7 @@ enum VecType { V2 = 2, V4 = 4 }; -} +} // namespace PTXLdStInstCode /// PTXCvtMode - Conversion code enumeration namespace PTXCvtMode { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 96456ad0547e..25c198f0121e 100644 --- 
a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -714,21 +715,28 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } -static unsigned int getCodeMemorySemantic(MemSDNode *N, - const NVPTXSubtarget *Subtarget) { +namespace { + +struct OperationOrderings { + NVPTX::Ordering InstructionOrdering, FenceOrdering; + OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic, + NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic) + : InstructionOrdering(IO), FenceOrdering(FO) {} +}; + +static OperationOrderings +getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { AtomicOrdering Ordering = N->getSuccessOrdering(); auto CodeAddrSpace = getCodeAddrSpace(N); bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); - // TODO: lowering for SequentiallyConsistent Operations: for now, we error. - // TODO: lowering for AcquireRelease Operations: for now, we error. - // - // clang-format off - // Lowering for non-SequentiallyConsistent Operations + // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error). + // Note: uses of Relaxed in the Atomic column of this table refer + // to LLVM AtomicOrdering::Monotonic. 
// // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ | // |---------|----------|--------------------|------------|------------------------------| @@ -749,6 +757,25 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] | // | | | / Global [0] | | | + // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX + // by following the ABI proven sound in: + // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19. + // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043 + // + // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_thread_fence | fence.sc.<scope>; | + // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_load | fence.sc.<scope>; | + // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_store | fence.sc.<scope>; | + // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_fetch_<op> | fence.sc.<scope>; | + // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; | + // clang-format on // [0]: volatile and atomics are only supported on global or shared @@ -788,11 +815,10 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // - the "weak" memory instruction we are currently lowering to, and // - some other instruction that preserves the side-effect, e.g., // a dead dummy volatile load. 
- if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT || CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) { - return NVPTX::PTXLdStInstCode::NotAtomic; + return NVPTX::Ordering::NotAtomic; } // [2]: Atomics with Ordering different than Unordered or Relaxed are not @@ -801,12 +827,11 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, Ordering == AtomicOrdering::Unordered || Ordering == AtomicOrdering::Monotonic) && !HasMemoryOrdering) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX does not support \"atomic\" for orderings different than" - "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \"" - << toIRString(Ordering) << "\"."; - report_fatal_error(OS.str()); + report_fatal_error( + formatv("PTX does not support \"atomic\" for orderings different than" + "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order " + "is: \"{}\".", + toIRString(Ordering))); } // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop @@ -820,68 +845,76 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC || CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED); + if (!AddrGenericOrGlobalOrShared) + return NVPTX::Ordering::NotAtomic; + bool UseRelaxedMMIO = HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; switch (Ordering) { case AtomicOrdering::NotAtomic: - return N->isVolatile() && AddrGenericOrGlobalOrShared - ? NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + return N->isVolatile() ? NVPTX::Ordering::Volatile + : NVPTX::Ordering::NotAtomic; case AtomicOrdering::Unordered: // We lower unordered in the exact same way as 'monotonic' to respect // LLVM IR atomicity requirements. case AtomicOrdering::Monotonic: if (N->isVolatile()) - return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO - : AddrGenericOrGlobalOrShared ? 
NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO + : NVPTX::Ordering::Volatile; else - return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed - : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + return HasMemoryOrdering ? NVPTX::Ordering::Relaxed + : NVPTX::Ordering::Volatile; + // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to + // Acquire. case AtomicOrdering::Acquire: - if (!N->readMem()) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX only supports Acquire Ordering on reads: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); - } - return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire - : NVPTX::PTXLdStInstCode::NotAtomic; + if (!N->readMem()) + report_fatal_error( + formatv("PTX only supports Acquire Ordering on reads: {}", + N->getOperationName())); + return NVPTX::Ordering::Acquire; case AtomicOrdering::Release: - if (!N->writeMem()) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX only supports Release Ordering on writes: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); - } - return AddrGenericOrGlobalOrShared ? 
NVPTX::PTXLdStInstCode::Release - : NVPTX::PTXLdStInstCode::NotAtomic; + if (!N->writeMem()) + report_fatal_error( + formatv("PTX only supports Release Ordering on writes: {}", + N->getOperationName())); + return NVPTX::Ordering::Release; case AtomicOrdering::AcquireRelease: { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX only supports AcquireRelease Ordering on read-modify-write: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); + report_fatal_error( + formatv("NVPTX does not support AcquireRelease Ordering on " + "read-modify-write " + "yet and PTX does not support it on loads or stores: {}", + N->getOperationName())); + } + case AtomicOrdering::SequentiallyConsistent: { + // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX + // sequence including a "fence.sc.sco" and the memory instruction with an + // Ordering that differs from "sc": acq, rel, or acq_rel, depending on + // whether the memory operation is a read, write, or read-modify-write. + // + // This sets the ordering of the fence to SequentiallyConsistent, and + // sets the corresponding ordering for the instruction. 
+ NVPTX::Ordering InstrOrder; + if (N->readMem()) + InstrOrder = NVPTX::Ordering::Acquire; + else if (N->writeMem()) + InstrOrder = NVPTX::Ordering::Release; + else + report_fatal_error( + formatv("NVPTX does not support SequentiallyConsistent Ordering on " + "read-modify-writes yet: {}", + N->getOperationName())); + return OperationOrderings(InstrOrder, + NVPTX::Ordering::SequentiallyConsistent); } - case AtomicOrdering::SequentiallyConsistent: - // TODO: support AcquireRelease and SequentiallyConsistent - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "NVPTX backend does not support AtomicOrdering \"" - << toIRString(Ordering) << "\" yet."; - report_fatal_error(OS.str()); } - - llvm_unreachable("unexpected unhandled case"); + report_fatal_error( + formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.", + toIRString(Ordering))); } +} // namespace + static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address @@ -924,6 +957,35 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, }); } +NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, + SDValue &Chain, + MemSDNode *N) { + // Some memory instructions - loads, stores, atomics - need an extra fence + // instruction. Get the memory order of the instruction, and that of its + // fence, if any. + auto [InstructionOrdering, FenceOrdering] = + getOperationOrderings(N, Subtarget); + + // If a fence is required before the operation, insert it: + switch (NVPTX::Ordering(FenceOrdering)) { + case NVPTX::Ordering::NotAtomic: + break; + case NVPTX::Ordering::SequentiallyConsistent: { + unsigned Op = Subtarget->hasMemoryOrdering() + ? 
NVPTX::atomic_thread_fence_seq_cst_sys + : NVPTX::INT_MEMBAR_SYS; + Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); + break; + } + default: + report_fatal_error( + formatv("Unexpected fence ordering: \"{}\".", + OrderingToCString(NVPTX::Ordering(FenceOrdering)))); + } + + return InstructionOrdering; +} + bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { unsigned IID = N->getConstantOperandVal(0); switch (IID) { @@ -1070,17 +1132,15 @@ static int getLdStRegType(EVT VT) { } bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { - SDLoc dl(N); MemSDNode *LD = cast<MemSDNode>(N); assert(LD->readMem() && "Expected load"); - LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N); - EVT LoadedVT = LD->getMemoryVT(); - SDNode *NVPTXLD = nullptr; // do not support pre/post inc/dec + LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N); if (PlainLoad && PlainLoad->isIndexed()) return false; + EVT LoadedVT = LD->getMemoryVT(); if (!LoadedVT.isSimple()) return false; @@ -1089,13 +1149,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } - - // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget); - unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD); + // Type Setting: fromType + fromTypeWidth // // Sign : ISD::SEXTLOAD @@ -1105,45 +1165,42 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { MVT SimpleVT = LoadedVT.getSimpleVT(); MVT ScalarVT = SimpleVT.getScalarType(); // Read at least 8 bits (predicates are stored as 8-bit values) - unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); - unsigned int fromType; + unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); + unsigned int FromType; // Vector Setting - unsigned vecType = 
NVPTX::PTXLdStInstCode::Scalar; + unsigned VecType = NVPTX::PTXLdStInstCode::Scalar; if (SimpleVT.isVector()) { assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) && "Unexpected vector type"); // v2f16/v2bf16/v2i16 is loaded using ld.b32 - fromTypeWidth = 32; + FromTypeWidth = 32; } if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) - fromType = NVPTX::PTXLdStInstCode::Signed; + FromType = NVPTX::PTXLdStInstCode::Signed; else - fromType = getLdStRegType(ScalarVT); + FromType = getLdStRegType(ScalarVT); // Create the machine instruction DAG - SDValue Chain = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue Addr; SDValue Offset, Base; std::optional<unsigned> Opcode; MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; + SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL)}); + if (SelectDirectAddr(N1, Addr)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar, NVPTX::LD_i64_avar, NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Addr, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? 
SelectADDRsi64(N1.getNode(), N1, Base, Offset) : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, @@ -1151,15 +1208,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset) : SelectADDRri(N1.getNode(), N1, Base, Offset)) { if (PointerSize == 64) @@ -1173,15 +1222,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) Opcode = @@ -1194,16 +1235,11 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - N1, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.append({N1, Chain}); } + SDNode *NVPTXLD = + CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops); if (!NVPTXLD) return false; @@ -1215,16 +1251,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { } bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { - - SDValue Chain = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue Addr, Offset, 
Base; - std::optional<unsigned> Opcode; - SDLoc DL(N); - SDNode *LD; MemSDNode *MemSD = cast<MemSDNode>(N); EVT LoadedVT = MemSD->getMemoryVT(); - if (!LoadedVT.isSimple()) return false; @@ -1233,12 +1261,12 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } - unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD); // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); @@ -1286,6 +1314,16 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { FromTypeWidth = 32; } + SDValue Op1 = N->getOperand(1); + SDValue Addr, Offset, Base; + std::optional<unsigned> Opcode; + SDNode *LD; + + SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL)}); + if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: @@ -1305,14 +1343,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Addr, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? 
SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { @@ -1334,15 +1365,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Base, - Offset, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { @@ -1384,16 +1407,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Base, - Offset, - Chain}; - - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -1434,15 +1448,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Op1, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.append({Op1, Chain}); } + LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef}); @@ -1452,8 +1460,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { - - SDValue Chain = N->getOperand(0); SDValue Op1; MemSDNode *Mem; bool IsLDG = true; @@ -1483,12 +1489,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { Mem = cast<MemSDNode>(N); } - std::optional<unsigned> 
Opcode; - SDLoc DL(N); - SDNode *LD; - SDValue Base, Offset, Addr; EVT OrigType = N->getValueType(0); - EVT EltVT = Mem->getMemoryVT(); unsigned NumElts = 1; if (EltVT.isVector()) { @@ -1517,6 +1518,12 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { } InstVTs.push_back(MVT::Other); SDVTList InstVTList = CurDAG->getVTList(InstVTs); + SDValue Chain = N->getOperand(0); + + std::optional<unsigned> Opcode; + SDLoc DL(N); + SDNode *LD; + SDValue Base, Offset, Addr; if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { @@ -1867,19 +1874,17 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { } bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { - SDLoc dl(N); MemSDNode *ST = cast<MemSDNode>(N); assert(ST->writeMem() && "Expected store"); StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N); AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N); assert((PlainStore || AtomicStore) && "Expected store"); - EVT StoreVT = ST->getMemoryVT(); - SDNode *NVPTXST = nullptr; // do not support pre/post inc/dec if (PlainStore && PlainStore->isIndexed()) return false; + EVT StoreVT = ST->getMemoryVT(); if (!StoreVT.isSimple()) return false; @@ -1888,29 +1893,28 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); - // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget); + SDLoc DL(N); + SDValue Chain = ST->getChain(); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST); // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); - unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + unsigned VecType = NVPTX::PTXLdStInstCode::Scalar; // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' - // MVT ScalarVT = SimpleVT.getScalarType(); - unsigned toTypeWidth = ScalarVT.getSizeInBits(); + unsigned ToTypeWidth = ScalarVT.getSizeInBits(); if (SimpleVT.isVector()) { assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) && 
"Unexpected vector type"); // v2x16 is stored using st.b32 - toTypeWidth = 32; + ToTypeWidth = 32; } - unsigned int toType = getLdStRegType(ScalarVT); + unsigned int ToType = getLdStRegType(ScalarVT); // Create the machine instruction DAG - SDValue Chain = ST->getChain(); SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal(); SDValue BasePtr = ST->getBasePtr(); SDValue Addr; @@ -1919,21 +1923,18 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { MVT::SimpleValueType SourceVT = Value.getNode()->getSimpleValueType(0).SimpleTy; + SmallVector<SDValue, 12> Ops({Value, getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(ToType, DL), + getI32Imm(ToTypeWidth, DL)}); + if (SelectDirectAddr(BasePtr, Addr)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar, NVPTX::ST_i32_avar, NVPTX::ST_i64_avar, NVPTX::ST_f32_avar, NVPTX::ST_f64_avar); if (!Opcode) return false; - SDValue Ops[] = {Value, - getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - Addr, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -1942,16 +1943,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_asi, NVPTX::ST_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {Value, - getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? 
SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -1966,17 +1958,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_ari, NVPTX::ST_f64_ari); if (!Opcode) return false; - - SDValue Ops[] = {Value, - getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) Opcode = @@ -1989,17 +1971,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_areg, NVPTX::ST_f64_areg); if (!Opcode) return false; - SDValue Ops[] = {Value, - getI32Imm(CodeMemorySem, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - BasePtr, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.append({BasePtr, Chain}); } + SDNode *NVPTXST = NVPTXST = + CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); + if (!NVPTXST) return false; @@ -2010,11 +1987,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { } bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { - SDValue Chain = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Addr, Offset, Base; std::optional<unsigned> Opcode; - SDLoc DL(N); SDNode *ST; EVT EltVT = Op1.getValueType(); MemSDNode *MemSD = cast<MemSDNode>(N); @@ -2029,8 +2004,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' @@ -2039,23 +2015,20 @@ bool 
NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned ToTypeWidth = ScalarVT.getSizeInBits(); unsigned ToType = getLdStRegType(ScalarVT); - SmallVector<SDValue, 12> StOps; + SmallVector<SDValue, 12> Ops; SDValue N2; unsigned VecType; switch (N->getOpcode()) { case NVPTXISD::StoreV2: VecType = NVPTX::PTXLdStInstCode::V2; - StOps.push_back(N->getOperand(1)); - StOps.push_back(N->getOperand(2)); + Ops.append({N->getOperand(1), N->getOperand(2)}); N2 = N->getOperand(3); break; case NVPTXISD::StoreV4: VecType = NVPTX::PTXLdStInstCode::V4; - StOps.push_back(N->getOperand(1)); - StOps.push_back(N->getOperand(2)); - StOps.push_back(N->getOperand(3)); - StOps.push_back(N->getOperand(4)); + Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(4)}); N2 = N->getOperand(5); break; default: @@ -2072,11 +2045,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - StOps.push_back(getI32Imm(CodeMemorySem, DL)); - StOps.push_back(getI32Imm(CodeAddrSpace, DL)); - StOps.push_back(getI32Imm(VecType, DL)); - StOps.push_back(getI32Imm(ToType, DL)); - StOps.push_back(getI32Imm(ToTypeWidth, DL)); + Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(ToType, DL), + getI32Imm(ToTypeWidth, DL)}); if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { @@ -2095,7 +2066,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { NVPTX::STV_f32_v4_avar, std::nullopt); break; } - StOps.push_back(Addr); + Ops.push_back(Addr); } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { switch (N->getOpcode()) { @@ -2114,8 +2085,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt); break; } - StOps.push_back(Base); - StOps.push_back(Offset); + Ops.append({Base, Offset}); } else if (PointerSize == 64 ? 
SelectADDRri64(N2.getNode(), N2, Base, Offset) : SelectADDRri(N2.getNode(), N2, Base, Offset)) { if (PointerSize == 64) { @@ -2154,8 +2124,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } } - StOps.push_back(Base); - StOps.push_back(Offset); + Ops.append({Base, Offset}); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -2194,15 +2163,15 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } } - StOps.push_back(N2); + Ops.push_back(N2); } if (!Opcode) return false; - StOps.push_back(Chain); + Ops.push_back(Chain); - ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps); + ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef}); @@ -2276,10 +2245,8 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { unsigned OffsetVal = Offset->getAsZExtVal(); - SmallVector<SDValue, 2> Ops; - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); - Ops.push_back(Glue); + SmallVector<SDValue, 2> Ops( + {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); return true; @@ -2312,8 +2279,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { SmallVector<SDValue, 6> Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 2)); - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); + Ops.append({CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain}); // Determine target opcode // If we have an i1, use an 8-bit store. 
The lowering code in @@ -2493,10 +2459,8 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 3)); - Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32)); - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); - Ops.push_back(Glue); + Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), + CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); // Determine target opcode // If we have an i1, use an 8-bit store. The lowering code in diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 49626d405148..eac405659951 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -99,6 +99,9 @@ private: bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N); + + NVPTX::Ordering insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, + MemSDNode *N); }; class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index a5bdc6fac3ca..6a096fa5acea 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3926,7 +3926,6 @@ def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, / def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1) Requires<[hasPTX<60>, hasSM<70>]>; - // If PTX<60 or SM<70, we fall back to MEMBAR: def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1) def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c81dfa68e4bd..887951b55fb3 100644 --- 
a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -335,6 +335,48 @@ def INT_FENCE_SC_CLUSTER: MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>, Requires<[hasPTX<78>, hasSM<90>]>; +// Proxy fence (uni-directional) +// fence.proxy.tensormap.release variants + +class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<string Scope, Intrinsic Intr> : + NVPTXInst<(outs), (ins), + "fence.proxy.tensormap::generic.release." # Scope # ";", [(Intr)]>, + Requires<[hasPTX<83>, hasSM<90>]>; + +def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CTA: + FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"cta", + int_nvvm_fence_proxy_tensormap_generic_release_cta>; +def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CLUSTER: + FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"cluster", + int_nvvm_fence_proxy_tensormap_generic_release_cluster>; +def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_GPU: + FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"gpu", + int_nvvm_fence_proxy_tensormap_generic_release_gpu>; +def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_SYS: + FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"sys", + int_nvvm_fence_proxy_tensormap_generic_release_sys>; + +// fence.proxy.tensormap.acquire variants + +class FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<string Scope, Intrinsic Intr> : + NVPTXInst<(outs), (ins Int64Regs:$addr), + "fence.proxy.tensormap::generic.acquire." 
# Scope # " [$addr], 128;", + [(Intr Int64Regs:$addr, (i32 128))]>, + Requires<[hasPTX<83>, hasSM<90>]>; + +def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_CTA : + FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"cta", + int_nvvm_fence_proxy_tensormap_generic_acquire_cta>; +def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_CLUSTER : + FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"cluster", + int_nvvm_fence_proxy_tensormap_generic_acquire_cluster>; +def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_GPU : + FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"gpu", + int_nvvm_fence_proxy_tensormap_generic_acquire_gpu>; +def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS : + FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"sys", + int_nvvm_fence_proxy_tensormap_generic_acquire_sys>; + //----------------------------------- // Async Copy Functions //----------------------------------- diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index c15ff6cae1f2..eebd91fefe4f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H +#include "NVPTX.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" @@ -82,6 +83,36 @@ inline unsigned promoteScalarArgumentSize(unsigned size) { bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM); bool Isv2x16VT(EVT VT); + +namespace NVPTX { + +inline std::string OrderingToCString(Ordering Order) { + switch (Order) { + case Ordering::NotAtomic: + return "NotAtomic"; + case Ordering::Relaxed: + return "Relaxed"; + case Ordering::Acquire: + return "Acquire"; + case Ordering::Release: + return "Release"; + // case Ordering::AcquireRelease: return "AcquireRelease"; + case Ordering::SequentiallyConsistent: + return "SequentiallyConsistent"; + case Ordering::Volatile: + return "Volatile"; + case Ordering::RelaxedMMIO: + return "RelaxedMMIO"; + } 
+ report_fatal_error("unknown ordering"); +} + +inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) { + O << OrderingToCString(Order); + return O; } +} // namespace NVPTX +} // namespace llvm + #endif diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 1963582ce686..a57ed33bda9c 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1007,7 +1007,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // R0 cannot be used as a base register, but it can be used as an // index in a store-indexed. int LastOffset = 0; - if (HasFP) { + if (HasFP) { // R0 += (FPOffset-LastOffset). // Need addic, since addi treats R0 as 0. BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), ScratchReg) @@ -2025,8 +2025,18 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // code. Same goes for the base pointer and the PIC base register. if (needsFP(MF)) SavedRegs.reset(isPPC64 ? PPC::X31 : PPC::R31); - if (RegInfo->hasBasePointer(MF)) + if (RegInfo->hasBasePointer(MF)) { SavedRegs.reset(RegInfo->getBaseRegister(MF)); + // On AIX, when BaseRegister(R30) is used, need to spill r31 too to match + // AIX traceback table requirement. + if (!needsFP(MF) && !SavedRegs.test(isPPC64 ? PPC::X31 : PPC::R31) && + Subtarget.isAIXABI()) { + assert( + (RegInfo->getBaseRegister(MF) == (isPPC64 ? PPC::X30 : PPC::R30)) && + "Invalid base register on AIX!"); + SavedRegs.set(isPPC64 ? 
PPC::X31 : PPC::R31); + } + } if (FI->usesPICBase()) SavedRegs.reset(PPC::R30); diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 8f5afbae01de..0177034a5ae0 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1014,12 +1014,14 @@ def POPCNTB8 : XForm_11<31, 122, (outs g8rc:$RA), (ins g8rc:$RST), [(set i64:$RA, (int_ppc_popcntb i64:$RST))]>; def CDTBCD8 : XForm_11<31, 282, (outs g8rc:$RA), (ins g8rc:$RST), - "cdtbcd $RA, $RST", IIC_IntGeneral, []>; + "cdtbcd $RA, $RST", IIC_IntGeneral, + [(set i64:$RA, (int_ppc_cdtbcdd i64:$RST))]>; def CBCDTD8 : XForm_11<31, 314, (outs g8rc:$RA), (ins g8rc:$RST), - "cbcdtd $RA, $RST", IIC_IntGeneral, []>; - + "cbcdtd $RA, $RST", IIC_IntGeneral, + [(set i64:$RA, (int_ppc_cbcdtdd i64:$RST))]>; def ADDG6S8 : XOForm_1<31, 74, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), - "addg6s $RT, $RA, $RB", IIC_IntGeneral, []>; + "addg6s $RT, $RA, $RB", IIC_IntGeneral, + [(set i64:$RT, (int_ppc_addg6sd i64:$RA, i64:$RB))]>; } defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 1686249c0f89..411ea77afc0d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1931,12 +1931,14 @@ def POPCNTB : XForm_11<31, 122, (outs gprc:$RA), (ins gprc:$RST), [(set i32:$RA, (int_ppc_popcntb i32:$RST))]>; def CDTBCD : XForm_11<31, 282, (outs gprc:$RA), (ins gprc:$RST), - "cdtbcd $RA, $RST", IIC_IntGeneral, []>; + "cdtbcd $RA, $RST", IIC_IntGeneral, + [(set i32:$RA, (int_ppc_cdtbcd i32:$RST))]>; def CBCDTD : XForm_11<31, 314, (outs gprc:$RA), (ins gprc:$RST), - "cbcdtd $RA, $RST", IIC_IntGeneral, []>; - + "cbcdtd $RA, $RST", IIC_IntGeneral, + [(set i32:$RA, (int_ppc_cbcdtd i32:$RST))]>; def ADDG6S : XOForm_1<31, 74, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), - "addg6s $RT, $RA, $RB", 
IIC_IntGeneral, []>; + "addg6s $RT, $RA, $RB", IIC_IntGeneral, + [(set i32:$RT, (int_ppc_addg6s i32:$RA, i32:$RB))]>; //===----------------------------------------------------------------------===// // PPC32 Load Instructions. diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index f28a7092e3ce..5146e519c352 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -36,6 +36,7 @@ add_llvm_target(RISCVCodeGen RISCVExpandPseudoInsts.cpp RISCVFrameLowering.cpp RISCVGatherScatterLowering.cpp + RISCVIndirectBranchTracking.cpp RISCVInsertVSETVLI.cpp RISCVInsertReadWriteCSR.cpp RISCVInsertWriteVXRM.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 0d2473c7c5de..80cb39529149 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -31,6 +31,9 @@ void initializeRISCVCodeGenPreparePass(PassRegistry &); FunctionPass *createRISCVDeadRegisterDefinitionsPass(); void initializeRISCVDeadRegisterDefinitionsPass(PassRegistry &); +FunctionPass *createRISCVIndirectBranchTrackingPass(); +void initializeRISCVIndirectBranchTrackingPass(PassRegistry &); + FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM, CodeGenOptLevel OptLevel); diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 43c04d417c7d..604234b24315 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1461,7 +1461,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { SDValue X = N0.getOperand(0); - if (isMask_64(C1)) { + // Prefer SRAIW + ANDI when possible. 
+ bool Skip = C2 > 32 && isInt<12>(N1C->getSExtValue()) && + X.getOpcode() == ISD::SHL && + isa<ConstantSDNode>(X.getOperand(1)) && + X.getConstantOperandVal(1) == 32; + if (isMask_64(C1) && !Skip) { unsigned Leading = XLen - llvm::bit_width(C1); if (C2 > Leading) { SDNode *SRAI = CurDAG->getMachineNode( diff --git a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp new file mode 100644 index 000000000000..1b484d486edc --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp @@ -0,0 +1,102 @@ +//===------ RISCVIndirectBranchTracking.cpp - Enables lpad mechanism ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The pass adds LPAD (AUIPC with rs1 = X0) machine instructions at the +// beginning of each basic block or function that is referenced by an indirect +// jump/call instruction. 
+// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVInstrInfo.h" +#include "RISCVSubtarget.h" +#include "RISCVTargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" + +using namespace llvm; + +static cl::opt<uint32_t> PreferredLandingPadLabel( + "riscv-landing-pad-label", cl::ReallyHidden, + cl::desc("Use preferred fixed label for all labels")); + +namespace { +class RISCVIndirectBranchTrackingPass : public MachineFunctionPass { +public: + RISCVIndirectBranchTrackingPass() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { + return "RISC-V Indirect Branch Tracking"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + static char ID; + const Align LpadAlign = Align(4); +}; + +} // end anonymous namespace + +char RISCVIndirectBranchTrackingPass::ID = 0; + +FunctionPass *llvm::createRISCVIndirectBranchTrackingPass() { + return new RISCVIndirectBranchTrackingPass(); +} + +static void emitLpad(MachineBasicBlock &MBB, const RISCVInstrInfo *TII, + uint32_t Label) { + auto I = MBB.begin(); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(RISCV::AUIPC), RISCV::X0) + .addImm(Label); +} + +bool RISCVIndirectBranchTrackingPass::runOnMachineFunction( + MachineFunction &MF) { + const auto &Subtarget = MF.getSubtarget<RISCVSubtarget>(); + const RISCVInstrInfo *TII = Subtarget.getInstrInfo(); + if (!Subtarget.hasStdExtZicfilp()) + return false; + + uint32_t FixedLabel = 0; + if (PreferredLandingPadLabel.getNumOccurrences() > 0) { + if (!isUInt<20>(PreferredLandingPadLabel)) + report_fatal_error("riscv-landing-pad-label=<val>, <val> needs to fit in " + "unsigned 20-bits"); + FixedLabel = PreferredLandingPadLabel; + } + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + if (&MBB == &MF.front()) { + Function &F = 
MF.getFunction(); + // When trap is taken, landing pad is not needed. + if (F.hasFnAttribute("interrupt")) + continue; + + if (F.hasAddressTaken() || !F.hasLocalLinkage()) { + emitLpad(MBB, TII, FixedLabel); + if (MF.getAlignment() < LpadAlign) + MF.setAlignment(LpadAlign); + Changed = true; + } + continue; + } + + if (MBB.hasAddressTaken()) { + emitLpad(MBB, TII, FixedLabel); + if (MBB.getAlignment() < LpadAlign) + MBB.setAlignment(LpadAlign); + Changed = true; + } + } + + return Changed; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index b5dbc055e03d..86b30e836473 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -371,6 +371,9 @@ defset list<VTypeInfo> AllVectors = { } } +defvar AllFloatVectorsExceptFP16 = !filter(vti, AllFloatVectors, !ne(vti.Scalar, f16)); +defvar AllFP16Vectors = !filter(vti, AllFloatVectors, !eq(vti.Scalar, f16)); + // This functor is used to obtain the int vector type that has the same SEW and // multiplier as the input parameter type class GetIntVTypeInfo<VTypeInfo vti> { @@ -7245,6 +7248,14 @@ foreach vti = AllFloatVectors in { vti.RegClass, vti.ScalarRegClass>; } +foreach vti = AllBFloatVectors in + let Predicates = [HasVInstructionsBF16Minimal] in + defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM", + vti.Vector, + vti.Vector, vti.Vector, vti.Mask, + vti.Log2SEW, vti.LMul, vti.RegClass, + vti.RegClass, vti.RegClass>; + foreach fvti = AllFloatVectors in { defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX); let Predicates = GetVTypePredicates<fvti>.Predicates in @@ -7445,14 +7456,22 @@ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16", eew=16, vtilist=AllIntegerVectors>; defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER", - AllFloatVectors, uimm5>; + AllFloatVectorsExceptFP16, uimm5>; +let Predicates = 
[HasVInstructionsF16Minimal] in + defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER", + AllFP16Vectors, uimm5>; +defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER", + AllBFloatVectors, uimm5>; defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16", eew=16, vtilist=AllFloatVectors>; //===----------------------------------------------------------------------===// // 16.5. Vector Compress Instruction //===----------------------------------------------------------------------===// defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>; -defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>; +defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectorsExceptFP16>; +let Predicates = [HasVInstructionsF16Minimal] in + defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFP16Vectors>; +defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllBFloatVectors>; // Include the non-intrinsic ISel patterns include "RISCVInstrInfoVVLPatterns.td" diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 83d27b35cf0d..cc40d6a2f986 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -239,6 +239,12 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, FeatureStdExtZbb], SiFiveX280TuneFeatures>; +defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll, + TuneConditionalCompressedMoveFusion, + TuneLUIADDIFusion, + TuneAUIPCADDIFusion, + FeaturePostRAScheduler]; + def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, [Feature64Bit, FeatureStdExtI, @@ -266,11 +272,26 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, FeatureStdExtZfhmin, FeatureUnalignedScalarMem, FeatureUnalignedVectorMem], - [TuneNoDefaultUnroll, - TuneConditionalCompressedMoveFusion, - 
TuneLUIADDIFusion, - TuneAUIPCADDIFusion, - FeaturePostRAScheduler]>; + SiFiveP400TuneFeatures>; + +def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model, + !listconcat(RVA22U64Features, + [FeatureStdExtV, + FeatureStdExtZifencei, + FeatureStdExtZihintntl, + FeatureStdExtZvl128b, + FeatureStdExtZvbb, + FeatureStdExtZvknc, + FeatureStdExtZvkng, + FeatureStdExtZvksc, + FeatureStdExtZvksg, + FeatureVendorXSiFivecdiscarddlone, + FeatureVendorXSiFivecflushdlone, + FeatureUnalignedScalarMem, + FeatureUnalignedVectorMem]), + !listconcat(SiFiveP400TuneFeatures, + [TuneNoSinkSplatOperands])>; + def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, [Feature64Bit, diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 21fbf47875e6..8b3770aeb5d1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -520,6 +520,7 @@ void RISCVPassConfig::addPreEmitPass2() { // ensuring return instruction is detected correctly. addPass(createRISCVPushPopOptimizationPass()); } + addPass(createRISCVIndirectBranchTrackingPass()); addPass(createRISCVExpandPseudoPass()); // Schedule the expansion of AMOs at the last possible moment, avoiding the diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index 53bac88df65f..04eada18ef0d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -186,10 +186,11 @@ struct Entry { /// Explore them. static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, const WebAssemblyExceptionInfo &WEI, - const MachineDominatorTree &MDT) { + MachineDominatorTree &MDT) { // Remember original layout ordering, so we can update terminators after // reordering to point to the original layout successor. 
MF.RenumberBlocks(); + MDT.updateBlockNumbers(); // Prepare for a topological sort: Record the number of predecessors each // block has, ignoring loop backedges. @@ -330,6 +331,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, } assert(Entries.empty() && "Active sort region list not finished"); MF.RenumberBlocks(); + MDT.updateBlockNumbers(); #ifndef NDEBUG SmallSetVector<const SortRegion *, 8> OnStack; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 70b91c266c49..c7001ef2b33e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -45,6 +45,8 @@ STATISTIC(NumCatchUnwindMismatches, "Number of catch unwind mismatches found"); namespace { class WebAssemblyCFGStackify final : public MachineFunctionPass { + MachineDominatorTree *MDT; + StringRef getPassName() const override { return "WebAssembly CFG Stackify"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -252,7 +254,6 @@ void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) { void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { assert(!MBB.isEHPad()); MachineFunction &MF = *MBB.getParent(); - auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); @@ -264,7 +265,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { int MBBNumber = MBB.getNumber(); for (MachineBasicBlock *Pred : MBB.predecessors()) { if (Pred->getNumber() < MBBNumber) { - Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred; + Header = Header ? 
MDT->findNearestCommonDominator(Header, Pred) : Pred; if (explicitlyBranchesTo(Pred, &MBB)) IsBranchedTo = true; } @@ -1439,6 +1440,7 @@ void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) { // Renumber BBs and recalculate ScopeTop info because new BBs might have been // created and inserted during fixing unwind mismatches. MF.RenumberBlocks(); + MDT->updateBlockNumbers(); ScopeTops.clear(); ScopeTops.resize(MF.getNumBlockIDs()); for (auto &MBB : reverse(MF)) { @@ -1741,6 +1743,7 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { "********** Function: " << MF.getName() << '\n'); const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); releaseMemory(); diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 6289b3a1df1f..831944cce3af 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -41,13 +41,6 @@ static cl::opt<bool> DisableX86DomainReassignment( namespace { enum RegDomain { NoDomain = -1, GPRDomain, MaskDomain, OtherDomain, NumDomains }; -static bool isGPR(const TargetRegisterClass *RC) { - return X86::GR64RegClass.hasSubClassEq(RC) || - X86::GR32RegClass.hasSubClassEq(RC) || - X86::GR16RegClass.hasSubClassEq(RC) || - X86::GR8RegClass.hasSubClassEq(RC); -} - static bool isMask(const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) { return X86::VK16RegClass.hasSubClassEq(RC); @@ -55,7 +48,7 @@ static bool isMask(const TargetRegisterClass *RC, static RegDomain getDomain(const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) { - if (isGPR(RC)) + if (TRI->isGeneralPurposeRegisterClass(RC)) return GPRDomain; if (isMask(RC, TRI)) return MaskDomain; @@ -797,7 +790,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { continue; // GPR only current source domain supported. 
- if (!isGPR(MRI->getRegClass(Reg))) + if (!MRI->getTargetRegisterInfo()->isGeneralPurposeRegisterClass( + MRI->getRegClass(Reg))) continue; // Register already in closure. diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index bdc9a0d29670..77dac1197f85 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -4227,3 +4227,323 @@ void X86FrameLowering::restoreWinEHStackPointersInParent( /*RestoreSP=*/IsSEH); } } + +// Compute the alignment gap between current SP after spilling FP/BP and the +// next properly aligned stack offset. +static int computeFPBPAlignmentGap(MachineFunction &MF, + const TargetRegisterClass *RC, + unsigned NumSpilledRegs) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + unsigned AllocSize = TRI->getSpillSize(*RC) * NumSpilledRegs; + Align StackAlign = MF.getSubtarget().getFrameLowering()->getStackAlign(); + unsigned AlignedSize = alignTo(AllocSize, StackAlign); + return AlignedSize - AllocSize; +} + +void X86FrameLowering::spillFPBPUsingSP(MachineFunction &MF, + MachineBasicBlock::iterator BeforeMI, + Register FP, Register BP, + int SPAdjust) const { + assert(FP.isValid() || BP.isValid()); + + MachineBasicBlock *MBB = BeforeMI->getParent(); + DebugLoc DL = BeforeMI->getDebugLoc(); + + // Spill FP. + if (FP.isValid()) { + BuildMI(*MBB, BeforeMI, DL, + TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>()))) + .addReg(FP); + } + + // Spill BP. + if (BP.isValid()) { + BuildMI(*MBB, BeforeMI, DL, + TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>()))) + .addReg(BP); + } + + // Make sure SP is aligned. + if (SPAdjust) + emitSPUpdate(*MBB, BeforeMI, DL, -SPAdjust, false); + + // Emit unwinding information. + if (FP.isValid() && needsDwarfCFI(MF)) { + // Emit .cfi_remember_state to remember old frame. 
+ unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createRememberState(nullptr)); + BuildMI(*MBB, BeforeMI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Setup new CFA value with DW_CFA_def_cfa_expression: + // DW_OP_breg7+offset, DW_OP_deref, DW_OP_consts 16, DW_OP_plus + SmallString<64> CfaExpr; + uint8_t buffer[16]; + int Offset = SPAdjust; + if (BP.isValid()) + Offset += TRI->getSpillSize(*TRI->getMinimalPhysRegClass(BP)); + // If BeforeMI is a frame setup instruction, we need to adjust the position + // and offset of the new cfi instruction. + if (TII.isFrameSetup(*BeforeMI)) { + Offset += alignTo(TII.getFrameSize(*BeforeMI), getStackAlign()); + BeforeMI = std::next(BeforeMI); + } + Register StackPtr = TRI->getStackRegister(); + if (STI.isTarget64BitILP32()) + StackPtr = Register(getX86SubSuperRegister(StackPtr, 64)); + unsigned DwarfStackPtr = TRI->getDwarfRegNum(StackPtr, true); + CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfStackPtr)); + CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer)); + CfaExpr.push_back(dwarf::DW_OP_deref); + CfaExpr.push_back(dwarf::DW_OP_consts); + CfaExpr.append(buffer, buffer + encodeSLEB128(SlotSize * 2, buffer)); + CfaExpr.push_back((uint8_t)dwarf::DW_OP_plus); + + SmallString<64> DefCfaExpr; + DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); + DefCfaExpr.append(buffer, buffer + encodeSLEB128(CfaExpr.size(), buffer)); + DefCfaExpr.append(CfaExpr.str()); + BuildCFI(*MBB, BeforeMI, DL, + MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str()), + MachineInstr::FrameSetup); + } +} + +void X86FrameLowering::restoreFPBPUsingSP(MachineFunction &MF, + MachineBasicBlock::iterator AfterMI, + Register FP, Register BP, + int SPAdjust) const { + assert(FP.isValid() || BP.isValid()); + + // Adjust SP so it points to spilled FP or BP. 
+ MachineBasicBlock *MBB = AfterMI->getParent(); + MachineBasicBlock::iterator Pos = std::next(AfterMI); + DebugLoc DL = AfterMI->getDebugLoc(); + if (SPAdjust) + emitSPUpdate(*MBB, Pos, DL, SPAdjust, false); + + // Restore BP. + if (BP.isValid()) { + BuildMI(*MBB, Pos, DL, + TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())), BP); + } + + // Restore FP. + if (FP.isValid()) { + BuildMI(*MBB, Pos, DL, + TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())), FP); + + // Emit unwinding information. + if (needsDwarfCFI(MF)) { + // Restore original frame with .cfi_restore_state. + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createRestoreState(nullptr)); + BuildMI(*MBB, Pos, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } +} + +void X86FrameLowering::saveAndRestoreFPBPUsingSP( + MachineFunction &MF, MachineBasicBlock::iterator BeforeMI, + MachineBasicBlock::iterator AfterMI, bool SpillFP, bool SpillBP) const { + assert(SpillFP || SpillBP); + + Register FP, BP; + const TargetRegisterClass *RC; + unsigned NumRegs = 0; + + if (SpillFP) { + FP = TRI->getFrameRegister(MF); + if (STI.isTarget64BitILP32()) + FP = Register(getX86SubSuperRegister(FP, 64)); + RC = TRI->getMinimalPhysRegClass(FP); + ++NumRegs; + } + if (SpillBP) { + BP = TRI->getBaseRegister(); + if (STI.isTarget64BitILP32()) + BP = Register(getX86SubSuperRegister(BP, 64)); + RC = TRI->getMinimalPhysRegClass(BP); + ++NumRegs; + } + int SPAdjust = computeFPBPAlignmentGap(MF, RC, NumRegs); + + spillFPBPUsingSP(MF, BeforeMI, FP, BP, SPAdjust); + restoreFPBPUsingSP(MF, AfterMI, FP, BP, SPAdjust); +} + +bool X86FrameLowering::skipSpillFPBP( + MachineFunction &MF, MachineBasicBlock::reverse_iterator &MI) const { + if (MI->getOpcode() == X86::LCMPXCHG16B_SAVE_RBX) { + // The pseudo instruction LCMPXCHG16B_SAVE_RBX is generated in the form + // SaveRbx = COPY RBX + // SaveRbx = LCMPXCHG16B_SAVE_RBX ..., SaveRbx, implicit-def rbx + // And later LCMPXCHG16B_SAVE_RBX is expanded 
to restore RBX from SaveRbx. + // We should skip this instruction sequence. + int FI; + unsigned Reg; + while (!(MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getReg() == X86::RBX) && + !((Reg = TII.isStoreToStackSlot(*MI, FI)) && Reg == X86::RBX)) + ++MI; + return true; + } + return false; +} + +static bool isFPBPAccess(const MachineInstr &MI, Register FP, Register BP, + const TargetRegisterInfo *TRI, bool &AccessFP, + bool &AccessBP) { + AccessFP = AccessBP = false; + if (FP) { + if (MI.findRegisterUseOperandIdx(FP, TRI, false) != -1 || + MI.findRegisterDefOperandIdx(FP, TRI, false, true) != -1) + AccessFP = true; + } + if (BP) { + if (MI.findRegisterUseOperandIdx(BP, TRI, false) != -1 || + MI.findRegisterDefOperandIdx(BP, TRI, false, true) != -1) + AccessBP = true; + } + return AccessFP || AccessBP; +} + +// Invoke instruction has been lowered to normal function call. We try to figure +// out if MI comes from Invoke. +// Do we have any better method? +static bool isInvoke(const MachineInstr &MI, bool InsideEHLabels) { + if (!MI.isCall()) + return false; + if (InsideEHLabels) + return true; + + const MachineBasicBlock *MBB = MI.getParent(); + if (!MBB->hasEHPadSuccessor()) + return false; + + // Check if there is another call instruction from MI to the end of MBB. + MachineBasicBlock::const_iterator MBBI = MI, ME = MBB->end(); + for (++MBBI; MBBI != ME; ++MBBI) + if (MBBI->isCall()) + return false; + return true; +} + +/// If a function uses base pointer and the base pointer is clobbered by inline +/// asm, RA doesn't detect this case, and after the inline asm, the base pointer +/// contains garbage value. +/// For example if a 32b x86 function uses base pointer esi, and esi is +/// clobbered by following inline asm +/// asm("rep movsb" : "+D"(ptr), "+S"(x), "+c"(c)::"memory"); +/// We need to save esi before the asm and restore it after the asm. 
+/// +/// The problem can also occur to frame pointer if there is a function call, and +/// the callee uses a different calling convention and clobbers the fp. +/// +/// Because normal frame objects (spill slots) are accessed through fp/bp +/// register, so we can't spill fp/bp to normal spill slots. +/// +/// FIXME: There are 2 possible enhancements: +/// 1. In many cases there are different physical registers not clobbered by +/// inline asm, we can use one of them as base pointer. Or use a virtual +/// register as base pointer and let RA allocate a physical register to it. +/// 2. If there is no other instructions access stack with fp/bp from the +/// inline asm to the epilog, and no cfi requirement for a correct fp, we can +/// skip the save and restore operations. +void X86FrameLowering::spillFPBP(MachineFunction &MF) const { + Register FP, BP; + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + if (TFI.hasFP(MF)) + FP = TRI->getFrameRegister(MF); + if (TRI->hasBasePointer(MF)) + BP = TRI->getBaseRegister(); + if (!FP && !BP) + return; + + for (MachineBasicBlock &MBB : MF) { + bool InsideEHLabels = false; + auto MI = MBB.rbegin(), ME = MBB.rend(); + auto TermMI = MBB.getFirstTerminator(); + if (TermMI != MBB.begin()) + MI = *(std::prev(TermMI)); + + while (MI != ME) { + // Skip frame setup/destroy instructions. + // Skip Invoke (call inside try block) instructions. + // Skip instructions handled by target. + if (MI->getFlag(MachineInstr::MIFlag::FrameSetup) || + MI->getFlag(MachineInstr::MIFlag::FrameDestroy) || + isInvoke(*MI, InsideEHLabels) || skipSpillFPBP(MF, MI)) { + ++MI; + continue; + } + + if (MI->getOpcode() == TargetOpcode::EH_LABEL) { + InsideEHLabels = !InsideEHLabels; + ++MI; + continue; + } + + bool AccessFP, AccessBP; + // Check if fp or bp is used in MI. 
+ if (!isFPBPAccess(*MI, FP, BP, TRI, AccessFP, AccessBP)) { + ++MI; + continue; + } + + // Look for the range [DefMI, KillMI] in which fp or bp is defined and + // used. + bool FPLive = false, BPLive = false; + bool SpillFP = false, SpillBP = false; + auto DefMI = MI, KillMI = MI; + do { + SpillFP |= AccessFP; + SpillBP |= AccessBP; + + // Maintain FPLive and BPLive. + if (FPLive && MI->findRegisterDefOperandIdx(FP, TRI, false, true) != -1) + FPLive = false; + if (FP && MI->findRegisterUseOperandIdx(FP, TRI, false) != -1) + FPLive = true; + if (BPLive && MI->findRegisterDefOperandIdx(BP, TRI, false, true) != -1) + BPLive = false; + if (BP && MI->findRegisterUseOperandIdx(BP, TRI, false) != -1) + BPLive = true; + + DefMI = MI++; + } while ((MI != ME) && + (FPLive || BPLive || + isFPBPAccess(*MI, FP, BP, TRI, AccessFP, AccessBP))); + + // Don't need to save/restore if FP is accessed through llvm.frameaddress. + if (FPLive && !SpillBP) + continue; + + // If the bp is clobbered by a call, we should save and restore outside of + // the frame setup instructions. + if (KillMI->isCall() && DefMI != ME) { + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + auto FrameSetup = std::next(DefMI); + // Look for frame setup instruction toward the start of the BB. + // If we reach another call instruction, it means no frame setup + // instruction for the current call instruction. + while (FrameSetup != ME && !TII.isFrameSetup(*FrameSetup) && + !FrameSetup->isCall()) + ++FrameSetup; + // If a frame setup instruction is found, we need to find out the + // corresponding frame destroy instruction. + if (FrameSetup != ME && TII.isFrameSetup(*FrameSetup)) { + while (!TII.isFrameInstr(*KillMI)) + --KillMI; + DefMI = FrameSetup; + MI = DefMI; + ++MI; + } + } + + // Call target function to spill and restore FP and BP registers. 
+ saveAndRestoreFPBPUsingSP(MF, &(*DefMI), &(*KillMI), SpillFP, SpillBP); + } + } +} diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 2dc9ecc6109d..e21f6ab3d16d 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -103,6 +103,8 @@ public: MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const override; + void spillFPBP(MachineFunction &MF) const override; + bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; @@ -267,6 +269,29 @@ private: void emitCatchRetReturnValue(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineInstr *CatchRet) const; + + /// Issue instructions to allocate stack space and spill frame pointer and/or + /// base pointer to stack using stack pointer register. + void spillFPBPUsingSP(MachineFunction &MF, + const MachineBasicBlock::iterator BeforeMI, Register FP, + Register BP, int SPAdjust) const; + + /// Issue instructions to restore frame pointer and/or base pointer from stack + /// using stack pointer register, and free stack space. + void restoreFPBPUsingSP(MachineFunction &MF, + const MachineBasicBlock::iterator AfterMI, + Register FP, Register BP, int SPAdjust) const; + + void saveAndRestoreFPBPUsingSP(MachineFunction &MF, + MachineBasicBlock::iterator BeforeMI, + MachineBasicBlock::iterator AfterMI, + bool SpillFP, bool SpillBP) const; + + // If MI uses fp/bp, but target can handle it, and doesn't want to be spilled + // again, this function should return true, and update MI so we will not check + // any instructions from related sequence. 
+ bool skipSpillFPBP(MachineFunction &MF, + MachineBasicBlock::reverse_iterator &MI) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fff65a1bd967..2891e21be1b2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34069,6 +34069,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VMINMAX_SAE) NODE_NAME_CASE(VMINMAXS) NODE_NAME_CASE(VMINMAXS_SAE) + NODE_NAME_CASE(CVTP2IBS) + NODE_NAME_CASE(CVTP2IUBS) + NODE_NAME_CASE(CVTP2IBS_RND) + NODE_NAME_CASE(CVTP2IUBS_RND) + NODE_NAME_CASE(CVTTP2IBS) + NODE_NAME_CASE(CVTTP2IUBS) + NODE_NAME_CASE(CVTTP2IBS_SAE) + NODE_NAME_CASE(CVTTP2IUBS_SAE) NODE_NAME_CASE(AESENC128KL) NODE_NAME_CASE(AESDEC128KL) NODE_NAME_CASE(AESENC256KL) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index b985f7529ea2..2e7538cb3c11 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -607,6 +607,15 @@ namespace llvm { VMINMAXS, VMINMAXS_SAE, + CVTP2IBS, + CVTP2IUBS, + CVTP2IBS_RND, + CVTP2IUBS_RND, + CVTTP2IBS, + CVTTP2IUBS, + CVTTP2IBS_SAE, + CVTTP2IUBS_SAE, + MPSADBW, // Compress and expand. 
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 8e4586f2002d..fe381b377826 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -451,3 +451,176 @@ defm VMINMAXSH : avx10_minmax_scalar<"vminmaxsh", v8f16x_info, X86vminmaxs, X86v AVX512PSIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<16, CD8VT1>, TA; defm VMINMAXSS : avx10_minmax_scalar<"vminmaxss", v4f32x_info, X86vminmaxs, X86vminmaxsSae>, AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>; + +//------------------------------------------------- +// AVX10 SATCVT instructions +//------------------------------------------------- + +multiclass avx10_sat_cvt_rmb<bits<8> Opc, string OpStr, X86FoldableSchedWrite sched, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo, + SDNode MaskNode> { + defm rr: AVX512_maskable<Opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpStr, "$src", "$src", + (DestInfo.VT (MaskNode (SrcInfo.VT SrcInfo.RC:$src)))>, + Sched<[sched]>; + defm rm: AVX512_maskable<Opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.MemOp:$src), OpStr, "$src", "$src", + (DestInfo.VT (MaskNode (SrcInfo.VT + (SrcInfo.LdFrag addr:$src))))>, + Sched<[sched.Folded, sched.ReadAfterFold]>; + defm rmb: AVX512_maskable<Opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), OpStr, + "${src}"#SrcInfo.BroadcastStr, "${src}"#SrcInfo.BroadcastStr, + (DestInfo.VT (MaskNode (SrcInfo.VT + (SrcInfo.BroadcastLdFrag addr:$src))))>, EVEX_B, + Sched<[sched.Folded, sched.ReadAfterFold]>; +} + +// Conversion with rounding control (RC) +multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo, + SDNode MaskNode> { + let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in + defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512, + (outs DestInfo.info512.RC:$dst), + (ins 
SrcInfo.info512.RC:$src, AVX512RC:$rc), + OpStr, "$rc, $src", "$src, $rc", + (DestInfo.info512.VT + (MaskNode (SrcInfo.info512.VT SrcInfo.info512.RC:$src), + (i32 timm:$rc)))>, + Sched<[sched.ZMM]>, EVEX, EVEX_RC, EVEX_B; + let Predicates = [HasAVX10_2], hasEVEX_U = 1 in { + defm Z256rrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info256, + (outs DestInfo.info256.RC:$dst), + (ins SrcInfo.info256.RC:$src, AVX512RC:$rc), + OpStr, "$rc, $src", "$src, $rc", + (DestInfo.info256.VT + (MaskNode (SrcInfo.info256.VT SrcInfo.info256.RC:$src), + (i32 timm:$rc)))>, + Sched<[sched.YMM]>, EVEX, EVEX_RC, EVEX_B; + } +} + +// Conversion with SAE +multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo, + SDNode Node> { + let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in + defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512, + (outs DestInfo.info512.RC:$dst), + (ins SrcInfo.info512.RC:$src), + OpStr, "{sae}, $src", "$src, {sae}", + (DestInfo.info512.VT + (Node (SrcInfo.info512.VT SrcInfo.info512.RC:$src)))>, + Sched<[sched.ZMM]>, EVEX, EVEX_B; + let Predicates = [HasAVX10_2], hasEVEX_U = 1 in { + defm Z256rrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info256, + (outs DestInfo.info256.RC:$dst), + (ins SrcInfo.info256.RC:$src), + OpStr, "{sae}, $src", "$src, {sae}", + (DestInfo.info256.VT + (Node (SrcInfo.info256.VT SrcInfo.info256.RC:$src)))>, + Sched<[sched.YMM]>, EVEX, EVEX_B; + } +} + +multiclass avx10_sat_cvt_base<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, + SDNode MaskNode, AVX512VLVectorVTInfo DestInfo, + AVX512VLVectorVTInfo SrcInfo> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM, + DestInfo.info512, SrcInfo.info512, + MaskNode>, + EVEX, EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm Z256 + : avx10_sat_cvt_rmb<Opc, OpStr, sched.YMM, + DestInfo.info256, SrcInfo.info256, + MaskNode>, + EVEX, EVEX_V256; + defm Z128 
+ : avx10_sat_cvt_rmb<Opc, OpStr, sched.XMM, + DestInfo.info128, SrcInfo.info128, + MaskNode>, + EVEX, EVEX_V128; + } +} + +defm VCVTNEBF162IBS : avx10_sat_cvt_base<0x69, "vcvtnebf162ibs", + SchedWriteVecIMul, X86vcvtp2ibs, + avx512vl_i16_info, avx512vl_bf16_info>, + AVX512XDIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>; +defm VCVTNEBF162IUBS : avx10_sat_cvt_base<0x6b, "vcvtnebf162iubs", + SchedWriteVecIMul, X86vcvtp2iubs, + avx512vl_i16_info, avx512vl_bf16_info>, + AVX512XDIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>; + +defm VCVTPH2IBS : avx10_sat_cvt_base<0x69, "vcvtph2ibs", SchedWriteVecIMul, + X86vcvtp2ibs, avx512vl_i16_info, + avx512vl_f16_info>, + avx10_sat_cvt_rc<0x69, "vcvtph2ibs", SchedWriteVecIMul, + avx512vl_i16_info, avx512vl_f16_info, + X86vcvtp2ibsRnd>, + AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>; +defm VCVTPH2IUBS : avx10_sat_cvt_base<0x6b, "vcvtph2iubs", SchedWriteVecIMul, + X86vcvtp2iubs, avx512vl_i16_info, + avx512vl_f16_info>, + avx10_sat_cvt_rc<0x6b, "vcvtph2iubs", SchedWriteVecIMul, + avx512vl_i16_info, avx512vl_f16_info, + X86vcvtp2iubsRnd>, + AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>; + +defm VCVTPS2IBS : avx10_sat_cvt_base<0x69, "vcvtps2ibs", SchedWriteVecIMul, + X86vcvtp2ibs, avx512vl_i32_info, + avx512vl_f32_info>, + avx10_sat_cvt_rc<0x69, "vcvtps2ibs", SchedWriteVecIMul, + avx512vl_i32_info, avx512vl_f32_info, + X86vcvtp2ibsRnd>, + AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>; +defm VCVTPS2IUBS : avx10_sat_cvt_base<0x6b, "vcvtps2iubs", SchedWriteVecIMul, + X86vcvtp2iubs, avx512vl_i32_info, + avx512vl_f32_info>, + avx10_sat_cvt_rc<0x6b, "vcvtps2iubs", SchedWriteVecIMul, + avx512vl_i32_info, avx512vl_f32_info, + X86vcvtp2iubsRnd>, + AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>; + +defm VCVTTNEBF162IBS : avx10_sat_cvt_base<0x68, "vcvttnebf162ibs", + SchedWriteVecIMul, X86vcvttp2ibs, + avx512vl_i16_info, avx512vl_bf16_info>, + AVX512XDIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>; +defm VCVTTNEBF162IUBS : avx10_sat_cvt_base<0x6a, "vcvttnebf162iubs", + 
SchedWriteVecIMul, X86vcvttp2iubs, + avx512vl_i16_info, avx512vl_bf16_info>, + AVX512XDIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>; + +defm VCVTTPH2IBS : avx10_sat_cvt_base<0x68, "vcvttph2ibs", SchedWriteVecIMul, + X86vcvttp2ibs, avx512vl_i16_info, + avx512vl_f16_info>, + avx10_sat_cvt_sae<0x68, "vcvttph2ibs", SchedWriteVecIMul, + avx512vl_i16_info, avx512vl_f16_info, + X86vcvttp2ibsSAE>, + AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>; +defm VCVTTPH2IUBS : avx10_sat_cvt_base<0x6a, "vcvttph2iubs", SchedWriteVecIMul, + X86vcvttp2iubs, avx512vl_i16_info, + avx512vl_f16_info>, + avx10_sat_cvt_sae<0x6a, "vcvttph2iubs", SchedWriteVecIMul, + avx512vl_i16_info, avx512vl_f16_info, + X86vcvttp2iubsSAE>, + AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>; + +defm VCVTTPS2IBS : avx10_sat_cvt_base<0x68, "vcvttps2ibs", SchedWriteVecIMul, + X86vcvttp2ibs, avx512vl_i32_info, + avx512vl_f32_info>, + avx10_sat_cvt_sae<0x68, "vcvttps2ibs", SchedWriteVecIMul, + avx512vl_i32_info, avx512vl_f32_info, + X86vcvttp2ibsSAE>, + AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>; +defm VCVTTPS2IUBS : avx10_sat_cvt_base<0x6a, "vcvttps2iubs", SchedWriteVecIMul, + X86vcvttp2iubs, avx512vl_i32_info, + avx512vl_f32_info>, + avx10_sat_cvt_sae<0x6a, "vcvttps2iubs", SchedWriteVecIMul, + avx512vl_i32_info, avx512vl_f32_info, + X86vcvttp2iubsSAE>, + AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 78c76cacbfef..6db1cf7c9ee1 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -833,6 +833,20 @@ def X86vpdpwuuds : SDNode<"X86ISD::VPDPWUUDS", SDTVnni>; def X86Vmpsadbw : SDNode<"X86ISD::MPSADBW", SDTX86PSADBW>; +// in place saturated cvt fp-to-int +def X86vcvtp2ibs : SDNode<"X86ISD::CVTP2IBS", SDTFloatToInt>; +def X86vcvtp2iubs : SDNode<"X86ISD::CVTP2IUBS", SDTFloatToInt>; + +def X86vcvtp2ibsRnd : SDNode<"X86ISD::CVTP2IBS_RND", SDTFloatToIntRnd>; +def 
X86vcvtp2iubsRnd : SDNode<"X86ISD::CVTP2IUBS_RND", SDTFloatToIntRnd>; + +// in place saturated cvtt fp-to-int staff +def X86vcvttp2ibs : SDNode<"X86ISD::CVTTP2IBS", SDTFloatToInt>; +def X86vcvttp2iubs : SDNode<"X86ISD::CVTTP2IUBS", SDTFloatToInt>; + +def X86vcvttp2ibsSAE : SDNode<"X86ISD::CVTTP2IBS_SAE", SDTFloatToInt>; +def X86vcvttp2iubsSAE : SDNode<"X86ISD::CVTTP2IUBS_SAE", SDTFloatToInt>; + //===----------------------------------------------------------------------===// // SSE pattern fragments //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 8387b76a40cd..208af630a352 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -313,7 +313,7 @@ def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; def v32f16_info : X86VectorVTInfo<32, f16, VR512, "ph">; -def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbf">; +def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbh">; def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; @@ -323,7 +323,7 @@ def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; def v16f16x_info : X86VectorVTInfo<16, f16, VR256X, "ph">; -def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbf">; +def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbh">; def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; @@ -332,7 +332,7 @@ def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; def v8f16x_info : X86VectorVTInfo<8, f16, VR128X, 
"ph">; -def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbf">; +def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbh">; def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 536391da295d..47be08c8af3e 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -408,6 +408,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), X86_INTRINSIC_DATA(avx10_mask_vcvtph2dq256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2ibs128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IBS, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2ibs256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IBS, X86ISD::CVTP2IBS_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2ibs512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IBS, X86ISD::CVTP2IBS_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2iubs128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2iubs256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IUBS, X86ISD::CVTP2IUBS_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtph2iubs512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IUBS, X86ISD::CVTP2IUBS_RND), X86_INTRINSIC_DATA(avx10_mask_vcvtph2pd256, INTR_TYPE_1OP_MASK_SAE, ISD::FP_EXTEND, X86ISD::VFPEXT_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvtph2psx256, INTR_TYPE_1OP_MASK_SAE, @@ -424,6 +436,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx10_mask_vcvtps2dq256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtps2ibs128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IBS, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtps2ibs256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IBS, X86ISD::CVTP2IBS_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtps2ibs512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IBS, 
X86ISD::CVTP2IBS_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtps2iubs128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtps2iubs256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IUBS, X86ISD::CVTP2IUBS_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtps2iubs512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2IUBS, X86ISD::CVTP2IUBS_RND), X86_INTRINSIC_DATA(avx10_mask_vcvtps2pd256, INTR_TYPE_1OP_MASK_SAE, ISD::FP_EXTEND, X86ISD::VFPEXT_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvtps2phx256, INTR_TYPE_1OP_MASK, @@ -444,6 +468,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttph2dq256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), + X86_INTRINSIC_DATA(avx10_mask_vcvttph2ibs128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2IBS, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvttph2ibs256, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2IBS, X86ISD::CVTTP2IBS_SAE), + X86_INTRINSIC_DATA(avx10_mask_vcvttph2ibs512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2IBS, X86ISD::CVTTP2IBS_SAE), + X86_INTRINSIC_DATA(avx10_mask_vcvttph2iubs128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvttph2iubs256, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2IUBS, X86ISD::CVTTP2IUBS_SAE), + X86_INTRINSIC_DATA(avx10_mask_vcvttph2iubs512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2IUBS, X86ISD::CVTTP2IUBS_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttph2qq256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttph2udq256, INTR_TYPE_1OP_MASK, @@ -456,6 +492,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttps2dq256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), + X86_INTRINSIC_DATA(avx10_mask_vcvttps2ibs128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2IBS, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvttps2ibs256, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2IBS, X86ISD::CVTTP2IBS_SAE), 
+ X86_INTRINSIC_DATA(avx10_mask_vcvttps2ibs512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2IBS, X86ISD::CVTTP2IBS_SAE), + X86_INTRINSIC_DATA(avx10_mask_vcvttps2iubs128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvttps2iubs256, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2IUBS, X86ISD::CVTTP2IUBS_SAE), + X86_INTRINSIC_DATA(avx10_mask_vcvttps2iubs512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2IUBS, X86ISD::CVTTP2IUBS_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttps2qq256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttps2udq256, INTR_TYPE_1OP_MASK, @@ -546,6 +594,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx10_vaddps256, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx10_vcvtnebf162ibs128, INTR_TYPE_1OP, X86ISD::CVTP2IBS, + 0), + X86_INTRINSIC_DATA(avx10_vcvtnebf162ibs256, INTR_TYPE_1OP, X86ISD::CVTP2IBS, + 0), + X86_INTRINSIC_DATA(avx10_vcvtnebf162ibs512, INTR_TYPE_1OP, X86ISD::CVTP2IBS, + 0), + X86_INTRINSIC_DATA(avx10_vcvtnebf162iubs128, INTR_TYPE_1OP, + X86ISD::CVTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_vcvtnebf162iubs256, INTR_TYPE_1OP, + X86ISD::CVTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_vcvtnebf162iubs512, INTR_TYPE_1OP, + X86ISD::CVTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_vcvttnebf162ibs128, INTR_TYPE_1OP, + X86ISD::CVTTP2IBS, 0), + X86_INTRINSIC_DATA(avx10_vcvttnebf162ibs256, INTR_TYPE_1OP, + X86ISD::CVTTP2IBS, 0), + X86_INTRINSIC_DATA(avx10_vcvttnebf162ibs512, INTR_TYPE_1OP, + X86ISD::CVTTP2IBS, 0), + X86_INTRINSIC_DATA(avx10_vcvttnebf162iubs128, INTR_TYPE_1OP, + X86ISD::CVTTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_vcvttnebf162iubs256, INTR_TYPE_1OP, + X86ISD::CVTTP2IUBS, 0), + X86_INTRINSIC_DATA(avx10_vcvttnebf162iubs512, INTR_TYPE_1OP, + X86ISD::CVTTP2IUBS, 0), X86_INTRINSIC_DATA(avx10_vdivpd256, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), X86_INTRINSIC_DATA(avx10_vdivph256, INTR_TYPE_2OP, ISD::FDIV, diff --git 
a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 555ede9e9540..3376367cc76b 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -45,6 +45,12 @@ static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); +static cl::opt<bool> + DisableRegAllocNDDHints("x86-disable-regalloc-hints-for-ndd", cl::Hidden, + cl::init(false), + cl::desc("Disable two address hints for register " + "allocation")); + X86RegisterInfo::X86RegisterInfo(const Triple &TT) : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP), X86_MC::getDwarfRegFlavour(TT, false), @@ -1080,10 +1086,57 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints( VirtReg, Order, Hints, MF, VRM, Matrix); + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); + const TargetRegisterInfo &TRI = *ST.getRegisterInfo(); unsigned ID = RC.getID(); - if (ID != X86::TILERegClassID) + + if (!VRM) + return BaseImplRetVal; + + if (ID != X86::TILERegClassID) { + if (DisableRegAllocNDDHints || !ST.hasNDD() || + !TRI.isGeneralPurposeRegisterClass(&RC)) + return BaseImplRetVal; + + // Add any two-address hints after any copy hints. + SmallSet<unsigned, 4> TwoAddrHints; + + auto TryAddNDDHint = [&](const MachineOperand &MO) { + Register Reg = MO.getReg(); + Register PhysReg = + Register::isPhysicalRegister(Reg) ? Reg : Register(VRM->getPhys(Reg)); + if (PhysReg && !MRI->isReserved(PhysReg) && !is_contained(Hints, PhysReg)) + TwoAddrHints.insert(PhysReg); + }; + + // NDD instructions are compressible when Op0 is allocated to the same
+ // physical register as Op1 (or Op2 if it's commutable). + for (auto &MO : MRI->reg_nodbg_operands(VirtReg)) { + const MachineInstr &MI = *MO.getParent(); + if (!X86::getNonNDVariant(MI.getOpcode())) + continue; + unsigned OpIdx = MI.getOperandNo(&MO); + if (OpIdx == 0) { + assert(MI.getOperand(1).isReg()); + TryAddNDDHint(MI.getOperand(1)); + if (MI.isCommutable()) { + assert(MI.getOperand(2).isReg()); + TryAddNDDHint(MI.getOperand(2)); + } + } else if (OpIdx == 1) { + TryAddNDDHint(MI.getOperand(0)); + } else if (MI.isCommutable() && OpIdx == 2) { + TryAddNDDHint(MI.getOperand(0)); + } + } + + for (MCPhysReg OrderReg : Order) + if (TwoAddrHints.count(OrderReg)) + Hints.push_back(OrderReg); + return BaseImplRetVal; + } ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI); auto AddHint = [&](MCPhysReg PhysReg) { |
