summaryrefslogtreecommitdiff
path: root/llvm/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp73
-rw-r--r--llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp33
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64Features.td9
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp204
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.h6
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp6
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td9
-rw-r--r--llvm/lib/Target/AArch64/AArch64MCInstLower.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp25
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h10
-rw-r--r--llvm/lib/Target/AArch64/AArch64Processors.td20
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td10
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td21
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp33
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp4
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp10
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp34
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp5
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp164
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td4
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp127
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp81
-rw-r--r--llvm/lib/Target/ARM/ARMConstantIslandPass.cpp10
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb2.td1
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp24
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp21
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp20
-rw-r--r--llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp3
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp26
-rw-r--r--llvm/lib/Target/NVPTX/NVPTX.h29
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp454
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td42
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.h31
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.cpp14
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstr64Bit.td10
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td10
-rw-r--r--llvm/lib/Target/RISCV/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/RISCV/RISCV.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp7
-rw-r--r--llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp102
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td23
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td31
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp7
-rw-r--r--llvm/lib/Target/X86/X86DomainReassignment.cpp12
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp320
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.h25
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp8
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h9
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX10.td173
-rw-r--r--llvm/lib/Target/X86/X86InstrFragmentsSIMD.td14
-rw-r--r--llvm/lib/Target/X86/X86InstrUtils.td6
-rw-r--r--llvm/lib/Target/X86/X86IntrinsicsInfo.h72
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp55
64 files changed, 1951 insertions, 573 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index b51c056e9d53..040a47f2f6d7 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -113,6 +113,8 @@ public:
void emitFunctionEntryLabel() override;
+ void emitXXStructor(const DataLayout &DL, const Constant *CV) override;
+
void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI);
void LowerHardenedBRJumpTable(const MachineInstr &MI);
@@ -1280,6 +1282,23 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
}
}
+void AArch64AsmPrinter::emitXXStructor(const DataLayout &DL,
+ const Constant *CV) {
+ if (const auto *CPA = dyn_cast<ConstantPtrAuth>(CV))
+ if (CPA->hasAddressDiscriminator() &&
+ !CPA->hasSpecialAddressDiscriminator(
+ ConstantPtrAuth::AddrDiscriminator_CtorsDtors))
+ report_fatal_error(
+ "unexpected address discrimination value for ctors/dtors entry, only "
+ "'ptr inttoptr (i64 1 to ptr)' is allowed");
+ // If we have signed pointers in xxstructors list, they'll be lowered to @AUTH
+ // MCExpr's via AArch64AsmPrinter::lowerConstantPtrAuth. It does not look at
+ // actual address discrimination value and only checks
+ // hasAddressDiscriminator(), so it's OK to leave special address
+ // discrimination value here.
+ AsmPrinter::emitXXStructor(DL, CV);
+}
+
void AArch64AsmPrinter::emitGlobalAlias(const Module &M,
const GlobalAlias &GA) {
if (auto F = dyn_cast_or_null<Function>(GA.getAliasee())) {
@@ -2142,6 +2161,10 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) {
};
const bool IsGOTLoad = MI.getOpcode() == AArch64::LOADgotPAC;
+ const bool IsELFSignedGOT = MI.getParent()
+ ->getParent()
+ ->getInfo<AArch64FunctionInfo>()
+ ->hasELFSignedGOT();
MachineOperand GAOp = MI.getOperand(0);
const uint64_t KeyC = MI.getOperand(1).getImm();
assert(KeyC <= AArch64PACKey::LAST &&
@@ -2158,9 +2181,16 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) {
// Emit:
// target materialization:
// - via GOT:
- // adrp x16, :got:target
- // ldr x16, [x16, :got_lo12:target]
- // add offset to x16 if offset != 0
+ // - unsigned GOT:
+ // adrp x16, :got:target
+ // ldr x16, [x16, :got_lo12:target]
+ // add offset to x16 if offset != 0
+ // - ELF signed GOT:
+ // adrp x17, :got:target
+ // add x17, x17, :got_auth_lo12:target
+ // ldr x16, [x17]
+ // aut{i|d}a x16, x17
+ // add offset to x16 if offset != 0
//
// - direct:
// adrp x16, target
@@ -2203,13 +2233,40 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) {
MCInstLowering.lowerOperand(GAMOLo, GAMCLo);
EmitAndIncrement(
- MCInstBuilder(AArch64::ADRP).addReg(AArch64::X16).addOperand(GAMCHi));
+ MCInstBuilder(AArch64::ADRP)
+ .addReg(IsGOTLoad && IsELFSignedGOT ? AArch64::X17 : AArch64::X16)
+ .addOperand(GAMCHi));
if (IsGOTLoad) {
- EmitAndIncrement(MCInstBuilder(AArch64::LDRXui)
- .addReg(AArch64::X16)
- .addReg(AArch64::X16)
- .addOperand(GAMCLo));
+ if (IsELFSignedGOT) {
+ EmitAndIncrement(MCInstBuilder(AArch64::ADDXri)
+ .addReg(AArch64::X17)
+ .addReg(AArch64::X17)
+ .addOperand(GAMCLo)
+ .addImm(0));
+
+ EmitAndIncrement(MCInstBuilder(AArch64::LDRXui)
+ .addReg(AArch64::X16)
+ .addReg(AArch64::X17)
+ .addImm(0));
+
+ assert(GAOp.isGlobal());
+ assert(GAOp.getGlobal()->getValueType() != nullptr);
+ unsigned AuthOpcode = GAOp.getGlobal()->getValueType()->isFunctionTy()
+ ? AArch64::AUTIA
+ : AArch64::AUTDA;
+
+ EmitAndIncrement(MCInstBuilder(AuthOpcode)
+ .addReg(AArch64::X16)
+ .addReg(AArch64::X16)
+ .addReg(AArch64::X17));
+
+ } else {
+ EmitAndIncrement(MCInstBuilder(AArch64::LDRXui)
+ .addReg(AArch64::X16)
+ .addReg(AArch64::X16)
+ .addOperand(GAMCLo));
+ }
} else {
EmitAndIncrement(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X16)
diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 2bc14f9821e6..161cf24dd403 100644
--- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -108,6 +108,10 @@ static bool atomicReadDroppedOnZero(unsigned Opcode) {
case AArch64::LDUMINW: case AArch64::LDUMINX:
case AArch64::LDUMINLB: case AArch64::LDUMINLH:
case AArch64::LDUMINLW: case AArch64::LDUMINLX:
+ case AArch64::SWPB: case AArch64::SWPH:
+ case AArch64::SWPW: case AArch64::SWPX:
+ case AArch64::SWPLB: case AArch64::SWPLH:
+ case AArch64::SWPLW: case AArch64::SWPLX:
return true;
}
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 9b7fc228d5de..72c767200b38 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1291,7 +1291,40 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::LOADgotAUTH: {
+ Register DstReg = MI.getOperand(0).getReg();
+ const MachineOperand &MO1 = MI.getOperand(1);
+
+ MachineOperand GAHiOp(MO1);
+ MachineOperand GALoOp(MO1);
+ GAHiOp.addTargetFlag(AArch64II::MO_PAGE);
+ GALoOp.addTargetFlag(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+ DebugLoc DL = MI.getDebugLoc();
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), AArch64::X16)
+ .add(GAHiOp);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X16)
+ .addReg(AArch64::X16)
+ .add(GALoOp)
+ .addImm(0);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui), DstReg)
+ .addReg(AArch64::X16)
+ .addImm(0);
+
+ assert(MO1.isGlobal());
+ assert(MO1.getGlobal()->getValueType() != nullptr);
+ unsigned AuthOpcode = MO1.getGlobal()->getValueType()->isFunctionTy()
+ ? AArch64::AUTIA
+ : AArch64::AUTDA;
+ BuildMI(MBB, MBBI, DL, TII->get(AuthOpcode), DstReg)
+ .addReg(DstReg)
+ .addReg(AArch64::X16);
+
+ MI.eraseFromParent();
+ return true;
+ }
case AArch64::LOADgot: {
MachineFunction *MF = MBB.getParent();
Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index cbf38f2c57a3..4487d34a936c 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -453,6 +453,9 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO())
return 0;
+ if (FuncInfo.MF->getInfo<AArch64FunctionInfo>()->hasELFSignedGOT())
+ return 0;
+
unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
EVT DestEVT = TLI.getValueType(DL, GV->getType(), true);
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index a1ae0873fc19..2aa74deb0e85 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -186,7 +186,7 @@ def FeatureJS : ExtensionWithMArch<"jsconv", "JS", "FEAT_JSCVT",
[FeatureFPARMv8]>;
def FeatureFPAC : Extension<"fpac", "FPAC", "FEAT_FPAC",
- "Enable v8.3-A Pointer Authentication Faulting enhancement">;
+ "Enable Armv8.3-A Pointer Authentication Faulting enhancement">;
def FeatureCCIDX : Extension<"ccidx", "CCIDX", "FEAT_CCIDX",
"Enable Armv8.3-A Extend of the CCSIDR number of sets">;
@@ -435,8 +435,11 @@ def FeatureMEC : Extension<"mec", "MEC", "FEAT_MEC",
def FeatureSVE2p1: ExtensionWithMArch<"sve2p1", "SVE2p1", "FEAT_SVE2p1",
"Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>;
-def FeatureB16B16 : ExtensionWithMArch<"b16b16", "B16B16", "FEAT_SVE_B16B16",
- "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions", [FeatureBF16]>;
+def FeatureB16B16 : ExtensionWithMArch<"b16b16", "B16B16", "FEAT_B16B16",
+ "Enable SME2.1 ZA-targeting non-widening BFloat16 to BFloat16 instructions", [FeatureBF16]>;
+
+def FeatureSVEB16B16: ExtensionWithMArch<"sve-b16b16", "SVEB16B16", "FEAT_SVE_B16B16",
+ "Enable SVE2.1 non-widening and SME2.1 Z-targeting non-widening BFloat16 to BFloat16 instructions">;
def FeatureSMEF16F16 : ExtensionWithMArch<"sme-f16f16", "SMEF16F16", "FEAT_SME_F16F16",
"Enable SME non-widening Float16 instructions", [FeatureSME2]>;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f28511c74dcd..bf0eb1461e55 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -240,6 +240,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -275,6 +276,10 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
// Stack hazard padding size. 0 = disabled.
static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
cl::init(0), cl::Hidden);
+// Stack hazard size for analysis remarks. StackHazardSize takes precedence.
+static cl::opt<unsigned>
+ StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0),
+ cl::Hidden);
// Whether to insert padding into non-streaming functions (for testing).
static cl::opt<bool>
StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
@@ -2616,9 +2621,16 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
const auto &MFI = MF.getFrameInfo();
int64_t ObjectOffset = MFI.getObjectOffset(FI);
+ StackOffset SVEStackSize = getSVEStackSize(MF);
+
+ // For VLA-area objects, just emit an offset at the end of the stack frame.
+ // Whilst not quite correct, these objects do live at the end of the frame and
+ // so it is more useful for analysis for the offset to reflect this.
+ if (MFI.isVariableSizedObjectIndex(FI)) {
+ return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize;
+ }
// This is correct in the absence of any SVE stack objects.
- StackOffset SVEStackSize = getSVEStackSize(MF);
if (!SVEStackSize)
return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea());
@@ -3529,13 +3541,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return true;
}
-// Return the FrameID for a Load/Store instruction by looking at the MMO.
-static std::optional<int> getLdStFrameID(const MachineInstr &MI,
- const MachineFrameInfo &MFI) {
- if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
- return std::nullopt;
-
- MachineMemOperand *MMO = *MI.memoperands_begin();
+// Return the FrameID for a MMO.
+static std::optional<int> getMMOFrameID(MachineMemOperand *MMO,
+ const MachineFrameInfo &MFI) {
auto *PSV =
dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
if (PSV)
@@ -3553,6 +3561,15 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI,
return std::nullopt;
}
+// Return the FrameID for a Load/Store instruction by looking at the first MMO.
+static std::optional<int> getLdStFrameID(const MachineInstr &MI,
+ const MachineFrameInfo &MFI) {
+ if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
+ return std::nullopt;
+
+ return getMMOFrameID(*MI.memoperands_begin(), MFI);
+}
+
// Check if a Hazard slot is needed for the current function, and if so create
// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
// which can be used to determine if any hazard padding is needed.
@@ -5030,3 +5047,174 @@ void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
MI->eraseFromParent();
}
}
+
+struct StackAccess {
+ enum AccessType {
+ NotAccessed = 0, // Stack object not accessed by load/store instructions.
+ GPR = 1 << 0, // A general purpose register.
+ PPR = 1 << 1, // A predicate register.
+ FPR = 1 << 2, // A floating point/Neon/SVE register.
+ };
+
+ int Idx;
+ StackOffset Offset;
+ int64_t Size;
+ unsigned AccessTypes;
+
+ StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {}
+
+ bool operator<(const StackAccess &Rhs) const {
+ return std::make_tuple(start(), Idx) <
+ std::make_tuple(Rhs.start(), Rhs.Idx);
+ }
+
+ bool isCPU() const {
+ // Predicate register load and store instructions execute on the CPU.
+ return AccessTypes & (AccessType::GPR | AccessType::PPR);
+ }
+ bool isSME() const { return AccessTypes & AccessType::FPR; }
+ bool isMixed() const { return isCPU() && isSME(); }
+
+ int64_t start() const { return Offset.getFixed() + Offset.getScalable(); }
+ int64_t end() const { return start() + Size; }
+
+ std::string getTypeString() const {
+ switch (AccessTypes) {
+ case AccessType::FPR:
+ return "FPR";
+ case AccessType::PPR:
+ return "PPR";
+ case AccessType::GPR:
+ return "GPR";
+ case AccessType::NotAccessed:
+ return "NA";
+ default:
+ return "Mixed";
+ }
+ }
+
+ void print(raw_ostream &OS) const {
+ OS << getTypeString() << " stack object at [SP"
+ << (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed();
+ if (Offset.getScalable())
+ OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable()
+ << " * vscale";
+ OS << "]";
+ }
+};
+
+static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) {
+ SA.print(OS);
+ return OS;
+}
+
+void AArch64FrameLowering::emitRemarks(
+ const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {
+
+ SMEAttrs Attrs(MF.getFunction());
+ if (Attrs.hasNonStreamingInterfaceAndBody())
+ return;
+
+ const uint64_t HazardSize =
+ (StackHazardSize) ? StackHazardSize : StackHazardRemarkSize;
+
+ if (HazardSize == 0)
+ return;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ // Bail if function has no stack objects.
+ if (!MFI.hasStackObjects())
+ return;
+
+ std::vector<StackAccess> StackAccesses(MFI.getNumObjects());
+
+ size_t NumFPLdSt = 0;
+ size_t NumNonFPLdSt = 0;
+
+ // Collect stack accesses via Load/Store instructions.
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
+ continue;
+ for (MachineMemOperand *MMO : MI.memoperands()) {
+ std::optional<int> FI = getMMOFrameID(MMO, MFI);
+ if (FI && !MFI.isDeadObjectIndex(*FI)) {
+ int FrameIdx = *FI;
+
+ size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects();
+ if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) {
+ StackAccesses[ArrIdx].Idx = FrameIdx;
+ StackAccesses[ArrIdx].Offset =
+ getFrameIndexReferenceFromSP(MF, FrameIdx);
+ StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx);
+ }
+
+ unsigned RegTy = StackAccess::AccessType::GPR;
+ if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
+ if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
+ RegTy = StackAccess::PPR;
+ else
+ RegTy = StackAccess::FPR;
+ } else if (AArch64InstrInfo::isFpOrNEON(MI)) {
+ RegTy = StackAccess::FPR;
+ }
+
+ StackAccesses[ArrIdx].AccessTypes |= RegTy;
+
+ if (RegTy == StackAccess::FPR)
+ ++NumFPLdSt;
+ else
+ ++NumNonFPLdSt;
+ }
+ }
+ }
+ }
+
+ if (NumFPLdSt == 0 || NumNonFPLdSt == 0)
+ return;
+
+ llvm::sort(StackAccesses);
+ StackAccesses.erase(llvm::remove_if(StackAccesses,
+ [](const StackAccess &S) {
+ return S.AccessTypes ==
+ StackAccess::NotAccessed;
+ }),
+ StackAccesses.end());
+
+ SmallVector<const StackAccess *> MixedObjects;
+ SmallVector<std::pair<const StackAccess *, const StackAccess *>> HazardPairs;
+
+ if (StackAccesses.front().isMixed())
+ MixedObjects.push_back(&StackAccesses.front());
+
+ for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end());
+ It != End; ++It) {
+ const auto &First = *It;
+ const auto &Second = *(It + 1);
+
+ if (Second.isMixed())
+ MixedObjects.push_back(&Second);
+
+ if ((First.isSME() && Second.isCPU()) ||
+ (First.isCPU() && Second.isSME())) {
+ uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end());
+ if (Distance < HazardSize)
+ HazardPairs.emplace_back(&First, &Second);
+ }
+ }
+
+ auto EmitRemark = [&](llvm::StringRef Str) {
+ ORE->emit([&]() {
+ auto R = MachineOptimizationRemarkAnalysis(
+ "sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front());
+ return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str;
+ });
+ };
+
+ for (const auto &P : HazardPairs)
+ EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str());
+
+ for (const auto *Obj : MixedObjects)
+ EmitRemark(
+ formatv("{0} accessed by both GP and FP instructions", *Obj).str());
+}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 0ebab1700e9c..c19731249620 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -13,8 +13,9 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
-#include "llvm/Support/TypeSize.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -178,6 +179,9 @@ private:
inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
int64_t NegProbeSize,
Register TargetReg) const;
+
+ void emitRemarks(const MachineFunction &MF,
+ MachineOptimizationRemarkEmitter *ORE) const override;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7704321a0fc3..94130736c398 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9225,6 +9225,11 @@ SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes instead of using a wrapper node.
+ if (DAG.getMachineFunction()
+ .getInfo<AArch64FunctionInfo>()
+ ->hasELFSignedGOT())
+ return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
+ 0);
return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
}
@@ -21769,6 +21774,7 @@ static SDValue performExtendCombine(SDNode *N,
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
+ N->getOperand(0).getValueType().is64BitVector() &&
(N->getOperand(0).getOpcode() == ISD::ABDU ||
N->getOperand(0).getOpcode() == ISD::ABDS)) {
SDNode *ABDNode = N->getOperand(0).getNode();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index e720c6b21a97..1e5c5e2657e6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -143,6 +143,8 @@ def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
"fuse-aes">;
def HasSVE : Predicate<"Subtarget->isSVEAvailable()">,
AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">;
+def HasSVEB16B16 : Predicate<"Subtarget->hasSVEB16B16()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVEB16B16), "sve-b16b16">;
def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">;
def HasSVE2p1 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">,
@@ -1872,7 +1874,7 @@ let Predicates = [HasPAuth] in {
Sched<[WriteI, ReadI]> {
let isReMaterializable = 1;
let isCodeGenOnly = 1;
- let Size = 40; // 12 fixed + 28 variable, for pointer offset, and discriminator
+ let Size = 48; // 12 fixed + 36 variable, for pointer offset, and discriminator
let Defs = [X16,X17];
}
@@ -1911,6 +1913,11 @@ let Predicates = [HasPAuth] in {
tcGPR64:$AddrDisc),
(AUTH_TCRETURN_BTI tcGPRx16x17:$dst, imm:$FPDiff, imm:$Key,
imm:$Disc, tcGPR64:$AddrDisc)>;
+
+ def LOADgotAUTH : Pseudo<(outs GPR64common:$dst), (ins i64imm:$addr), []>,
+ Sched<[WriteI, ReadI]> {
+ let Defs = [X16];
+ }
}
// v9.5-A pointer authentication extensions
diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 48672241f905..9f234b0f9170 100644
--- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64MCInstLower.h"
+#include "AArch64MachineFunctionInfo.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -185,9 +186,12 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
MCSymbol *Sym) const {
uint32_t RefFlags = 0;
- if (MO.getTargetFlags() & AArch64II::MO_GOT)
- RefFlags |= AArch64MCExpr::VK_GOT;
- else if (MO.getTargetFlags() & AArch64II::MO_TLS) {
+ if (MO.getTargetFlags() & AArch64II::MO_GOT) {
+ const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ RefFlags |= (MF->getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
+ ? AArch64MCExpr::VK_GOT_AUTH
+ : AArch64MCExpr::VK_GOT);
+ } else if (MO.getTargetFlags() & AArch64II::MO_TLS) {
TLSModel::Model Model;
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index e96c5a953ff2..a0f0a489816c 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -16,6 +16,7 @@
#include "AArch64MachineFunctionInfo.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
@@ -72,6 +73,29 @@ static bool ShouldSignWithBKey(const Function &F, const AArch64Subtarget &STI) {
return Key == "b_key";
}
+// Determine if we need to treat pointers in GOT as signed (as described in
+// https://github.com/ARM-software/abi-aa/blob/main/pauthabielf64/pauthabielf64.rst#appendix-signed-got)
+// based on PAuth core info encoded as "aarch64-elf-pauthabi-platform" and
+// "aarch64-elf-pauthabi-version" module flags. Currently, only
+// AARCH64_PAUTH_PLATFORM_LLVM_LINUX platform supports signed GOT with
+// AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_GOT bit in version value set.
+static bool hasELFSignedGOTHelper(const Function &F,
+ const AArch64Subtarget *STI) {
+ if (!Triple(STI->getTargetTriple()).isOSBinFormatELF())
+ return false;
+ const Module *M = F.getParent();
+ const auto *PAP = mdconst::extract_or_null<ConstantInt>(
+ M->getModuleFlag("aarch64-elf-pauthabi-platform"));
+ if (!PAP || PAP->getZExtValue() != ELF::AARCH64_PAUTH_PLATFORM_LLVM_LINUX)
+ return false;
+ const auto *PAV = mdconst::extract_or_null<ConstantInt>(
+ M->getModuleFlag("aarch64-elf-pauthabi-version"));
+ if (!PAV)
+ return false;
+ return PAV->getZExtValue() &
+ (1 << ELF::AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_GOT);
+}
+
AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
const AArch64Subtarget *STI) {
// If we already know that the function doesn't have a redzone, set
@@ -80,6 +104,7 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
HasRedZone = false;
std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F);
SignWithBKey = ShouldSignWithBKey(F, *STI);
+ HasELFSignedGOT = hasELFSignedGOTHelper(F, STI);
// TODO: skip functions that have no instrumented allocas for optimization
IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag);
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 72f110cebbdc..9ae458488343 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -177,6 +177,14 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// SignWithBKey modifies the default PAC-RET mode to signing with the B key.
bool SignWithBKey = false;
+ /// HasELFSignedGOT is true if the target binary format is ELF and the IR
+ /// module containing the corresponding function has the following flags:
+ /// - aarch64-elf-pauthabi-platform flag equal to
+ /// AARCH64_PAUTH_PLATFORM_LLVM_LINUX;
+ /// - aarch64-elf-pauthabi-version flag with
+ /// AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_GOT bit set.
+ bool HasELFSignedGOT = false;
+
/// SigningInstrOffset captures the offset of the PAC-RET signing instruction
/// within the prologue, so it can be re-used for authentication in the
/// epilogue when using PC as a second salt (FEAT_PAuth_LR)
@@ -509,6 +517,8 @@ public:
bool shouldSignWithBKey() const { return SignWithBKey; }
+ bool hasELFSignedGOT() const { return HasELFSignedGOT; }
+
MCSymbol *getSigningInstrLabel() const { return SignInstrLabel; }
void setSigningInstrLabel(MCSymbol *Label) { SignInstrLabel = Label; }
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 71384a23c49a..52b5c8a0903e 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -863,22 +863,25 @@ def ProcessorFeatures {
list<SubtargetFeature> AppleA15 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSHA3,
FeatureFullFP16, FeatureFP16FML,
- FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE,
- FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureRDM,
+ FeatureComplxNum, FeatureCRC, FeatureJS,
+ FeatureLSE, FeaturePAuth, FeatureFPAC,
+ FeatureRAS, FeatureRCPC, FeatureRDM,
FeatureBF16, FeatureDotProd, FeatureMatMulInt8];
list<SubtargetFeature> AppleA16 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSHA3,
FeatureFullFP16, FeatureFP16FML,
FeatureHCX,
- FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE,
- FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureRDM,
+ FeatureComplxNum, FeatureCRC, FeatureJS,
+ FeatureLSE, FeaturePAuth, FeatureFPAC,
+ FeatureRAS, FeatureRCPC, FeatureRDM,
FeatureBF16, FeatureDotProd, FeatureMatMulInt8];
list<SubtargetFeature> AppleA17 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSHA3,
FeatureFullFP16, FeatureFP16FML,
FeatureHCX,
- FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE,
- FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureRDM,
+ FeatureComplxNum, FeatureCRC, FeatureJS,
+ FeatureLSE, FeaturePAuth, FeatureFPAC,
+ FeatureRAS, FeatureRCPC, FeatureRDM,
FeatureBF16, FeatureDotProd, FeatureMatMulInt8];
list<SubtargetFeature> AppleM4 = [HasV9_2aOps, FeatureSHA2, FeatureFPARMv8,
FeatureNEON, FeaturePerfMon, FeatureSHA3,
@@ -886,8 +889,9 @@ def ProcessorFeatures {
FeatureAES, FeatureBF16,
FeatureSME, FeatureSME2,
FeatureSMEF64F64, FeatureSMEI16I64,
- FeatureComplxNum, FeatureCRC, FeatureJS, FeatureLSE,
- FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureRDM,
+ FeatureComplxNum, FeatureCRC, FeatureJS,
+ FeatureLSE, FeaturePAuth, FeatureFPAC,
+ FeatureRAS, FeatureRCPC, FeatureRDM,
FeatureDotProd, FeatureMatMulInt8];
list<SubtargetFeature> ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES,
FeaturePerfMon, FeatureNEON, FeatureFPARMv8];
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 709a98d3a8cb..22de9e1458b7 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -857,6 +857,7 @@ defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, nxv8f16, int_aarch
defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, nxv8f16, int_aarch64_sme_mops>;
}
+// SME2 ZA-targeting non-widening BFloat16 instructions
let Predicates = [HasSME2, HasB16B16] in {
defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_add_za16_vg1x2>;
defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_add_za16_vg1x4>;
@@ -877,6 +878,12 @@ defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg4_single<"bfmls", 0b1111101,
defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x2>;
defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x4>;
+defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, nxv8bf16, int_aarch64_sme_mopa>;
+defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, nxv8bf16, int_aarch64_sme_mops>;
+}
+
+// SME2 Z-targeting non-widening BFloat16 instructions
+let Predicates = [HasSME2, HasSVEB16B16] in {
defm BFMAX_VG2_2ZZ : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>;
defm BFMAX_VG4_4ZZ : sme2p1_bf_max_min_vector_vg4_single<"bfmax", 0b0010000>;
defm BFMAX_VG2_2Z2Z : sme2p1_bf_max_min_vector_vg2_multi<"bfmax", 0b0010000>;
@@ -899,9 +906,6 @@ defm BFMINNM_VG4_4Z2Z : sme2p1_bf_max_min_vector_vg4_multi<"bfminnm", 0b0010011
defm BFCLAMP_VG2_2ZZZ: sme2p1_bfclamp_vector_vg2_multi<"bfclamp">;
defm BFCLAMP_VG4_4ZZZ: sme2p1_bfclamp_vector_vg4_multi<"bfclamp">;
-
-defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, nxv8bf16, int_aarch64_sme_mopa>;
-defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, nxv8bf16, int_aarch64_sme_mops>;
}
let Predicates = [HasSME2, HasFP8] in {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 19c03011e07b..d9a70b5ef02f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -279,6 +279,10 @@ def AArch64fsub_m1 : PatFrags<(ops node:$pg, node:$op1, node:$op2), [
(int_aarch64_sve_fsub node:$pg, node:$op1, node:$op2),
(vselect node:$pg, (AArch64fsub_p (SVEAllActive), node:$op1, node:$op2), node:$op1)
]>;
+def AArch64fsubr_m1 : PatFrags<(ops node:$pg, node:$op1, node:$op2), [
+ (int_aarch64_sve_fsubr node:$pg, node:$op1, node:$op2),
+ (vselect node:$pg, (AArch64fsub_p (SVEAllActive), node:$op2, node:$op1), node:$op1)
+]>;
def AArch64shadd : PatFrags<(ops node:$pg, node:$op1, node:$op2),
[(int_aarch64_sve_shadd node:$pg, node:$op1, node:$op2),
@@ -423,6 +427,11 @@ def AArch64bic : PatFrags<(ops node:$op1, node:$op2),
def AArch64subr : PatFrag<(ops node:$op1, node:$op2),
(sub node:$op2, node:$op1)>;
+
+def AArch64subr_m1 : PatFrags<(ops node:$pg, node:$op1, node:$op2),
+ [(int_aarch64_sve_subr node:$pg, node:$op1, node:$op2),
+ (vselect node:$pg, (sub node:$op2, node:$op1), node:$op1)]>;
+
def AArch64mla_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3),
[(int_aarch64_sve_mla node:$pred, node:$op1, node:$op2, node:$op3),
(vselect node:$pred, (add node:$op1, (AArch64mul_p_oneuse (SVEAllActive), node:$op2, node:$op3)), node:$op1)]>;
@@ -529,7 +538,7 @@ let Predicates = [HasSVEorSME] in {
defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", AArch64add_m1, DestructiveBinaryComm>;
defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", AArch64sub_m1, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">;
- defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
+ defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", AArch64subr_m1, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", "ORR_ZPZZ", AArch64orr_m1, DestructiveBinaryComm>;
defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", "EOR_ZPZZ", AArch64eor_m1, DestructiveBinaryComm>;
@@ -685,7 +694,7 @@ let Predicates = [HasSVEorSME] in {
defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>;
defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>;
- defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
+ defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", AArch64fsubr_m1, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", AArch64fmaxnm_m1, DestructiveBinaryComm>;
defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", AArch64fminnm_m1, DestructiveBinaryComm>;
defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", AArch64fmax_m1, DestructiveBinaryComm>;
@@ -4101,7 +4110,7 @@ def : InstAlias<"pfalse\t$Pd", (PFALSE PPRorPNR8:$Pd), 0>;
// Non-widening BFloat16 to BFloat16 instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasSVE2orSME2, HasB16B16, UseExperimentalZeroingPseudos] in {
+let Predicates = [HasSVE2orSME2, HasSVEB16B16, UseExperimentalZeroingPseudos] in {
defm BFADD_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fadd>;
defm BFSUB_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fsub>;
defm BFMUL_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmul>;
@@ -4109,9 +4118,9 @@ defm BFMAXNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmaxnm>;
defm BFMINNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fminnm>;
defm BFMIN_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmin>;
defm BFMAX_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmax>;
-} // HasSVE2orSME2, HasB16B16, UseExperimentalZeroingPseudos
+} // HasSVE2orSME2, HasSVEB16B16, UseExperimentalZeroingPseudos
-let Predicates = [HasSVE2orSME2, HasB16B16] in {
+let Predicates = [HasSVE2orSME2, HasSVEB16B16] in {
defm BFMLA_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b00, "bfmla", "BFMLA_ZPZZZ", AArch64fmla_m1>;
defm BFMLS_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b01, "bfmls", "BFMLS_ZPZZZ", AArch64fmls_m1>;
@@ -4151,7 +4160,7 @@ defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>;
defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>;
defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64fclamp>;
-} // End HasSVE2orSME2, HasB16B16
+} // End HasSVE2orSME2, HasSVEB16B16
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 5e17ed40df8a..8a93b7fc4c89 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -875,6 +875,7 @@ public:
if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
ELFRefKind == AArch64MCExpr::VK_LO12 ||
ELFRefKind == AArch64MCExpr::VK_GOT_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 ||
ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
@@ -986,19 +987,20 @@ public:
int64_t Addend;
if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind,
DarwinRefKind, Addend)) {
- return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF
- || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF
- || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0)
- || ELFRefKind == AArch64MCExpr::VK_LO12
- || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12
- || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12
- || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC
- || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
- || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
- || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
- || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12
- || ELFRefKind == AArch64MCExpr::VK_SECREL_HI12
- || ELFRefKind == AArch64MCExpr::VK_SECREL_LO12;
+ return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+ DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF ||
+ (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) ||
+ ELFRefKind == AArch64MCExpr::VK_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+ ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+ ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
+ ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+ ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_HI12 ||
+ ELFRefKind == AArch64MCExpr::VK_SECREL_LO12;
}
// If it's a constant, it should be a real immediate in range.
@@ -3250,6 +3252,7 @@ ParseStatus AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
ELFRefKind != AArch64MCExpr::VK_ABS_PAGE_NC &&
ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
+ ELFRefKind != AArch64MCExpr::VK_GOT_AUTH_PAGE &&
ELFRefKind != AArch64MCExpr::VK_GOT_PAGE_LO15 &&
ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
@@ -3674,6 +3677,7 @@ static const struct Extension {
{"rcpc", {AArch64::FeatureRCPC}},
{"rng", {AArch64::FeatureRandGen}},
{"sve", {AArch64::FeatureSVE}},
+ {"sve-b16b16", {AArch64::FeatureSVEB16B16}},
{"sve2", {AArch64::FeatureSVE2}},
{"sve2-aes", {AArch64::FeatureSVE2AES}},
{"sve2-sm4", {AArch64::FeatureSVE2SM4}},
@@ -4334,6 +4338,8 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
.Case("got", AArch64MCExpr::VK_GOT_PAGE)
.Case("gotpage_lo15", AArch64MCExpr::VK_GOT_PAGE_LO15)
.Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
+ .Case("got_auth", AArch64MCExpr::VK_GOT_AUTH_PAGE)
+ .Case("got_auth_lo12", AArch64MCExpr::VK_GOT_AUTH_LO12)
.Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
.Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
.Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
@@ -5708,6 +5714,7 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
// Only allow these with ADDXri/ADDWri
if ((ELFRefKind == AArch64MCExpr::VK_LO12 ||
+ ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 ||
ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index e9e6b6cb68d0..ef8fcc074170 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2845,7 +2845,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
if (OpFlags & AArch64II::MO_GOT) {
- I.setDesc(TII.get(AArch64::LOADgot));
+ I.setDesc(TII.get(MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
+ ? AArch64::LOADgotAUTH
+ : AArch64::LOADgot));
I.getOperand(1).setTargetFlags(OpFlags);
} else if (TM.getCodeModel() == CodeModel::Large &&
!TM.isPositionIndependent()) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4a1977ba1a00..afea8bdec197 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -288,10 +288,16 @@ bool matchDupFromBuildVector(int Lane, MachineInstr &MI,
MachineRegisterInfo &MRI,
ShuffleVectorPseudo &MatchInfo) {
assert(Lane >= 0 && "Expected positive lane?");
+ int NumElements = MRI.getType(MI.getOperand(1).getReg()).getNumElements();
// Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the
// lane's definition directly.
- auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR,
- MI.getOperand(1).getReg(), MRI);
+ auto *BuildVecMI =
+ getOpcodeDef(TargetOpcode::G_BUILD_VECTOR,
+ MI.getOperand(Lane < NumElements ? 1 : 2).getReg(), MRI);
+ // If Lane >= NumElements then it is point to RHS, just check from RHS
+ if (NumElements <= Lane)
+ Lane -= NumElements;
+
if (!BuildVecMI)
return false;
Register Reg = BuildVecMI->getOperand(Lane + 1).getReg();
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index b4c5cde5fd88..72671b0715f6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -167,6 +167,15 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
}
if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC)
return R_CLS(ADR_GOT_PAGE);
+ if (SymLoc == AArch64MCExpr::VK_GOT_AUTH && !IsNC) {
+ if (IsILP32) {
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 ADRP AUTH relocation not supported "
+ "(LP64 eqv: AUTH_ADR_GOT_PAGE)");
+ return ELF::R_AARCH64_NONE;
+ }
+ return ELF::R_AARCH64_AUTH_ADR_GOT_PAGE;
+ }
if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
return R_CLS(TLSIE_ADR_GOTTPREL_PAGE21);
if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
@@ -237,6 +246,15 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(TLSLE_ADD_TPREL_LO12);
if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12)
return R_CLS(TLSDESC_ADD_LO12);
+ if (RefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 && IsNC) {
+ if (IsILP32) {
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 ADD AUTH relocation not supported "
+ "(LP64 eqv: AUTH_GOT_ADD_LO12_NC)");
+ return ELF::R_AARCH64_NONE;
+ }
+ return ELF::R_AARCH64_AUTH_GOT_ADD_LO12_NC;
+ }
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(ADD_ABS_LO12_NC);
@@ -329,17 +347,23 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
case AArch64::fixup_aarch64_ldst_imm12_scale8:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(LDST64_ABS_LO12_NC);
- if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) {
+ if ((SymLoc == AArch64MCExpr::VK_GOT ||
+ SymLoc == AArch64MCExpr::VK_GOT_AUTH) &&
+ IsNC) {
AArch64MCExpr::VariantKind AddressLoc =
AArch64MCExpr::getAddressFrag(RefKind);
+ bool IsAuth = (SymLoc == AArch64MCExpr::VK_GOT_AUTH);
if (!IsILP32) {
if (AddressLoc == AArch64MCExpr::VK_LO15)
return ELF::R_AARCH64_LD64_GOTPAGE_LO15;
- return ELF::R_AARCH64_LD64_GOT_LO12_NC;
+ return (IsAuth ? ELF::R_AARCH64_AUTH_LD64_GOT_LO12_NC
+ : ELF::R_AARCH64_LD64_GOT_LO12_NC);
}
- Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
- "relocation not supported (LP64 eqv: "
- "LD64_GOT_LO12_NC)");
+ Ctx.reportError(Fixup.getLoc(),
+ Twine("ILP32 64-bit load/store "
+ "relocation not supported (LP64 eqv: ") +
+ (IsAuth ? "AUTH_GOT_LO12_NC" : "LD64_GOT_LO12_NC") +
+ Twine(')'));
return ELF::R_AARCH64_NONE;
}
if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index fb8eb9f47da1..3430b9002894 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -30,6 +30,7 @@ const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, VariantKind Kind,
}
StringRef AArch64MCExpr::getVariantKindName() const {
+ // clang-format off
switch (static_cast<uint32_t>(getKind())) {
case VK_CALL: return "";
case VK_LO12: return ":lo12:";
@@ -82,9 +83,13 @@ StringRef AArch64MCExpr::getVariantKindName() const {
case VK_TLSDESC_PAGE: return ":tlsdesc:";
case VK_SECREL_LO12: return ":secrel_lo12:";
case VK_SECREL_HI12: return ":secrel_hi12:";
+ case VK_GOT_AUTH: return ":got_auth:";
+ case VK_GOT_AUTH_PAGE: return ":got_auth:";
+ case VK_GOT_AUTH_LO12: return ":got_auth_lo12:";
default:
llvm_unreachable("Invalid ELF symbol kind");
}
+ // clang-format on
}
void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index cf3a90f95a2c..699992782f67 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -24,6 +24,7 @@ namespace llvm {
class AArch64MCExpr : public MCTargetExpr {
public:
enum VariantKind {
+ // clang-format off
// Symbol locations specifying (roughly speaking) what calculation should be
// performed to construct the final address for the relocated
// symbol. E.g. direct, via the GOT, ...
@@ -38,6 +39,7 @@ public:
VK_SECREL = 0x009,
VK_AUTH = 0x00a,
VK_AUTHADDR = 0x00b,
+ VK_GOT_AUTH = 0x00c,
VK_SymLocBits = 0x00f,
// Variants specifying which part of the final address calculation is
@@ -88,6 +90,8 @@ public:
VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC,
VK_GOT_PAGE = VK_GOT | VK_PAGE,
VK_GOT_PAGE_LO15 = VK_GOT | VK_LO15 | VK_NC,
+ VK_GOT_AUTH_LO12 = VK_GOT_AUTH | VK_PAGEOFF | VK_NC,
+ VK_GOT_AUTH_PAGE = VK_GOT_AUTH | VK_PAGE,
VK_DTPREL_G2 = VK_DTPREL | VK_G2,
VK_DTPREL_G1 = VK_DTPREL | VK_G1,
VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC,
@@ -114,6 +118,7 @@ public:
VK_SECREL_HI12 = VK_SECREL | VK_HI12,
VK_INVALID = 0xfff
+ // clang-format on
};
private:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index de1f3421cce4..39c52140dfbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1038,7 +1038,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
- &AAUnderlyingObjects::ID});
+ &AAUnderlyingObjects::ID, &AAAddressSpace::ID});
AttributorConfig AC(CGUpdater);
AC.Allowed = &Allowed;
@@ -1064,6 +1064,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
} else if (CC == CallingConv::AMDGPU_KERNEL) {
addPreloadKernArgHint(F, TM);
}
+
+ for (auto &I : instructions(F)) {
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ A.getOrCreateAAFor<AAAddressSpace>(
+ IRPosition::value(*LI->getPointerOperand()));
+ }
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ A.getOrCreateAAFor<AAAddressSpace>(
+ IRPosition::value(*SI->getPointerOperand()));
+ }
+ }
}
ChangeStatus Change = A.run();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 73f3921b2ff4..f78699f88de5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
MachineInstrBuilder SelectedMI;
MachineOperand &LHS = I.getOperand(2);
MachineOperand &RHS = I.getOperand(3);
- auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
- auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
+ auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
+ auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
Register Src0Reg =
copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
Register Src1Reg =
@@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return false;
}
+static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
+ return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
+}
+
+static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
+ Register BitcastSrc;
+ if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
+ Reg = BitcastSrc;
+ return Reg;
+}
+
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
Register &Out) {
+ Register Trunc;
+ if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
+ return false;
+
Register LShlSrc;
- if (mi_match(In, MRI,
- m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
- Out = LShlSrc;
+ Register Cst;
+ if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
+ Cst = stripCopy(Cst, MRI);
+ if (mi_match(Cst, MRI, m_SpecificICst(16))) {
+ Out = stripBitCast(LShlSrc, MRI);
+ return true;
+ }
+ }
+
+ MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
+ if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
+ return false;
+
+ assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
+ LLT::fixed_vector(2, 16));
+
+ ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
+ assert(Mask.size() == 2);
+
+ if (Mask[0] == 1 && Mask[1] <= 1) {
+ Out = Shuffle->getOperand(0).getReg();
return true;
}
+
return false;
}
@@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
- bool IsCanonicalizing,
- bool AllowAbs, bool OpSel) const {
- Register Src = Root.getReg();
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+ Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
unsigned Mods = 0;
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
@@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false);
@@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3675,7 +3706,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
+ std::tie(Src, Mods) =
+ selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
- /*AllowAbs=*/false);
+ std::tie(Src, Mods) =
+ selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
// FIXME: Handle op_sel
return {{
@@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/false);
@@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/true);
@@ -5229,59 +5262,6 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
-// Variant of stripBitCast that returns the instruction instead of a
-// MachineOperand.
-static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
- if (MI->getOpcode() == AMDGPU::G_BITCAST)
- return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
- return MI;
-}
-
-// Figure out if this is really an extract of the high 16-bits of a dword,
-// returns nullptr if it isn't.
-static MachineInstr *isExtractHiElt(MachineInstr *Inst,
- MachineRegisterInfo &MRI) {
- Inst = stripBitCast(Inst, MRI);
-
- if (Inst->getOpcode() != AMDGPU::G_TRUNC)
- return nullptr;
-
- MachineInstr *TruncOp =
- getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
- TruncOp = stripBitCast(TruncOp, MRI);
-
- // G_LSHR x, (G_CONSTANT i32 16)
- if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
- auto SrlAmount = getIConstantVRegValWithLookThrough(
- TruncOp->getOperand(2).getReg(), MRI);
- if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
- MachineInstr *SrlOp =
- getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
- return stripBitCast(SrlOp, MRI);
- }
- }
-
- // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
- // 1, 0 swaps the low/high 16 bits.
- // 1, 1 sets the high 16 bits to be the same as the low 16.
- // in any case, it selects the high elts.
- if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
- assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
- LLT::fixed_vector(2, 16));
-
- ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
- assert(Mask.size() == 2);
-
- if (Mask[0] == 1 && Mask[1] <= 1) {
- MachineInstr *LHS =
- getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
- return stripBitCast(LHS, MRI);
- }
- }
-
- return nullptr;
-}
-
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
@@ -5289,37 +5269,34 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
-
- MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
- if (MI->getOpcode() == AMDGPU::G_FPEXT) {
- MachineOperand *MO = &MI->getOperand(1);
- Src = MO->getReg();
- MI = getDefIgnoringCopies(Src, *MRI);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
assert(MRI->getType(Src) == LLT::scalar(16));
- // See through bitcasts.
- // FIXME: Would be nice to use stripBitCast here.
- if (MI->getOpcode() == AMDGPU::G_BITCAST) {
- MO = &MI->getOperand(1);
- Src = MO->getReg();
- MI = getDefIgnoringCopies(Src, *MRI);
- }
+ // Only change Src if src modifier could be gained. In such cases new Src
+ // could be sgpr but this does not violate constant bus restriction for
+ // instruction that is being selected.
+ // Note: Src is not changed when there is only a simple sgpr to vgpr copy
+ // since this could violate constant bus restriction.
+ Register PeekSrc = stripCopy(Src, *MRI);
const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
- std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
- MI = getDefIgnoringCopies(Src, *MRI);
+ std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
- if ((ModsTmp & SISrcMods::NEG) != 0)
+ if ((ModsTmp & SISrcMods::NEG) != 0) {
Mods ^= SISrcMods::NEG;
+ Src = PeekSrc;
+ }
- if ((ModsTmp & SISrcMods::ABS) != 0)
+ if ((ModsTmp & SISrcMods::ABS) != 0) {
Mods |= SISrcMods::ABS;
+ Src = PeekSrc;
+ }
}
};
@@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Mods |= SISrcMods::OP_SEL_1;
- if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
+ if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
+ Src = PeekSrc;
Mods |= SISrcMods::OP_SEL_0;
- MI = ExtractHiEltMI;
- MO = &MI->getOperand(0);
- Src = MO->getReg();
-
CheckAbsNeg();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 7fff7d2685e7..69806b240cf2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -150,7 +150,7 @@ private:
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;
- std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
+ std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
bool AllowAbs = true,
bool OpSel = false) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 9a6ba5ac6808..17067ddd93ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3739,17 +3739,28 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
// The default logic bothers to analyze impossible alternative mappings. We
// want the most straightforward mapping, so just directly handle this.
- const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
- *TRI);
- const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
- *TRI);
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
assert(SrcBank && "src bank should have been assigned already");
+
+ // For COPY between a physical reg and an s1, there is no type associated so
+ // we need to take the virtual register's type as a hint on how to interpret
+ // s1 values.
+ if (!SrcReg.isVirtual() && !DstBank &&
+ MRI.getType(DstReg) == LLT::scalar(1))
+ DstBank = &AMDGPU::VCCRegBank;
+ else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1))
+ DstBank = &AMDGPU::VCCRegBank;
+
if (!DstBank)
DstBank = SrcBank;
- unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ unsigned Size = getSizeInBits(DstReg, MRI, *TRI);
if (MI.getOpcode() != AMDGPU::G_FREEZE &&
cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
return getInvalidInstructionMapping();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 80ca30829032..5c4d2b8d030e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -381,3 +381,7 @@ def : AlwaysUniform<int_amdgcn_if_break>;
def : AlwaysUniform<int_amdgcn_workgroup_id_x>;
def : AlwaysUniform<int_amdgcn_workgroup_id_y>;
def : AlwaysUniform<int_amdgcn_workgroup_id_z>;
+def : AlwaysUniform<int_amdgcn_s_getpc>;
+def : AlwaysUniform<int_amdgcn_s_getreg>;
+def : AlwaysUniform<int_amdgcn_s_memrealtime>;
+def : AlwaysUniform<int_amdgcn_s_memtime>;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 26a839a95df9..c8b594ffbc64 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1618,6 +1618,14 @@ public:
ParseStatus parseTH(OperandVector &Operands, int64_t &TH);
ParseStatus parseStringWithPrefix(StringRef Prefix, StringRef &Value,
SMLoc &StringLoc);
+ ParseStatus parseStringOrIntWithPrefix(OperandVector &Operands,
+ StringRef Name,
+ ArrayRef<const char *> Ids,
+ int64_t &IntVal);
+ ParseStatus parseStringOrIntWithPrefix(OperandVector &Operands,
+ StringRef Name,
+ ArrayRef<const char *> Ids,
+ AMDGPUOperand::ImmTy Type);
bool isModifier();
bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
@@ -6633,27 +6641,17 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
ParseStatus AMDGPUAsmParser::parseScope(OperandVector &Operands,
int64_t &Scope) {
- Scope = AMDGPU::CPol::SCOPE_CU; // default;
+ static const unsigned Scopes[] = {CPol::SCOPE_CU, CPol::SCOPE_SE,
+ CPol::SCOPE_DEV, CPol::SCOPE_SYS};
- StringRef Value;
- SMLoc StringLoc;
- ParseStatus Res;
-
- Res = parseStringWithPrefix("scope", Value, StringLoc);
- if (!Res.isSuccess())
- return Res;
-
- Scope = StringSwitch<int64_t>(Value)
- .Case("SCOPE_CU", AMDGPU::CPol::SCOPE_CU)
- .Case("SCOPE_SE", AMDGPU::CPol::SCOPE_SE)
- .Case("SCOPE_DEV", AMDGPU::CPol::SCOPE_DEV)
- .Case("SCOPE_SYS", AMDGPU::CPol::SCOPE_SYS)
- .Default(0xffffffff);
+ ParseStatus Res = parseStringOrIntWithPrefix(
+ Operands, "scope", {"SCOPE_CU", "SCOPE_SE", "SCOPE_DEV", "SCOPE_SYS"},
+ Scope);
- if (Scope == 0xffffffff)
- return Error(StringLoc, "invalid scope value");
+ if (Res.isSuccess())
+ Scope = Scopes[Scope];
- return ParseStatus::Success;
+ return Res;
}
ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) {
@@ -6742,6 +6740,44 @@ ParseStatus AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix,
: ParseStatus::Failure;
}
+ParseStatus AMDGPUAsmParser::parseStringOrIntWithPrefix(
+ OperandVector &Operands, StringRef Name, ArrayRef<const char *> Ids,
+ int64_t &IntVal) {
+ if (!trySkipId(Name, AsmToken::Colon))
+ return ParseStatus::NoMatch;
+
+ SMLoc StringLoc = getLoc();
+
+ StringRef Value;
+ if (isToken(AsmToken::Identifier)) {
+ Value = getTokenStr();
+ lex();
+
+ for (IntVal = 0; IntVal < (int64_t)Ids.size(); ++IntVal)
+ if (Value == Ids[IntVal])
+ break;
+ } else if (!parseExpr(IntVal))
+ return ParseStatus::Failure;
+
+ if (IntVal < 0 || IntVal >= (int64_t)Ids.size())
+ return Error(StringLoc, "invalid " + Twine(Name) + " value");
+
+ return ParseStatus::Success;
+}
+
+ParseStatus AMDGPUAsmParser::parseStringOrIntWithPrefix(
+ OperandVector &Operands, StringRef Name, ArrayRef<const char *> Ids,
+ AMDGPUOperand::ImmTy Type) {
+ SMLoc S = getLoc();
+ int64_t IntVal;
+
+ ParseStatus Res = parseStringOrIntWithPrefix(Operands, Name, Ids, IntVal);
+ if (Res.isSuccess())
+ Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S, Type));
+
+ return Res;
+}
+
//===----------------------------------------------------------------------===//
// MTBUF format
//===----------------------------------------------------------------------===//
@@ -9396,57 +9432,16 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I
ParseStatus AMDGPUAsmParser::parseSDWASel(OperandVector &Operands,
StringRef Prefix,
AMDGPUOperand::ImmTy Type) {
- using namespace llvm::AMDGPU::SDWA;
-
- SMLoc S = getLoc();
- StringRef Value;
-
- SMLoc StringLoc;
- ParseStatus Res = parseStringWithPrefix(Prefix, Value, StringLoc);
- if (!Res.isSuccess())
- return Res;
-
- int64_t Int;
- Int = StringSwitch<int64_t>(Value)
- .Case("BYTE_0", SdwaSel::BYTE_0)
- .Case("BYTE_1", SdwaSel::BYTE_1)
- .Case("BYTE_2", SdwaSel::BYTE_2)
- .Case("BYTE_3", SdwaSel::BYTE_3)
- .Case("WORD_0", SdwaSel::WORD_0)
- .Case("WORD_1", SdwaSel::WORD_1)
- .Case("DWORD", SdwaSel::DWORD)
- .Default(0xffffffff);
-
- if (Int == 0xffffffff)
- return Error(StringLoc, "invalid " + Twine(Prefix) + " value");
-
- Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, Type));
- return ParseStatus::Success;
+ return parseStringOrIntWithPrefix(
+ Operands, Prefix,
+ {"BYTE_0", "BYTE_1", "BYTE_2", "BYTE_3", "WORD_0", "WORD_1", "DWORD"},
+ Type);
}
ParseStatus AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
- using namespace llvm::AMDGPU::SDWA;
-
- SMLoc S = getLoc();
- StringRef Value;
-
- SMLoc StringLoc;
- ParseStatus Res = parseStringWithPrefix("dst_unused", Value, StringLoc);
- if (!Res.isSuccess())
- return Res;
-
- int64_t Int;
- Int = StringSwitch<int64_t>(Value)
- .Case("UNUSED_PAD", DstUnused::UNUSED_PAD)
- .Case("UNUSED_SEXT", DstUnused::UNUSED_SEXT)
- .Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE)
- .Default(0xffffffff);
-
- if (Int == 0xffffffff)
- return Error(StringLoc, "invalid dst_unused value");
-
- Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySDWADstUnused));
- return ParseStatus::Success;
+ return parseStringOrIntWithPrefix(
+ Operands, "dst_unused", {"UNUSED_PAD", "UNUSED_SEXT", "UNUSED_PRESERVE"},
+ AMDGPUOperand::ImmTySDWADstUnused);
}
void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) {
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae537b194f50..b39fbdc26795 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return S_BUFFER_LOAD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,6 +727,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
Result.SOffset = true;
[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
@@ -710,6 +738,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
return New;
}
+static bool needsConstrainedOpcode(const GCNSubtarget &STM,
+ ArrayRef<MachineMemOperand *> MMOs,
+ unsigned Width) {
+ // Conservatively returns true if not found the MMO.
+ return STM.isXNACKEnabled() &&
+ (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
+}
+
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
const CombineInfo &Paired) {
const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1736,55 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
- case S_BUFFER_LOAD_IMM:
+ case S_BUFFER_LOAD_IMM: {
+ // If XNACK is enabled, use the constrained opcodes when the first load is
+ // under-aligned.
+ bool NeedsConstrainedOpc =
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
- case S_BUFFER_LOAD_SGPR_IMM:
+ }
+ case S_BUFFER_LOAD_SGPR_IMM: {
+ // If XNACK is enabled, use the constrained opcodes when the first load is
+ // under-aligned.
+ bool NeedsConstrainedOpc =
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
+ }
case S_LOAD_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
- const MachineMemOperand *MMO = *CI.I->memoperands_begin();
bool NeedsConstrainedOpc =
- STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;
diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index fb33308e491c..1312b44b49bd 100644
--- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -414,6 +414,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
MF->RenumberBlocks();
+ DT->updateBlockNumbers();
// Try to reorder and otherwise adjust the block layout to make good use
// of the TB[BH] instructions.
@@ -425,6 +426,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
T2JumpTables.clear();
// Blocks may have shifted around. Keep the numbering up to date.
MF->RenumberBlocks();
+ DT->updateBlockNumbers();
}
// Align any non-fallthrough blocks
@@ -670,8 +672,10 @@ void ARMConstantIslands::doInitialJumpTablePlacement(
}
// If we did anything then we need to renumber the subsequent blocks.
- if (LastCorrectlyNumberedBB)
+ if (LastCorrectlyNumberedBB) {
MF->RenumberBlocks(LastCorrectlyNumberedBB);
+ DT->updateBlockNumbers();
+ }
}
/// BBHasFallthrough - Return true if the specified basic block can fallthrough
@@ -972,6 +976,7 @@ static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
// Renumber the MBB's to keep them consecutive.
NewBB->getParent()->RenumberBlocks(NewBB);
+ DT->updateBlockNumbers();
// Insert an entry into BBInfo to align it properly with the (newly
// renumbered) block numbers.
@@ -1034,6 +1039,7 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
// This is almost the same as updateForInsertedWaterBlock, except that
// the Water goes after OrigBB, not NewBB.
MF->RenumberBlocks(NewBB);
+ DT->updateBlockNumbers();
// Insert an entry into BBInfo to align it properly with the (newly
// renumbered) block numbers.
@@ -2485,6 +2491,7 @@ MachineBasicBlock *ARMConstantIslands::adjustJTTargetBlockForward(
BB->updateTerminator(OldNext != MF->end() ? &*OldNext : nullptr);
// Update numbering to account for the block being moved.
MF->RenumberBlocks();
+ DT->updateBlockNumbers();
++NumJTMoved;
return nullptr;
}
@@ -2513,6 +2520,7 @@ MachineBasicBlock *ARMConstantIslands::adjustJTTargetBlockForward(
// Update internal data structures to account for the newly inserted MBB.
MF->RenumberBlocks(NewBB);
+ DT->updateBlockNumbers();
// Update the CFG.
NewBB->addSuccessor(BB);
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index e133dbeba365..61635bd1629e 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5849,6 +5849,7 @@ def t2AUT : PACBTIHintSpaceUseInst<"aut", 0b00101101> {
def ARMt2CallBTI : SDNode<"ARMISD::t2CALL_BTI", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+let Defs = [LR], Uses = [SP] in
def t2CALL_BTI : PseudoInst<(outs), (ins pred:$p, thumb_bl_target:$func),
IIC_Br, [(ARMt2CallBTI tglobaladdr:$func)]>,
Requires<[IsThumb2]>, Sched<[WriteBrL]>;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 994b43f1abb4..4e4a19ddf558 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -587,6 +587,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
return 0;
return 0xffffff & ((Value - 8) >> 2);
case ARM::fixup_t2_uncondbranch: {
+ if (STI->getTargetTriple().isOSBinFormatCOFF() && !IsResolved &&
+ Value != 4) {
+ // MSVC link.exe and lld do not support this relocation type
+ // with a non-zero offset. ("Value" is offset by 4 at this point.)
+ Ctx.reportError(Fixup.getLoc(),
+ "cannot perform a PC-relative fixup with a non-zero "
+ "symbol offset");
+ }
Value = Value - 4;
if (!isInt<25>(Value)) {
Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
@@ -637,6 +645,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
return 0;
}
+ if (STI->getTargetTriple().isOSBinFormatCOFF() && !IsResolved &&
+ Value != 4) {
+ // MSVC link.exe and lld do not support this relocation type
+ // with a non-zero offset. ("Value" is offset by 4 at this point.)
+ Ctx.reportError(Fixup.getLoc(),
+ "cannot perform a PC-relative fixup with a non-zero "
+ "symbol offset");
+ }
// The value doesn't encode the low bit (always zero) and is offset by
// four. The 32-bit immediate value is encoded as
@@ -666,6 +682,14 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
Endian == llvm::endianness::little);
}
case ARM::fixup_arm_thumb_blx: {
+ if (STI->getTargetTriple().isOSBinFormatCOFF() && !IsResolved &&
+ Value != 4) {
+ // MSVC link.exe and lld do not support this relocation type
+ // with a non-zero offset. ("Value" is offset by 4 at this point.)
+ Ctx.reportError(Fixup.getLoc(),
+ "cannot perform a PC-relative fixup with a non-zero "
+ "symbol offset");
+ }
// The value doesn't encode the low two bits (always zero) and is offset by
// four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
// imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 50a59ce76763..ddc62b534598 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -41,8 +41,6 @@ namespace {
bool needsRelocateWithSymbol(const MCValue &Val, const MCSymbol &Sym,
unsigned Type) const override;
-
- void addTargetSectionFlags(MCContext &Ctx, MCSectionELF &Sec) override;
};
} // end anonymous namespace
@@ -319,25 +317,6 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
}
}
-void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx,
- MCSectionELF &Sec) {
- // The mix of execute-only and non-execute-only at link time is
- // non-execute-only. To avoid the empty implicitly created .text
- // section from making the whole .text section non-execute-only, we
- // mark it execute-only if it is empty and there is at least one
- // execute-only section in the object.
- MCSectionELF *TextSection =
- static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
- bool IsExecOnly = Sec.getFlags() & ELF::SHF_ARM_PURECODE;
- if (IsExecOnly && !TextSection->hasInstructions()) {
- for (auto &F : *TextSection)
- if (auto *DF = dyn_cast<MCDataFragment>(&F))
- if (!DF->getContents().empty())
- return;
- TextSection->setFlags(TextSection->getFlags() | ELF::SHF_ARM_PURECODE);
- }
-}
-
std::unique_ptr<MCObjectTargetWriter>
llvm::createARMELFObjectWriter(uint8_t OSABI) {
return std::make_unique<ARMELFObjectWriter>(OSABI);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 9df752f8eb68..59f29660a777 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -34,6 +34,7 @@
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
@@ -1113,6 +1114,25 @@ void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; }
void ARMTargetELFStreamer::finish() {
ARMTargetStreamer::finish();
finishAttributeSection();
+
+ // The mix of execute-only and non-execute-only at link time is
+ // non-execute-only. To avoid the empty implicitly created .text
+ // section from making the whole .text section non-execute-only, we
+ // mark it execute-only if it is empty and there is at least one
+ // execute-only section in the object.
+ MCContext &Ctx = getStreamer().getContext();
+ auto &Asm = getStreamer().getAssembler();
+ if (any_of(Asm, [](const MCSection &Sec) {
+ return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_ARM_PURECODE;
+ })) {
+ auto *Text =
+ static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
+ for (auto &F : *Text)
+ if (auto *DF = dyn_cast<MCDataFragment>(&F))
+ if (!DF->getContents().empty())
+ return;
+ Text->setFlags(Text->getFlags() | ELF::SHF_ARM_PURECODE);
+ }
}
void ARMELFStreamer::reset() {
diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
index 97bdd4c45a8c..d7f4d4b93f95 100644
--- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
+++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
@@ -28,7 +28,6 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -217,11 +216,6 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTreeWrapperPass>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
index e9d95c6e89db..a2230289ae69 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
@@ -249,7 +249,8 @@ void HexagonCopyHoisting::moveCopyInstr(MachineBasicBlock *DestBB,
DestBB->splice(FirstTI, MI->getParent(), MI);
addMItoCopyList(MI);
- for (auto I = ++(DestBB->succ_begin()), E = DestBB->succ_end(); I != E; ++I) {
+ for (auto I = std::next(DestBB->succ_begin()), E = DestBB->succ_end(); I != E;
+ ++I) {
MachineBasicBlock *SuccBB = *I;
auto &BBCopyInst = CopyMIList[SuccBB->getNumber()];
MachineInstr *SuccMI = BBCopyInst[Key];
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index a004d64c21cc..5b568b0487b4 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -13,12 +13,14 @@
#include "MCTargetDesc/NVPTXInstPrinter.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
+#include "NVPTXUtilities.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/FormattedStream.h"
#include <cctype>
using namespace llvm;
@@ -228,31 +230,29 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
if (!strcmp(Modifier, "sem")) {
- switch (Imm) {
- case NVPTX::PTXLdStInstCode::NotAtomic:
+ auto Ordering = NVPTX::Ordering(Imm);
+ switch (Ordering) {
+ case NVPTX::Ordering::NotAtomic:
break;
- case NVPTX::PTXLdStInstCode::Volatile:
+ case NVPTX::Ordering::Volatile:
O << ".volatile";
break;
- case NVPTX::PTXLdStInstCode::Relaxed:
+ case NVPTX::Ordering::Relaxed:
O << ".relaxed.sys";
break;
- case NVPTX::PTXLdStInstCode::Acquire:
+ case NVPTX::Ordering::Acquire:
O << ".acquire.sys";
break;
- case NVPTX::PTXLdStInstCode::Release:
+ case NVPTX::Ordering::Release:
O << ".release.sys";
break;
- case NVPTX::PTXLdStInstCode::RelaxedMMIO:
+ case NVPTX::Ordering::RelaxedMMIO:
O << ".mmio.relaxed.sys";
break;
default:
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "NVPTX LdStCode Printer does not support \"" << Imm
- << "\" sem modifier.";
- report_fatal_error(OS.str());
- break;
+ report_fatal_error(formatv(
+ "NVPTX LdStCode Printer does not support \"{}\" sem modifier.",
+ OrderingToCString(Ordering)));
}
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 3c7167b15702..f6f6acb9e13c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -16,6 +16,7 @@
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
namespace llvm {
@@ -106,15 +107,25 @@ enum LoadStore {
isStoreShift = 6
};
-namespace PTXLdStInstCode {
-enum MemorySemantic {
- NotAtomic = 0, // PTX calls these: "Weak"
- Volatile = 1,
- Relaxed = 2,
- Acquire = 3,
- Release = 4,
- RelaxedMMIO = 5
+// Extends LLVM AtomicOrdering with PTX Orderings:
+using OrderingUnderlyingType = unsigned int;
+enum Ordering : OrderingUnderlyingType {
+ NotAtomic = (OrderingUnderlyingType)
+ AtomicOrdering::NotAtomic, // PTX calls these: "Weak"
+ // Unordered = 1, // NVPTX maps LLVM Unorderd to Relaxed
+ Relaxed = (OrderingUnderlyingType)AtomicOrdering::Monotonic,
+ // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire"
+ Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire,
+ Release = (OrderingUnderlyingType)AtomicOrdering::Release,
+ // AcquireRelease = 6, // TODO
+ SequentiallyConsistent =
+ (OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent,
+ Volatile = SequentiallyConsistent + 1,
+ RelaxedMMIO = Volatile + 1,
+ LAST = RelaxedMMIO
};
+
+namespace PTXLdStInstCode {
enum AddressSpace {
GENERIC = 0,
GLOBAL = 1,
@@ -134,7 +145,7 @@ enum VecType {
V2 = 2,
V4 = 4
};
-}
+} // namespace PTXLdStInstCode
/// PTXCvtMode - Conversion code enumeration
namespace PTXCvtMode {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 96456ad0547e..25c198f0121e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -714,21 +715,28 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
-static unsigned int getCodeMemorySemantic(MemSDNode *N,
- const NVPTXSubtarget *Subtarget) {
+namespace {
+
+struct OperationOrderings {
+ NVPTX::Ordering InstructionOrdering, FenceOrdering;
+ OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
+ NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
+ : InstructionOrdering(IO), FenceOrdering(FO) {}
+};
+
+static OperationOrderings
+getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
AtomicOrdering Ordering = N->getSuccessOrdering();
auto CodeAddrSpace = getCodeAddrSpace(N);
bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
- // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
- // TODO: lowering for AcquireRelease Operations: for now, we error.
- //
-
// clang-format off
- // Lowering for non-SequentiallyConsistent Operations
+ // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
+ // Note: uses of Relaxed in the Atomic column of this table refer
+ // to LLVM AtomicOrdering::Monotonic.
//
// | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
// |---------|----------|--------------------|------------|------------------------------|
@@ -749,6 +757,25 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
// | | | / Global [0] | | |
+ // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
+ // by following the ABI proven sound in:
+ // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
+ // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
+ //
+ // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
+ // |------------------------------------------------------|-------------------------------|
+ // | cuda::atomic_thread_fence | fence.sc.<scope>; |
+ // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
+ // |------------------------------------------------------|-------------------------------|
+ // | cuda::atomic_load | fence.sc.<scope>; |
+ // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
+ // |------------------------------------------------------|-------------------------------|
+ // | cuda::atomic_store | fence.sc.<scope>; |
+ // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
+ // |------------------------------------------------------|-------------------------------|
+ // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
+ // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
+
// clang-format on
// [0]: volatile and atomics are only supported on global or shared
@@ -788,11 +815,10 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// - the "weak" memory instruction we are currently lowering to, and
// - some other instruction that preserves the side-effect, e.g.,
// a dead dummy volatile load.
-
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
- return NVPTX::PTXLdStInstCode::NotAtomic;
+ return NVPTX::Ordering::NotAtomic;
}
// [2]: Atomics with Ordering different than Unordered or Relaxed are not
@@ -801,12 +827,11 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
Ordering == AtomicOrdering::Unordered ||
Ordering == AtomicOrdering::Monotonic) &&
!HasMemoryOrdering) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX does not support \"atomic\" for orderings different than"
- "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
- << toIRString(Ordering) << "\".";
- report_fatal_error(OS.str());
+ report_fatal_error(
+ formatv("PTX does not support \"atomic\" for orderings different than"
+ "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
+ "is: \"{}\".",
+ toIRString(Ordering)));
}
// [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
@@ -820,68 +845,76 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
(CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
+ if (!AddrGenericOrGlobalOrShared)
+ return NVPTX::Ordering::NotAtomic;
+
bool UseRelaxedMMIO =
HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
switch (Ordering) {
case AtomicOrdering::NotAtomic:
- return N->isVolatile() && AddrGenericOrGlobalOrShared
- ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return N->isVolatile() ? NVPTX::Ordering::Volatile
+ : NVPTX::Ordering::NotAtomic;
case AtomicOrdering::Unordered:
// We lower unordered in the exact same way as 'monotonic' to respect
// LLVM IR atomicity requirements.
case AtomicOrdering::Monotonic:
if (N->isVolatile())
- return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
+ : NVPTX::Ordering::Volatile;
else
- return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
+ : NVPTX::Ordering::Volatile;
+ // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
+ // Acquire.
case AtomicOrdering::Acquire:
- if (!N->readMem()) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports Acquire Ordering on reads: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
- }
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ if (!N->readMem())
+ report_fatal_error(
+ formatv("PTX only supports Acquire Ordering on reads: {}",
+ N->getOperationName()));
+ return NVPTX::Ordering::Acquire;
case AtomicOrdering::Release:
- if (!N->writeMem()) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports Release Ordering on writes: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
- }
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ if (!N->writeMem())
+ report_fatal_error(
+ formatv("PTX only supports Release Ordering on writes: {}",
+ N->getOperationName()));
+ return NVPTX::Ordering::Release;
case AtomicOrdering::AcquireRelease: {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
+ report_fatal_error(
+ formatv("NVPTX does not support AcquireRelease Ordering on "
+ "read-modify-write "
+ "yet and PTX does not support it on loads or stores: {}",
+ N->getOperationName()));
+ }
+ case AtomicOrdering::SequentiallyConsistent: {
+ // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
+ // sequence including a "fence.sc.sco" and the memory instruction with an
+ // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
+ // whether the memory operation is a read, write, or read-modify-write.
+ //
+ // This sets the ordering of the fence to SequentiallyConsistent, and
+ // sets the corresponding ordering for the instruction.
+ NVPTX::Ordering InstrOrder;
+ if (N->readMem())
+ InstrOrder = NVPTX::Ordering::Acquire;
+ else if (N->writeMem())
+ InstrOrder = NVPTX::Ordering::Release;
+ else
+ report_fatal_error(
+ formatv("NVPTX does not support SequentiallyConsistent Ordering on "
+ "read-modify-writes yet: {}",
+ N->getOperationName()));
+ return OperationOrderings(InstrOrder,
+ NVPTX::Ordering::SequentiallyConsistent);
}
- case AtomicOrdering::SequentiallyConsistent:
- // TODO: support AcquireRelease and SequentiallyConsistent
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "NVPTX backend does not support AtomicOrdering \""
- << toIRString(Ordering) << "\" yet.";
- report_fatal_error(OS.str());
}
-
- llvm_unreachable("unexpected unhandled case");
+ report_fatal_error(
+ formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
+ toIRString(Ordering)));
}
+} // namespace
+
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace, MachineFunction *F) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -924,6 +957,35 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
});
}
+NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL,
+ SDValue &Chain,
+ MemSDNode *N) {
+ // Some memory instructions - loads, stores, atomics - need an extra fence
+ // instruction. Get the memory order of the instruction, and that of its
+ // fence, if any.
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(N, Subtarget);
+
+ // If a fence is required before the operation, insert it:
+ switch (NVPTX::Ordering(FenceOrdering)) {
+ case NVPTX::Ordering::NotAtomic:
+ break;
+ case NVPTX::Ordering::SequentiallyConsistent: {
+ unsigned Op = Subtarget->hasMemoryOrdering()
+ ? NVPTX::atomic_thread_fence_seq_cst_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
+ break;
+ }
+ default:
+ report_fatal_error(
+ formatv("Unexpected fence ordering: \"{}\".",
+ OrderingToCString(NVPTX::Ordering(FenceOrdering))));
+ }
+
+ return InstructionOrdering;
+}
+
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
unsigned IID = N->getConstantOperandVal(0);
switch (IID) {
@@ -1070,17 +1132,15 @@ static int getLdStRegType(EVT VT) {
}
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
- SDLoc dl(N);
MemSDNode *LD = cast<MemSDNode>(N);
assert(LD->readMem() && "Expected load");
- LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
- EVT LoadedVT = LD->getMemoryVT();
- SDNode *NVPTXLD = nullptr;
// do not support pre/post inc/dec
+ LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
if (PlainLoad && PlainLoad->isIndexed())
return false;
+ EVT LoadedVT = LD->getMemoryVT();
if (!LoadedVT.isSimple())
return false;
@@ -1089,13 +1149,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
-
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
-
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD);
+
// Type Setting: fromType + fromTypeWidth
//
// Sign : ISD::SEXTLOAD
@@ -1105,45 +1165,42 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
MVT SimpleVT = LoadedVT.getSimpleVT();
MVT ScalarVT = SimpleVT.getScalarType();
// Read at least 8 bits (predicates are stored as 8-bit values)
- unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
- unsigned int fromType;
+ unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
+ unsigned int FromType;
// Vector Setting
- unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
if (SimpleVT.isVector()) {
assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
"Unexpected vector type");
// v2f16/v2bf16/v2i16 is loaded using ld.b32
- fromTypeWidth = 32;
+ FromTypeWidth = 32;
}
if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
- fromType = NVPTX::PTXLdStInstCode::Signed;
+ FromType = NVPTX::PTXLdStInstCode::Signed;
else
- fromType = getLdStRegType(ScalarVT);
+ FromType = getLdStRegType(ScalarVT);
// Create the machine instruction DAG
- SDValue Chain = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue Addr;
SDValue Offset, Base;
std::optional<unsigned> Opcode;
MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
+ SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL)});
+
if (SelectDirectAddr(N1, Addr)) {
Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Addr,
- Chain};
- NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
+ Ops.append({Addr, Chain});
} else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
@@ -1151,15 +1208,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Base,
- Offset,
- Chain};
- NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
+ Ops.append({Base, Offset, Chain});
} else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
: SelectADDRri(N1.getNode(), N1, Base, Offset)) {
if (PointerSize == 64)
@@ -1173,15 +1222,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Base,
- Offset,
- Chain};
- NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
+ Ops.append({Base, Offset, Chain});
} else {
if (PointerSize == 64)
Opcode =
@@ -1194,16 +1235,11 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- N1,
- Chain};
- NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
+ Ops.append({N1, Chain});
}
+ SDNode *NVPTXLD =
+ CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
if (!NVPTXLD)
return false;
@@ -1215,16 +1251,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
}
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
-
- SDValue Chain = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- SDValue Addr, Offset, Base;
- std::optional<unsigned> Opcode;
- SDLoc DL(N);
- SDNode *LD;
MemSDNode *MemSD = cast<MemSDNode>(N);
EVT LoadedVT = MemSD->getMemoryVT();
-
if (!LoadedVT.isSimple())
return false;
@@ -1233,12 +1261,12 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
-
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1286,6 +1314,16 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
FromTypeWidth = 32;
}
+ SDValue Op1 = N->getOperand(1);
+ SDValue Addr, Offset, Base;
+ std::optional<unsigned> Opcode;
+ SDNode *LD;
+
+ SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL)});
+
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
@@ -1305,14 +1343,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Addr,
- Chain};
- LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
+ Ops.append({Addr, Chain});
} else if (PointerSize == 64
? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
@@ -1334,15 +1365,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Base,
- Offset,
- Chain};
- LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
+ Ops.append({Base, Offset, Chain});
} else if (PointerSize == 64
? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
@@ -1384,16 +1407,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Base,
- Offset,
- Chain};
-
- LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
+ Ops.append({Base, Offset, Chain});
} else {
if (PointerSize == 64) {
switch (N->getOpcode()) {
@@ -1434,15 +1448,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Op1,
- Chain};
- LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
+ Ops.append({Op1, Chain});
}
+ LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
@@ -1452,8 +1460,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
-
- SDValue Chain = N->getOperand(0);
SDValue Op1;
MemSDNode *Mem;
bool IsLDG = true;
@@ -1483,12 +1489,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
Mem = cast<MemSDNode>(N);
}
- std::optional<unsigned> Opcode;
- SDLoc DL(N);
- SDNode *LD;
- SDValue Base, Offset, Addr;
EVT OrigType = N->getValueType(0);
-
EVT EltVT = Mem->getMemoryVT();
unsigned NumElts = 1;
if (EltVT.isVector()) {
@@ -1517,6 +1518,12 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
}
InstVTs.push_back(MVT::Other);
SDVTList InstVTList = CurDAG->getVTList(InstVTs);
+ SDValue Chain = N->getOperand(0);
+
+ std::optional<unsigned> Opcode;
+ SDLoc DL(N);
+ SDNode *LD;
+ SDValue Base, Offset, Addr;
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
@@ -1867,19 +1874,17 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
}
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
- SDLoc dl(N);
MemSDNode *ST = cast<MemSDNode>(N);
assert(ST->writeMem() && "Expected store");
StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
assert((PlainStore || AtomicStore) && "Expected store");
- EVT StoreVT = ST->getMemoryVT();
- SDNode *NVPTXST = nullptr;
// do not support pre/post inc/dec
if (PlainStore && PlainStore->isIndexed())
return false;
+ EVT StoreVT = ST->getMemoryVT();
if (!StoreVT.isSimple())
return false;
@@ -1888,29 +1893,28 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
+ SDLoc DL(N);
+ SDValue Chain = ST->getChain();
+ auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST);
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
- unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
- //
MVT ScalarVT = SimpleVT.getScalarType();
- unsigned toTypeWidth = ScalarVT.getSizeInBits();
+ unsigned ToTypeWidth = ScalarVT.getSizeInBits();
if (SimpleVT.isVector()) {
assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
"Unexpected vector type");
// v2x16 is stored using st.b32
- toTypeWidth = 32;
+ ToTypeWidth = 32;
}
- unsigned int toType = getLdStRegType(ScalarVT);
+ unsigned int ToType = getLdStRegType(ScalarVT);
// Create the machine instruction DAG
- SDValue Chain = ST->getChain();
SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
SDValue BasePtr = ST->getBasePtr();
SDValue Addr;
@@ -1919,21 +1923,18 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
MVT::SimpleValueType SourceVT =
Value.getNode()->getSimpleValueType(0).SimpleTy;
+ SmallVector<SDValue, 12> Ops({Value, getI32Imm(InstructionOrdering, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(ToType, DL),
+ getI32Imm(ToTypeWidth, DL)});
+
if (SelectDirectAddr(BasePtr, Addr)) {
Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {Value,
- getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(toType, dl),
- getI32Imm(toTypeWidth, dl),
- Addr,
- Chain};
- NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
+ Ops.append({Addr, Chain});
} else if (PointerSize == 64
? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
: SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
@@ -1942,16 +1943,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {Value,
- getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(toType, dl),
- getI32Imm(toTypeWidth, dl),
- Base,
- Offset,
- Chain};
- NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
+ Ops.append({Base, Offset, Chain});
} else if (PointerSize == 64
? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
: SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
@@ -1966,17 +1958,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
if (!Opcode)
return false;
-
- SDValue Ops[] = {Value,
- getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(toType, dl),
- getI32Imm(toTypeWidth, dl),
- Base,
- Offset,
- Chain};
- NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
+ Ops.append({Base, Offset, Chain});
} else {
if (PointerSize == 64)
Opcode =
@@ -1989,17 +1971,12 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {Value,
- getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(toType, dl),
- getI32Imm(toTypeWidth, dl),
- BasePtr,
- Chain};
- NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
+ Ops.append({BasePtr, Chain});
}
+ SDNode *NVPTXST = NVPTXST =
+ CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
+
if (!NVPTXST)
return false;
@@ -2010,11 +1987,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
}
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
- SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
std::optional<unsigned> Opcode;
- SDLoc DL(N);
SDNode *ST;
EVT EltVT = Op1.getValueType();
MemSDNode *MemSD = cast<MemSDNode>(N);
@@ -2029,8 +2004,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -2039,23 +2015,20 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
unsigned ToTypeWidth = ScalarVT.getSizeInBits();
unsigned ToType = getLdStRegType(ScalarVT);
- SmallVector<SDValue, 12> StOps;
+ SmallVector<SDValue, 12> Ops;
SDValue N2;
unsigned VecType;
switch (N->getOpcode()) {
case NVPTXISD::StoreV2:
VecType = NVPTX::PTXLdStInstCode::V2;
- StOps.push_back(N->getOperand(1));
- StOps.push_back(N->getOperand(2));
+ Ops.append({N->getOperand(1), N->getOperand(2)});
N2 = N->getOperand(3);
break;
case NVPTXISD::StoreV4:
VecType = NVPTX::PTXLdStInstCode::V4;
- StOps.push_back(N->getOperand(1));
- StOps.push_back(N->getOperand(2));
- StOps.push_back(N->getOperand(3));
- StOps.push_back(N->getOperand(4));
+ Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3),
+ N->getOperand(4)});
N2 = N->getOperand(5);
break;
default:
@@ -2072,11 +2045,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
- StOps.push_back(getI32Imm(CodeMemorySem, DL));
- StOps.push_back(getI32Imm(CodeAddrSpace, DL));
- StOps.push_back(getI32Imm(VecType, DL));
- StOps.push_back(getI32Imm(ToType, DL));
- StOps.push_back(getI32Imm(ToTypeWidth, DL));
+ Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(ToType, DL),
+ getI32Imm(ToTypeWidth, DL)});
if (SelectDirectAddr(N2, Addr)) {
switch (N->getOpcode()) {
@@ -2095,7 +2066,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
NVPTX::STV_f32_v4_avar, std::nullopt);
break;
}
- StOps.push_back(Addr);
+ Ops.push_back(Addr);
} else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
: SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
switch (N->getOpcode()) {
@@ -2114,8 +2085,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
break;
}
- StOps.push_back(Base);
- StOps.push_back(Offset);
+ Ops.append({Base, Offset});
} else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
: SelectADDRri(N2.getNode(), N2, Base, Offset)) {
if (PointerSize == 64) {
@@ -2154,8 +2124,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
break;
}
}
- StOps.push_back(Base);
- StOps.push_back(Offset);
+ Ops.append({Base, Offset});
} else {
if (PointerSize == 64) {
switch (N->getOpcode()) {
@@ -2194,15 +2163,15 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
break;
}
}
- StOps.push_back(N2);
+ Ops.push_back(N2);
}
if (!Opcode)
return false;
- StOps.push_back(Chain);
+ Ops.push_back(Chain);
- ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
+ ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
@@ -2276,10 +2245,8 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
unsigned OffsetVal = Offset->getAsZExtVal();
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
- Ops.push_back(Chain);
- Ops.push_back(Glue);
+ SmallVector<SDValue, 2> Ops(
+ {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
return true;
@@ -2312,8 +2279,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
SmallVector<SDValue, 6> Ops;
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(N->getOperand(i + 2));
- Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
- Ops.push_back(Chain);
+ Ops.append({CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain});
// Determine target opcode
// If we have an i1, use an 8-bit store. The lowering code in
@@ -2493,10 +2459,8 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(N->getOperand(i + 3));
- Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
- Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
- Ops.push_back(Chain);
- Ops.push_back(Glue);
+ Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
+ CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
// Determine target opcode
// If we have an i1, use an 8-bit store. The lowering code in
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 49626d405148..eac405659951 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -99,6 +99,9 @@ private:
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N);
+
+ NVPTX::Ordering insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
+ MemSDNode *N);
};
class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index a5bdc6fac3ca..6a096fa5acea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3926,7 +3926,6 @@ def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, /
def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1)
Requires<[hasPTX<60>, hasSM<70>]>;
-
// If PTX<60 or SM<70, we fall back to MEMBAR:
def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1)
def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1)
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c81dfa68e4bd..887951b55fb3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -335,6 +335,48 @@ def INT_FENCE_SC_CLUSTER:
MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
Requires<[hasPTX<78>, hasSM<90>]>;
+// Proxy fence (uni-directional)
+// fence.proxy.tensormap.release variants
+
+class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<string Scope, Intrinsic Intr> :
+ NVPTXInst<(outs), (ins),
+ "fence.proxy.tensormap::generic.release." # Scope # ";", [(Intr)]>,
+ Requires<[hasPTX<83>, hasSM<90>]>;
+
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CTA:
+ FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"cta",
+ int_nvvm_fence_proxy_tensormap_generic_release_cta>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CLUSTER:
+ FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"cluster",
+ int_nvvm_fence_proxy_tensormap_generic_release_cluster>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_GPU:
+ FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"gpu",
+ int_nvvm_fence_proxy_tensormap_generic_release_gpu>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_SYS:
+ FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"sys",
+ int_nvvm_fence_proxy_tensormap_generic_release_sys>;
+
+// fence.proxy.tensormap.acquire variants
+
+class FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<string Scope, Intrinsic Intr> :
+ NVPTXInst<(outs), (ins Int64Regs:$addr),
+ "fence.proxy.tensormap::generic.acquire." # Scope # " [$addr], 128;",
+ [(Intr Int64Regs:$addr, (i32 128))]>,
+ Requires<[hasPTX<83>, hasSM<90>]>;
+
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_CTA :
+ FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"cta",
+ int_nvvm_fence_proxy_tensormap_generic_acquire_cta>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_CLUSTER :
+ FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"cluster",
+ int_nvvm_fence_proxy_tensormap_generic_acquire_cluster>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_GPU :
+ FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"gpu",
+ int_nvvm_fence_proxy_tensormap_generic_acquire_gpu>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS :
+ FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"sys",
+ int_nvvm_fence_proxy_tensormap_generic_acquire_sys>;
+
//-----------------------------------
// Async Copy Functions
//-----------------------------------
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index c15ff6cae1f2..eebd91fefe4f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
+#include "NVPTX.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
@@ -82,6 +83,36 @@ inline unsigned promoteScalarArgumentSize(unsigned size) {
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM);
bool Isv2x16VT(EVT VT);
+
+namespace NVPTX {
+
+inline std::string OrderingToCString(Ordering Order) {
+ switch (Order) {
+ case Ordering::NotAtomic:
+ return "NotAtomic";
+ case Ordering::Relaxed:
+ return "Relaxed";
+ case Ordering::Acquire:
+ return "Acquire";
+ case Ordering::Release:
+ return "Release";
+ // case Ordering::AcquireRelease: return "AcquireRelease";
+ case Ordering::SequentiallyConsistent:
+ return "SequentiallyConsistent";
+ case Ordering::Volatile:
+ return "Volatile";
+ case Ordering::RelaxedMMIO:
+ return "RelaxedMMIO";
+ }
+ report_fatal_error("unknown ordering");
+}
+
+inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) {
+ O << OrderingToCString(Order);
+ return O;
}
+} // namespace NVPTX
+} // namespace llvm
+
#endif
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 1963582ce686..a57ed33bda9c 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1007,7 +1007,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// R0 cannot be used as a base register, but it can be used as an
// index in a store-indexed.
int LastOffset = 0;
- if (HasFP) {
+ if (HasFP) {
// R0 += (FPOffset-LastOffset).
// Need addic, since addi treats R0 as 0.
BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), ScratchReg)
@@ -2025,8 +2025,18 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
// code. Same goes for the base pointer and the PIC base register.
if (needsFP(MF))
SavedRegs.reset(isPPC64 ? PPC::X31 : PPC::R31);
- if (RegInfo->hasBasePointer(MF))
+ if (RegInfo->hasBasePointer(MF)) {
SavedRegs.reset(RegInfo->getBaseRegister(MF));
+ // On AIX, when BaseRegister(R30) is used, need to spill r31 too to match
+ // AIX trackback table requirement.
+ if (!needsFP(MF) && !SavedRegs.test(isPPC64 ? PPC::X31 : PPC::R31) &&
+ Subtarget.isAIXABI()) {
+ assert(
+ (RegInfo->getBaseRegister(MF) == (isPPC64 ? PPC::X30 : PPC::R30)) &&
+ "Invalid base register on AIX!");
+ SavedRegs.set(isPPC64 ? PPC::X31 : PPC::R31);
+ }
+ }
if (FI->usesPICBase())
SavedRegs.reset(PPC::R30);
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 8f5afbae01de..0177034a5ae0 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1014,12 +1014,14 @@ def POPCNTB8 : XForm_11<31, 122, (outs g8rc:$RA), (ins g8rc:$RST),
[(set i64:$RA, (int_ppc_popcntb i64:$RST))]>;
def CDTBCD8 : XForm_11<31, 282, (outs g8rc:$RA), (ins g8rc:$RST),
- "cdtbcd $RA, $RST", IIC_IntGeneral, []>;
+ "cdtbcd $RA, $RST", IIC_IntGeneral,
+ [(set i64:$RA, (int_ppc_cdtbcdd i64:$RST))]>;
def CBCDTD8 : XForm_11<31, 314, (outs g8rc:$RA), (ins g8rc:$RST),
- "cbcdtd $RA, $RST", IIC_IntGeneral, []>;
-
+ "cbcdtd $RA, $RST", IIC_IntGeneral,
+ [(set i64:$RA, (int_ppc_cbcdtdd i64:$RST))]>;
def ADDG6S8 : XOForm_1<31, 74, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB),
- "addg6s $RT, $RA, $RB", IIC_IntGeneral, []>;
+ "addg6s $RT, $RA, $RB", IIC_IntGeneral,
+ [(set i64:$RT, (int_ppc_addg6sd i64:$RA, i64:$RB))]>;
}
defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 1686249c0f89..411ea77afc0d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1931,12 +1931,14 @@ def POPCNTB : XForm_11<31, 122, (outs gprc:$RA), (ins gprc:$RST),
[(set i32:$RA, (int_ppc_popcntb i32:$RST))]>;
def CDTBCD : XForm_11<31, 282, (outs gprc:$RA), (ins gprc:$RST),
- "cdtbcd $RA, $RST", IIC_IntGeneral, []>;
+ "cdtbcd $RA, $RST", IIC_IntGeneral,
+ [(set i32:$RA, (int_ppc_cdtbcd i32:$RST))]>;
def CBCDTD : XForm_11<31, 314, (outs gprc:$RA), (ins gprc:$RST),
- "cbcdtd $RA, $RST", IIC_IntGeneral, []>;
-
+ "cbcdtd $RA, $RST", IIC_IntGeneral,
+ [(set i32:$RA, (int_ppc_cbcdtd i32:$RST))]>;
def ADDG6S : XOForm_1<31, 74, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
- "addg6s $RT, $RA, $RB", IIC_IntGeneral, []>;
+ "addg6s $RT, $RA, $RB", IIC_IntGeneral,
+ [(set i32:$RT, (int_ppc_addg6s i32:$RA, i32:$RB))]>;
//===----------------------------------------------------------------------===//
// PPC32 Load Instructions.
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index f28a7092e3ce..5146e519c352 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -36,6 +36,7 @@ add_llvm_target(RISCVCodeGen
RISCVExpandPseudoInsts.cpp
RISCVFrameLowering.cpp
RISCVGatherScatterLowering.cpp
+ RISCVIndirectBranchTracking.cpp
RISCVInsertVSETVLI.cpp
RISCVInsertReadWriteCSR.cpp
RISCVInsertWriteVXRM.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 0d2473c7c5de..80cb39529149 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -31,6 +31,9 @@ void initializeRISCVCodeGenPreparePass(PassRegistry &);
FunctionPass *createRISCVDeadRegisterDefinitionsPass();
void initializeRISCVDeadRegisterDefinitionsPass(PassRegistry &);
+FunctionPass *createRISCVIndirectBranchTrackingPass();
+void initializeRISCVIndirectBranchTrackingPass(PassRegistry &);
+
FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM,
CodeGenOptLevel OptLevel);
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 43c04d417c7d..604234b24315 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1461,7 +1461,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SDValue X = N0.getOperand(0);
- if (isMask_64(C1)) {
+ // Prefer SRAIW + ANDI when possible.
+ bool Skip = C2 > 32 && isInt<12>(N1C->getSExtValue()) &&
+ X.getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(X.getOperand(1)) &&
+ X.getConstantOperandVal(1) == 32;
+ if (isMask_64(C1) && !Skip) {
unsigned Leading = XLen - llvm::bit_width(C1);
if (C2 > Leading) {
SDNode *SRAI = CurDAG->getMachineNode(
diff --git a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp
new file mode 100644
index 000000000000..1b484d486edc
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp
@@ -0,0 +1,102 @@
+//===------ RISCVIndirectBranchTracking.cpp - Enables lpad mechanism ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The pass adds LPAD (AUIPC with rs1 = X0) machine instructions at the
+// beginning of each basic block or function that is referenced by an indrect
+// jump/call instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVSubtarget.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+using namespace llvm;
+
+static cl::opt<uint32_t> PreferredLandingPadLabel(
+ "riscv-landing-pad-label", cl::ReallyHidden,
+ cl::desc("Use preferred fixed label for all labels"));
+
+namespace {
+class RISCVIndirectBranchTrackingPass : public MachineFunctionPass {
+public:
+ RISCVIndirectBranchTrackingPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "RISC-V Indirect Branch Tracking";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ static char ID;
+ const Align LpadAlign = Align(4);
+};
+
+} // end anonymous namespace
+
+char RISCVIndirectBranchTrackingPass::ID = 0;
+
+FunctionPass *llvm::createRISCVIndirectBranchTrackingPass() {
+ return new RISCVIndirectBranchTrackingPass();
+}
+
+static void emitLpad(MachineBasicBlock &MBB, const RISCVInstrInfo *TII,
+ uint32_t Label) {
+ auto I = MBB.begin();
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(RISCV::AUIPC), RISCV::X0)
+ .addImm(Label);
+}
+
+bool RISCVIndirectBranchTrackingPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ const auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
+ const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
+ if (!Subtarget.hasStdExtZicfilp())
+ return false;
+
+ uint32_t FixedLabel = 0;
+ if (PreferredLandingPadLabel.getNumOccurrences() > 0) {
+ if (!isUInt<20>(PreferredLandingPadLabel))
+ report_fatal_error("riscv-landing-pad-label=<val>, <val> needs to fit in "
+ "unsigned 20-bits");
+ FixedLabel = PreferredLandingPadLabel;
+ }
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ if (&MBB == &MF.front()) {
+ Function &F = MF.getFunction();
+ // When trap is taken, landing pad is not needed.
+ if (F.hasFnAttribute("interrupt"))
+ continue;
+
+ if (F.hasAddressTaken() || !F.hasLocalLinkage()) {
+ emitLpad(MBB, TII, FixedLabel);
+ if (MF.getAlignment() < LpadAlign)
+ MF.setAlignment(LpadAlign);
+ Changed = true;
+ }
+ continue;
+ }
+
+ if (MBB.hasAddressTaken()) {
+ emitLpad(MBB, TII, FixedLabel);
+ if (MBB.getAlignment() < LpadAlign)
+ MBB.setAlignment(LpadAlign);
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index b5dbc055e03d..86b30e836473 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -371,6 +371,9 @@ defset list<VTypeInfo> AllVectors = {
}
}
+defvar AllFloatVectorsExceptFP16 = !filter(vti, AllFloatVectors, !ne(vti.Scalar, f16));
+defvar AllFP16Vectors = !filter(vti, AllFloatVectors, !eq(vti.Scalar, f16));
+
// This functor is used to obtain the int vector type that has the same SEW and
// multiplier as the input parameter type
class GetIntVTypeInfo<VTypeInfo vti> {
@@ -7245,6 +7248,14 @@ foreach vti = AllFloatVectors in {
vti.RegClass, vti.ScalarRegClass>;
}
+foreach vti = AllBFloatVectors in
+ let Predicates = [HasVInstructionsBF16Minimal] in
+ defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM",
+ vti.Vector,
+ vti.Vector, vti.Vector, vti.Mask,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
+ vti.RegClass, vti.RegClass>;
+
foreach fvti = AllFloatVectors in {
defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX);
let Predicates = GetVTypePredicates<fvti>.Predicates in
@@ -7445,14 +7456,22 @@ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
eew=16, vtilist=AllIntegerVectors>;
defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
- AllFloatVectors, uimm5>;
+ AllFloatVectorsExceptFP16, uimm5>;
+let Predicates = [HasVInstructionsF16Minimal] in
+ defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
+ AllFP16Vectors, uimm5>;
+defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
+ AllBFloatVectors, uimm5>;
defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
eew=16, vtilist=AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 16.5. Vector Compress Instruction
//===----------------------------------------------------------------------===//
defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>;
-defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>;
+defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectorsExceptFP16>;
+let Predicates = [HasVInstructionsF16Minimal] in
+ defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFP16Vectors>;
+defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllBFloatVectors>;
// Include the non-intrinsic ISel patterns
include "RISCVInstrInfoVVLPatterns.td"
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 83d27b35cf0d..cc40d6a2f986 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -239,6 +239,12 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
FeatureStdExtZbb],
SiFiveX280TuneFeatures>;
+defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll,
+ TuneConditionalCompressedMoveFusion,
+ TuneLUIADDIFusion,
+ TuneAUIPCADDIFusion,
+ FeaturePostRAScheduler];
+
def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
[Feature64Bit,
FeatureStdExtI,
@@ -266,11 +272,26 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
FeatureStdExtZfhmin,
FeatureUnalignedScalarMem,
FeatureUnalignedVectorMem],
- [TuneNoDefaultUnroll,
- TuneConditionalCompressedMoveFusion,
- TuneLUIADDIFusion,
- TuneAUIPCADDIFusion,
- FeaturePostRAScheduler]>;
+ SiFiveP400TuneFeatures>;
+
+def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model,
+ !listconcat(RVA22U64Features,
+ [FeatureStdExtV,
+ FeatureStdExtZifencei,
+ FeatureStdExtZihintntl,
+ FeatureStdExtZvl128b,
+ FeatureStdExtZvbb,
+ FeatureStdExtZvknc,
+ FeatureStdExtZvkng,
+ FeatureStdExtZvksc,
+ FeatureStdExtZvksg,
+ FeatureVendorXSiFivecdiscarddlone,
+ FeatureVendorXSiFivecflushdlone,
+ FeatureUnalignedScalarMem,
+ FeatureUnalignedVectorMem]),
+ !listconcat(SiFiveP400TuneFeatures,
+ [TuneNoSinkSplatOperands])>;
+
def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
[Feature64Bit,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 21fbf47875e6..8b3770aeb5d1 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -520,6 +520,7 @@ void RISCVPassConfig::addPreEmitPass2() {
// ensuring return instruction is detected correctly.
addPass(createRISCVPushPopOptimizationPass());
}
+ addPass(createRISCVIndirectBranchTrackingPass());
addPass(createRISCVExpandPseudoPass());
// Schedule the expansion of AMOs at the last possible moment, avoiding the
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index 53bac88df65f..04eada18ef0d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -186,10 +186,11 @@ struct Entry {
/// Explore them.
static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
const WebAssemblyExceptionInfo &WEI,
- const MachineDominatorTree &MDT) {
+ MachineDominatorTree &MDT) {
// Remember original layout ordering, so we can update terminators after
// reordering to point to the original layout successor.
MF.RenumberBlocks();
+ MDT.updateBlockNumbers();
// Prepare for a topological sort: Record the number of predecessors each
// block has, ignoring loop backedges.
@@ -330,6 +331,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
}
assert(Entries.empty() && "Active sort region list not finished");
MF.RenumberBlocks();
+ MDT.updateBlockNumbers();
#ifndef NDEBUG
SmallSetVector<const SortRegion *, 8> OnStack;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 70b91c266c49..c7001ef2b33e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -45,6 +45,8 @@ STATISTIC(NumCatchUnwindMismatches, "Number of catch unwind mismatches found");
namespace {
class WebAssemblyCFGStackify final : public MachineFunctionPass {
+ MachineDominatorTree *MDT;
+
StringRef getPassName() const override { return "WebAssembly CFG Stackify"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -252,7 +254,6 @@ void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) {
void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
assert(!MBB.isEHPad());
MachineFunction &MF = *MBB.getParent();
- auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
@@ -264,7 +265,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
int MBBNumber = MBB.getNumber();
for (MachineBasicBlock *Pred : MBB.predecessors()) {
if (Pred->getNumber() < MBBNumber) {
- Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
+ Header = Header ? MDT->findNearestCommonDominator(Header, Pred) : Pred;
if (explicitlyBranchesTo(Pred, &MBB))
IsBranchedTo = true;
}
@@ -1439,6 +1440,7 @@ void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) {
// Renumber BBs and recalculate ScopeTop info because new BBs might have been
// created and inserted during fixing unwind mismatches.
MF.RenumberBlocks();
+ MDT->updateBlockNumbers();
ScopeTops.clear();
ScopeTops.resize(MF.getNumBlockIDs());
for (auto &MBB : reverse(MF)) {
@@ -1741,6 +1743,7 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
"********** Function: "
<< MF.getName() << '\n');
const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
+ MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
releaseMemory();
diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp
index 6289b3a1df1f..831944cce3af 100644
--- a/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -41,13 +41,6 @@ static cl::opt<bool> DisableX86DomainReassignment(
namespace {
enum RegDomain { NoDomain = -1, GPRDomain, MaskDomain, OtherDomain, NumDomains };
-static bool isGPR(const TargetRegisterClass *RC) {
- return X86::GR64RegClass.hasSubClassEq(RC) ||
- X86::GR32RegClass.hasSubClassEq(RC) ||
- X86::GR16RegClass.hasSubClassEq(RC) ||
- X86::GR8RegClass.hasSubClassEq(RC);
-}
-
static bool isMask(const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) {
return X86::VK16RegClass.hasSubClassEq(RC);
@@ -55,7 +48,7 @@ static bool isMask(const TargetRegisterClass *RC,
static RegDomain getDomain(const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) {
- if (isGPR(RC))
+ if (TRI->isGeneralPurposeRegisterClass(RC))
return GPRDomain;
if (isMask(RC, TRI))
return MaskDomain;
@@ -797,7 +790,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
continue;
// GPR only current source domain supported.
- if (!isGPR(MRI->getRegClass(Reg)))
+ if (!MRI->getTargetRegisterInfo()->isGeneralPurposeRegisterClass(
+ MRI->getRegClass(Reg)))
continue;
// Register already in closure.
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index bdc9a0d29670..77dac1197f85 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -4227,3 +4227,323 @@ void X86FrameLowering::restoreWinEHStackPointersInParent(
/*RestoreSP=*/IsSEH);
}
}
+
+// Compute the alignment gap between current SP after spilling FP/BP and the
+// next properly aligned stack offset.
+static int computeFPBPAlignmentGap(MachineFunction &MF,
+ const TargetRegisterClass *RC,
+ unsigned NumSpilledRegs) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ unsigned AllocSize = TRI->getSpillSize(*RC) * NumSpilledRegs;
+ Align StackAlign = MF.getSubtarget().getFrameLowering()->getStackAlign();
+ unsigned AlignedSize = alignTo(AllocSize, StackAlign);
+ return AlignedSize - AllocSize;
+}
+
+void X86FrameLowering::spillFPBPUsingSP(MachineFunction &MF,
+ MachineBasicBlock::iterator BeforeMI,
+ Register FP, Register BP,
+ int SPAdjust) const {
+ assert(FP.isValid() || BP.isValid());
+
+ MachineBasicBlock *MBB = BeforeMI->getParent();
+ DebugLoc DL = BeforeMI->getDebugLoc();
+
+ // Spill FP.
+ if (FP.isValid()) {
+ BuildMI(*MBB, BeforeMI, DL,
+ TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
+ .addReg(FP);
+ }
+
+ // Spill BP.
+ if (BP.isValid()) {
+ BuildMI(*MBB, BeforeMI, DL,
+ TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
+ .addReg(BP);
+ }
+
+ // Make sure SP is aligned.
+ if (SPAdjust)
+ emitSPUpdate(*MBB, BeforeMI, DL, -SPAdjust, false);
+
+ // Emit unwinding information.
+ if (FP.isValid() && needsDwarfCFI(MF)) {
+ // Emit .cfi_remember_state to remember old frame.
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createRememberState(nullptr));
+ BuildMI(*MBB, BeforeMI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Setup new CFA value with DW_CFA_def_cfa_expression:
+ // DW_OP_breg7+offset, DW_OP_deref, DW_OP_consts 16, DW_OP_plus
+ SmallString<64> CfaExpr;
+ uint8_t buffer[16];
+ int Offset = SPAdjust;
+ if (BP.isValid())
+ Offset += TRI->getSpillSize(*TRI->getMinimalPhysRegClass(BP));
+ // If BeforeMI is a frame setup instruction, we need to adjust the position
+ // and offset of the new cfi instruction.
+ if (TII.isFrameSetup(*BeforeMI)) {
+ Offset += alignTo(TII.getFrameSize(*BeforeMI), getStackAlign());
+ BeforeMI = std::next(BeforeMI);
+ }
+ Register StackPtr = TRI->getStackRegister();
+ if (STI.isTarget64BitILP32())
+ StackPtr = Register(getX86SubSuperRegister(StackPtr, 64));
+ unsigned DwarfStackPtr = TRI->getDwarfRegNum(StackPtr, true);
+ CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfStackPtr));
+ CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));
+ CfaExpr.push_back(dwarf::DW_OP_deref);
+ CfaExpr.push_back(dwarf::DW_OP_consts);
+ CfaExpr.append(buffer, buffer + encodeSLEB128(SlotSize * 2, buffer));
+ CfaExpr.push_back((uint8_t)dwarf::DW_OP_plus);
+
+ SmallString<64> DefCfaExpr;
+ DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
+ DefCfaExpr.append(buffer, buffer + encodeSLEB128(CfaExpr.size(), buffer));
+ DefCfaExpr.append(CfaExpr.str());
+ BuildCFI(*MBB, BeforeMI, DL,
+ MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str()),
+ MachineInstr::FrameSetup);
+ }
+}
+
+void X86FrameLowering::restoreFPBPUsingSP(MachineFunction &MF,
+ MachineBasicBlock::iterator AfterMI,
+ Register FP, Register BP,
+ int SPAdjust) const {
+ assert(FP.isValid() || BP.isValid());
+
+ // Adjust SP so it points to spilled FP or BP.
+ MachineBasicBlock *MBB = AfterMI->getParent();
+ MachineBasicBlock::iterator Pos = std::next(AfterMI);
+ DebugLoc DL = AfterMI->getDebugLoc();
+ if (SPAdjust)
+ emitSPUpdate(*MBB, Pos, DL, SPAdjust, false);
+
+ // Restore BP.
+ if (BP.isValid()) {
+ BuildMI(*MBB, Pos, DL,
+ TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())), BP);
+ }
+
+ // Restore FP.
+ if (FP.isValid()) {
+ BuildMI(*MBB, Pos, DL,
+ TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())), FP);
+
+ // Emit unwinding information.
+ if (needsDwarfCFI(MF)) {
+ // Restore original frame with .cfi_restore_state.
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createRestoreState(nullptr));
+ BuildMI(*MBB, Pos, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+}
+
+void X86FrameLowering::saveAndRestoreFPBPUsingSP(
+ MachineFunction &MF, MachineBasicBlock::iterator BeforeMI,
+ MachineBasicBlock::iterator AfterMI, bool SpillFP, bool SpillBP) const {
+ assert(SpillFP || SpillBP);
+
+ Register FP, BP;
+ const TargetRegisterClass *RC;
+ unsigned NumRegs = 0;
+
+ if (SpillFP) {
+ FP = TRI->getFrameRegister(MF);
+ if (STI.isTarget64BitILP32())
+ FP = Register(getX86SubSuperRegister(FP, 64));
+ RC = TRI->getMinimalPhysRegClass(FP);
+ ++NumRegs;
+ }
+ if (SpillBP) {
+ BP = TRI->getBaseRegister();
+ if (STI.isTarget64BitILP32())
+ BP = Register(getX86SubSuperRegister(BP, 64));
+ RC = TRI->getMinimalPhysRegClass(BP);
+ ++NumRegs;
+ }
+ int SPAdjust = computeFPBPAlignmentGap(MF, RC, NumRegs);
+
+ spillFPBPUsingSP(MF, BeforeMI, FP, BP, SPAdjust);
+ restoreFPBPUsingSP(MF, AfterMI, FP, BP, SPAdjust);
+}
+
+bool X86FrameLowering::skipSpillFPBP(
+ MachineFunction &MF, MachineBasicBlock::reverse_iterator &MI) const {
+ if (MI->getOpcode() == X86::LCMPXCHG16B_SAVE_RBX) {
+ // The pseudo instruction LCMPXCHG16B_SAVE_RBX is generated in the form
+ // SaveRbx = COPY RBX
+ // SaveRbx = LCMPXCHG16B_SAVE_RBX ..., SaveRbx, implicit-def rbx
+ // And later LCMPXCHG16B_SAVE_RBX is expanded to restore RBX from SaveRbx.
+ // We should skip this instruction sequence.
+ int FI;
+ unsigned Reg;
+ while (!(MI->getOpcode() == TargetOpcode::COPY &&
+ MI->getOperand(1).getReg() == X86::RBX) &&
+ !((Reg = TII.isStoreToStackSlot(*MI, FI)) && Reg == X86::RBX))
+ ++MI;
+ return true;
+ }
+ return false;
+}
+
+static bool isFPBPAccess(const MachineInstr &MI, Register FP, Register BP,
+ const TargetRegisterInfo *TRI, bool &AccessFP,
+ bool &AccessBP) {
+ AccessFP = AccessBP = false;
+ if (FP) {
+ if (MI.findRegisterUseOperandIdx(FP, TRI, false) != -1 ||
+ MI.findRegisterDefOperandIdx(FP, TRI, false, true) != -1)
+ AccessFP = true;
+ }
+ if (BP) {
+ if (MI.findRegisterUseOperandIdx(BP, TRI, false) != -1 ||
+ MI.findRegisterDefOperandIdx(BP, TRI, false, true) != -1)
+ AccessBP = true;
+ }
+ return AccessFP || AccessBP;
+}
+
+// Invoke instruction has been lowered to normal function call. We try to figure
+// out if MI comes from Invoke.
+// Do we have any better method?
+static bool isInvoke(const MachineInstr &MI, bool InsideEHLabels) {
+ if (!MI.isCall())
+ return false;
+ if (InsideEHLabels)
+ return true;
+
+ const MachineBasicBlock *MBB = MI.getParent();
+ if (!MBB->hasEHPadSuccessor())
+ return false;
+
+ // Check if there is another call instruction from MI to the end of MBB.
+ MachineBasicBlock::const_iterator MBBI = MI, ME = MBB->end();
+ for (++MBBI; MBBI != ME; ++MBBI)
+ if (MBBI->isCall())
+ return false;
+ return true;
+}
+
+/// If a function uses base pointer and the base pointer is clobbered by inline
+/// asm, RA doesn't detect this case, and after the inline asm, the base pointer
+/// contains garbage value.
+/// For example if a 32b x86 function uses base pointer esi, and esi is
+/// clobbered by following inline asm
+/// asm("rep movsb" : "+D"(ptr), "+S"(x), "+c"(c)::"memory");
+/// We need to save esi before the asm and restore it after the asm.
+///
+/// The problem can also occur to frame pointer if there is a function call, and
+/// the callee uses a different calling convention and clobbers the fp.
+///
+/// Because normal frame objects (spill slots) are accessed through fp/bp
+/// register, so we can't spill fp/bp to normal spill slots.
+///
+/// FIXME: There are 2 possible enhancements:
+/// 1. In many cases there are different physical registers not clobbered by
+/// inline asm, we can use one of them as base pointer. Or use a virtual
+/// register as base pointer and let RA allocate a physical register to it.
+/// 2. If there is no other instructions access stack with fp/bp from the
+/// inline asm to the epilog, and no cfi requirement for a correct fp, we can
+/// skip the save and restore operations.
+void X86FrameLowering::spillFPBP(MachineFunction &MF) const {
+ Register FP, BP;
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ if (TFI.hasFP(MF))
+ FP = TRI->getFrameRegister(MF);
+ if (TRI->hasBasePointer(MF))
+ BP = TRI->getBaseRegister();
+ if (!FP && !BP)
+ return;
+
+ for (MachineBasicBlock &MBB : MF) {
+ bool InsideEHLabels = false;
+ auto MI = MBB.rbegin(), ME = MBB.rend();
+ auto TermMI = MBB.getFirstTerminator();
+ if (TermMI != MBB.begin())
+ MI = *(std::prev(TermMI));
+
+ while (MI != ME) {
+ // Skip frame setup/destroy instructions.
+ // Skip Invoke (call inside try block) instructions.
+ // Skip instructions handled by target.
+ if (MI->getFlag(MachineInstr::MIFlag::FrameSetup) ||
+ MI->getFlag(MachineInstr::MIFlag::FrameDestroy) ||
+ isInvoke(*MI, InsideEHLabels) || skipSpillFPBP(MF, MI)) {
+ ++MI;
+ continue;
+ }
+
+ if (MI->getOpcode() == TargetOpcode::EH_LABEL) {
+ InsideEHLabels = !InsideEHLabels;
+ ++MI;
+ continue;
+ }
+
+ bool AccessFP, AccessBP;
+ // Check if fp or bp is used in MI.
+ if (!isFPBPAccess(*MI, FP, BP, TRI, AccessFP, AccessBP)) {
+ ++MI;
+ continue;
+ }
+
+ // Look for the range [DefMI, KillMI] in which fp or bp is defined and
+ // used.
+ bool FPLive = false, BPLive = false;
+ bool SpillFP = false, SpillBP = false;
+ auto DefMI = MI, KillMI = MI;
+ do {
+ SpillFP |= AccessFP;
+ SpillBP |= AccessBP;
+
+ // Maintain FPLive and BPLive.
+ if (FPLive && MI->findRegisterDefOperandIdx(FP, TRI, false, true) != -1)
+ FPLive = false;
+ if (FP && MI->findRegisterUseOperandIdx(FP, TRI, false) != -1)
+ FPLive = true;
+ if (BPLive && MI->findRegisterDefOperandIdx(BP, TRI, false, true) != -1)
+ BPLive = false;
+ if (BP && MI->findRegisterUseOperandIdx(BP, TRI, false) != -1)
+ BPLive = true;
+
+ DefMI = MI++;
+ } while ((MI != ME) &&
+ (FPLive || BPLive ||
+ isFPBPAccess(*MI, FP, BP, TRI, AccessFP, AccessBP)));
+
+ // Don't need to save/restore if FP is accessed through llvm.frameaddress.
+ if (FPLive && !SpillBP)
+ continue;
+
+ // If the bp is clobbered by a call, we should save and restore outside of
+ // the frame setup instructions.
+ if (KillMI->isCall() && DefMI != ME) {
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ auto FrameSetup = std::next(DefMI);
+ // Look for frame setup instruction toward the start of the BB.
+ // If we reach another call instruction, it means no frame setup
+ // instruction for the current call instruction.
+ while (FrameSetup != ME && !TII.isFrameSetup(*FrameSetup) &&
+ !FrameSetup->isCall())
+ ++FrameSetup;
+ // If a frame setup instruction is found, we need to find out the
+ // corresponding frame destroy instruction.
+ if (FrameSetup != ME && TII.isFrameSetup(*FrameSetup)) {
+ while (!TII.isFrameInstr(*KillMI))
+ --KillMI;
+ DefMI = FrameSetup;
+ MI = DefMI;
+ ++MI;
+ }
+ }
+
+ // Call target function to spill and restore FP and BP registers.
+ saveAndRestoreFPBPUsingSP(MF, &(*DefMI), &(*KillMI), SpillFP, SpillBP);
+ }
+ }
+}
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index 2dc9ecc6109d..e21f6ab3d16d 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -103,6 +103,8 @@ public:
MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
+ void spillFPBP(MachineFunction &MF) const override;
+
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
@@ -267,6 +269,29 @@ private:
void emitCatchRetReturnValue(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineInstr *CatchRet) const;
+
+ /// Issue instructions to allocate stack space and spill frame pointer and/or
+ /// base pointer to stack using stack pointer register.
+ void spillFPBPUsingSP(MachineFunction &MF,
+ const MachineBasicBlock::iterator BeforeMI, Register FP,
+ Register BP, int SPAdjust) const;
+
+ /// Issue instructions to restore frame pointer and/or base pointer from stack
+ /// using stack pointer register, and free stack space.
+ void restoreFPBPUsingSP(MachineFunction &MF,
+ const MachineBasicBlock::iterator AfterMI,
+ Register FP, Register BP, int SPAdjust) const;
+
+ void saveAndRestoreFPBPUsingSP(MachineFunction &MF,
+ MachineBasicBlock::iterator BeforeMI,
+ MachineBasicBlock::iterator AfterMI,
+ bool SpillFP, bool SpillBP) const;
+
+ // If MI uses fp/bp, but target can handle it, and doesn't want to be spilled
+ // again, this function should return true, and update MI so we will not check
+ // any instructions from related sequence.
+ bool skipSpillFPBP(MachineFunction &MF,
+ MachineBasicBlock::reverse_iterator &MI) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fff65a1bd967..2891e21be1b2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34069,6 +34069,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VMINMAX_SAE)
NODE_NAME_CASE(VMINMAXS)
NODE_NAME_CASE(VMINMAXS_SAE)
+ NODE_NAME_CASE(CVTP2IBS)
+ NODE_NAME_CASE(CVTP2IUBS)
+ NODE_NAME_CASE(CVTP2IBS_RND)
+ NODE_NAME_CASE(CVTP2IUBS_RND)
+ NODE_NAME_CASE(CVTTP2IBS)
+ NODE_NAME_CASE(CVTTP2IUBS)
+ NODE_NAME_CASE(CVTTP2IBS_SAE)
+ NODE_NAME_CASE(CVTTP2IUBS_SAE)
NODE_NAME_CASE(AESENC128KL)
NODE_NAME_CASE(AESDEC128KL)
NODE_NAME_CASE(AESENC256KL)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b985f7529ea2..2e7538cb3c11 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -607,6 +607,15 @@ namespace llvm {
VMINMAXS,
VMINMAXS_SAE,
+ CVTP2IBS,
+ CVTP2IUBS,
+ CVTP2IBS_RND,
+ CVTP2IUBS_RND,
+ CVTTP2IBS,
+ CVTTP2IUBS,
+ CVTTP2IBS_SAE,
+ CVTTP2IUBS_SAE,
+
MPSADBW,
// Compress and expand.
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 8e4586f2002d..fe381b377826 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -451,3 +451,176 @@ defm VMINMAXSH : avx10_minmax_scalar<"vminmaxsh", v8f16x_info, X86vminmaxs, X86v
AVX512PSIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<16, CD8VT1>, TA;
defm VMINMAXSS : avx10_minmax_scalar<"vminmaxss", v4f32x_info, X86vminmaxs, X86vminmaxsSae>,
AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
+
+//-------------------------------------------------
+// AVX10 SATCVT instructions
+//-------------------------------------------------
+
+multiclass avx10_sat_cvt_rmb<bits<8> Opc, string OpStr, X86FoldableSchedWrite sched,
+ X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo,
+ SDNode MaskNode> {
+ defm rr: AVX512_maskable<Opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src), OpStr, "$src", "$src",
+ (DestInfo.VT (MaskNode (SrcInfo.VT SrcInfo.RC:$src)))>,
+ Sched<[sched]>;
+ defm rm: AVX512_maskable<Opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.MemOp:$src), OpStr, "$src", "$src",
+ (DestInfo.VT (MaskNode (SrcInfo.VT
+ (SrcInfo.LdFrag addr:$src))))>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm rmb: AVX512_maskable<Opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src), OpStr,
+ "${src}"#SrcInfo.BroadcastStr, "${src}"#SrcInfo.BroadcastStr,
+ (DestInfo.VT (MaskNode (SrcInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src))))>, EVEX_B,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+// Conversion with rounding control (RC)
+multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo,
+ SDNode MaskNode> {
+ let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in
+ defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512,
+ (outs DestInfo.info512.RC:$dst),
+ (ins SrcInfo.info512.RC:$src, AVX512RC:$rc),
+ OpStr, "$rc, $src", "$src, $rc",
+ (DestInfo.info512.VT
+ (MaskNode (SrcInfo.info512.VT SrcInfo.info512.RC:$src),
+ (i32 timm:$rc)))>,
+ Sched<[sched.ZMM]>, EVEX, EVEX_RC, EVEX_B;
+ let Predicates = [HasAVX10_2], hasEVEX_U = 1 in {
+ defm Z256rrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info256,
+ (outs DestInfo.info256.RC:$dst),
+ (ins SrcInfo.info256.RC:$src, AVX512RC:$rc),
+ OpStr, "$rc, $src", "$src, $rc",
+ (DestInfo.info256.VT
+ (MaskNode (SrcInfo.info256.VT SrcInfo.info256.RC:$src),
+ (i32 timm:$rc)))>,
+ Sched<[sched.YMM]>, EVEX, EVEX_RC, EVEX_B;
+ }
+}
+
+// Conversion with SAE
+multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
+ AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo,
+ SDNode Node> {
+ let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in
+ defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512,
+ (outs DestInfo.info512.RC:$dst),
+ (ins SrcInfo.info512.RC:$src),
+ OpStr, "{sae}, $src", "$src, {sae}",
+ (DestInfo.info512.VT
+ (Node (SrcInfo.info512.VT SrcInfo.info512.RC:$src)))>,
+ Sched<[sched.ZMM]>, EVEX, EVEX_B;
+ let Predicates = [HasAVX10_2], hasEVEX_U = 1 in {
+ defm Z256rrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info256,
+ (outs DestInfo.info256.RC:$dst),
+ (ins SrcInfo.info256.RC:$src),
+ OpStr, "{sae}, $src", "$src, {sae}",
+ (DestInfo.info256.VT
+ (Node (SrcInfo.info256.VT SrcInfo.info256.RC:$src)))>,
+ Sched<[sched.YMM]>, EVEX, EVEX_B;
+ }
+}
+
+multiclass avx10_sat_cvt_base<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
+ SDNode MaskNode, AVX512VLVectorVTInfo DestInfo,
+ AVX512VLVectorVTInfo SrcInfo> {
+ let Predicates = [HasAVX10_2_512] in
+ defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM,
+ DestInfo.info512, SrcInfo.info512,
+ MaskNode>,
+ EVEX, EVEX_V512;
+ let Predicates = [HasAVX10_2] in {
+ defm Z256
+ : avx10_sat_cvt_rmb<Opc, OpStr, sched.YMM,
+ DestInfo.info256, SrcInfo.info256,
+ MaskNode>,
+ EVEX, EVEX_V256;
+ defm Z128
+ : avx10_sat_cvt_rmb<Opc, OpStr, sched.XMM,
+ DestInfo.info128, SrcInfo.info128,
+ MaskNode>,
+ EVEX, EVEX_V128;
+ }
+}
+
+defm VCVTNEBF162IBS : avx10_sat_cvt_base<0x69, "vcvtnebf162ibs",
+ SchedWriteVecIMul, X86vcvtp2ibs,
+ avx512vl_i16_info, avx512vl_bf16_info>,
+ AVX512XDIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>;
+defm VCVTNEBF162IUBS : avx10_sat_cvt_base<0x6b, "vcvtnebf162iubs",
+ SchedWriteVecIMul, X86vcvtp2iubs,
+ avx512vl_i16_info, avx512vl_bf16_info>,
+ AVX512XDIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>;
+
+defm VCVTPH2IBS : avx10_sat_cvt_base<0x69, "vcvtph2ibs", SchedWriteVecIMul,
+ X86vcvtp2ibs, avx512vl_i16_info,
+ avx512vl_f16_info>,
+ avx10_sat_cvt_rc<0x69, "vcvtph2ibs", SchedWriteVecIMul,
+ avx512vl_i16_info, avx512vl_f16_info,
+ X86vcvtp2ibsRnd>,
+ AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>;
+defm VCVTPH2IUBS : avx10_sat_cvt_base<0x6b, "vcvtph2iubs", SchedWriteVecIMul,
+ X86vcvtp2iubs, avx512vl_i16_info,
+ avx512vl_f16_info>,
+ avx10_sat_cvt_rc<0x6b, "vcvtph2iubs", SchedWriteVecIMul,
+ avx512vl_i16_info, avx512vl_f16_info,
+ X86vcvtp2iubsRnd>,
+ AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>;
+
+defm VCVTPS2IBS : avx10_sat_cvt_base<0x69, "vcvtps2ibs", SchedWriteVecIMul,
+ X86vcvtp2ibs, avx512vl_i32_info,
+ avx512vl_f32_info>,
+ avx10_sat_cvt_rc<0x69, "vcvtps2ibs", SchedWriteVecIMul,
+ avx512vl_i32_info, avx512vl_f32_info,
+ X86vcvtp2ibsRnd>,
+ AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>;
+defm VCVTPS2IUBS : avx10_sat_cvt_base<0x6b, "vcvtps2iubs", SchedWriteVecIMul,
+ X86vcvtp2iubs, avx512vl_i32_info,
+ avx512vl_f32_info>,
+ avx10_sat_cvt_rc<0x6b, "vcvtps2iubs", SchedWriteVecIMul,
+ avx512vl_i32_info, avx512vl_f32_info,
+ X86vcvtp2iubsRnd>,
+ AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTNEBF162IBS : avx10_sat_cvt_base<0x68, "vcvttnebf162ibs",
+ SchedWriteVecIMul, X86vcvttp2ibs,
+ avx512vl_i16_info, avx512vl_bf16_info>,
+ AVX512XDIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>;
+defm VCVTTNEBF162IUBS : avx10_sat_cvt_base<0x6a, "vcvttnebf162iubs",
+ SchedWriteVecIMul, X86vcvttp2iubs,
+ avx512vl_i16_info, avx512vl_bf16_info>,
+ AVX512XDIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>;
+
+defm VCVTTPH2IBS : avx10_sat_cvt_base<0x68, "vcvttph2ibs", SchedWriteVecIMul,
+ X86vcvttp2ibs, avx512vl_i16_info,
+ avx512vl_f16_info>,
+ avx10_sat_cvt_sae<0x68, "vcvttph2ibs", SchedWriteVecIMul,
+ avx512vl_i16_info, avx512vl_f16_info,
+ X86vcvttp2ibsSAE>,
+ AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>;
+defm VCVTTPH2IUBS : avx10_sat_cvt_base<0x6a, "vcvttph2iubs", SchedWriteVecIMul,
+ X86vcvttp2iubs, avx512vl_i16_info,
+ avx512vl_f16_info>,
+ avx10_sat_cvt_sae<0x6a, "vcvttph2iubs", SchedWriteVecIMul,
+ avx512vl_i16_info, avx512vl_f16_info,
+ X86vcvttp2iubsSAE>,
+ AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VF>;
+
+defm VCVTTPS2IBS : avx10_sat_cvt_base<0x68, "vcvttps2ibs", SchedWriteVecIMul,
+ X86vcvttp2ibs, avx512vl_i32_info,
+ avx512vl_f32_info>,
+ avx10_sat_cvt_sae<0x68, "vcvttps2ibs", SchedWriteVecIMul,
+ avx512vl_i32_info, avx512vl_f32_info,
+ X86vcvttp2ibsSAE>,
+ AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>;
+defm VCVTTPS2IUBS : avx10_sat_cvt_base<0x6a, "vcvttps2iubs", SchedWriteVecIMul,
+ X86vcvttp2iubs, avx512vl_i32_info,
+ avx512vl_f32_info>,
+ avx10_sat_cvt_sae<0x6a, "vcvttps2iubs", SchedWriteVecIMul,
+ avx512vl_i32_info, avx512vl_f32_info,
+ X86vcvttp2iubsSAE>,
+ AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>;
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 78c76cacbfef..6db1cf7c9ee1 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -833,6 +833,20 @@ def X86vpdpwuuds : SDNode<"X86ISD::VPDPWUUDS", SDTVnni>;
def X86Vmpsadbw : SDNode<"X86ISD::MPSADBW", SDTX86PSADBW>;
+// in place saturated cvt fp-to-int
+def X86vcvtp2ibs : SDNode<"X86ISD::CVTP2IBS", SDTFloatToInt>;
+def X86vcvtp2iubs : SDNode<"X86ISD::CVTP2IUBS", SDTFloatToInt>;
+
+def X86vcvtp2ibsRnd : SDNode<"X86ISD::CVTP2IBS_RND", SDTFloatToIntRnd>;
+def X86vcvtp2iubsRnd : SDNode<"X86ISD::CVTP2IUBS_RND", SDTFloatToIntRnd>;
+
+// in place saturated cvtt fp-to-int staff
+def X86vcvttp2ibs : SDNode<"X86ISD::CVTTP2IBS", SDTFloatToInt>;
+def X86vcvttp2iubs : SDNode<"X86ISD::CVTTP2IUBS", SDTFloatToInt>;
+
+def X86vcvttp2ibsSAE : SDNode<"X86ISD::CVTTP2IBS_SAE", SDTFloatToInt>;
+def X86vcvttp2iubsSAE : SDNode<"X86ISD::CVTTP2IUBS_SAE", SDTFloatToInt>;
+
//===----------------------------------------------------------------------===//
// SSE pattern fragments
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td
index 8387b76a40cd..208af630a352 100644
--- a/llvm/lib/Target/X86/X86InstrUtils.td
+++ b/llvm/lib/Target/X86/X86InstrUtils.td
@@ -313,7 +313,7 @@ def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">;
def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">;
def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">;
def v32f16_info : X86VectorVTInfo<32, f16, VR512, "ph">;
-def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbf">;
+def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbh">;
def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">;
def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">;
@@ -323,7 +323,7 @@ def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">;
def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">;
def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">;
def v16f16x_info : X86VectorVTInfo<16, f16, VR256X, "ph">;
-def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbf">;
+def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbh">;
def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">;
def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">;
@@ -332,7 +332,7 @@ def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">;
def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">;
def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">;
def v8f16x_info : X86VectorVTInfo<8, f16, VR128X, "ph">;
-def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbf">;
+def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbh">;
def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">;
def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 536391da295d..47be08c8af3e 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -408,6 +408,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
X86_INTRINSIC_DATA(avx10_mask_vcvtph2dq256, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtph2ibs128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IBS, 0),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtph2ibs256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IBS, X86ISD::CVTP2IBS_RND),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtph2ibs512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IBS, X86ISD::CVTP2IBS_RND),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtph2iubs128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtph2iubs256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IUBS, X86ISD::CVTP2IUBS_RND),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtph2iubs512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IUBS, X86ISD::CVTP2IUBS_RND),
X86_INTRINSIC_DATA(avx10_mask_vcvtph2pd256, INTR_TYPE_1OP_MASK_SAE,
ISD::FP_EXTEND, X86ISD::VFPEXT_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvtph2psx256, INTR_TYPE_1OP_MASK_SAE,
@@ -424,6 +436,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx10_mask_vcvtps2dq256, INTR_TYPE_1OP_MASK,
X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtps2ibs128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IBS, 0),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtps2ibs256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IBS, X86ISD::CVTP2IBS_RND),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtps2ibs512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IBS, X86ISD::CVTP2IBS_RND),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtps2iubs128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtps2iubs256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IUBS, X86ISD::CVTP2IUBS_RND),
+ X86_INTRINSIC_DATA(avx10_mask_vcvtps2iubs512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2IUBS, X86ISD::CVTP2IUBS_RND),
X86_INTRINSIC_DATA(avx10_mask_vcvtps2pd256, INTR_TYPE_1OP_MASK_SAE,
ISD::FP_EXTEND, X86ISD::VFPEXT_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvtps2phx256, INTR_TYPE_1OP_MASK,
@@ -444,6 +468,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvttph2dq256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttph2ibs128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2IBS, 0),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttph2ibs256, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2IBS, X86ISD::CVTTP2IBS_SAE),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttph2ibs512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2IBS, X86ISD::CVTTP2IBS_SAE),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttph2iubs128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttph2iubs256, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2IUBS, X86ISD::CVTTP2IUBS_SAE),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttph2iubs512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2IUBS, X86ISD::CVTTP2IUBS_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvttph2qq256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvttph2udq256, INTR_TYPE_1OP_MASK,
@@ -456,6 +492,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvttps2dq256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttps2ibs128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2IBS, 0),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttps2ibs256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2IBS, X86ISD::CVTTP2IBS_SAE),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttps2ibs512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2IBS, X86ISD::CVTTP2IBS_SAE),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttps2iubs128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttps2iubs256, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2IUBS, X86ISD::CVTTP2IUBS_SAE),
+ X86_INTRINSIC_DATA(avx10_mask_vcvttps2iubs512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2IUBS, X86ISD::CVTTP2IUBS_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvttps2qq256, INTR_TYPE_1OP_MASK,
X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
X86_INTRINSIC_DATA(avx10_mask_vcvttps2udq256, INTR_TYPE_1OP_MASK,
@@ -546,6 +594,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx10_vaddps256, INTR_TYPE_2OP, ISD::FADD,
X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx10_vcvtnebf162ibs128, INTR_TYPE_1OP, X86ISD::CVTP2IBS,
+ 0),
+ X86_INTRINSIC_DATA(avx10_vcvtnebf162ibs256, INTR_TYPE_1OP, X86ISD::CVTP2IBS,
+ 0),
+ X86_INTRINSIC_DATA(avx10_vcvtnebf162ibs512, INTR_TYPE_1OP, X86ISD::CVTP2IBS,
+ 0),
+ X86_INTRINSIC_DATA(avx10_vcvtnebf162iubs128, INTR_TYPE_1OP,
+ X86ISD::CVTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_vcvtnebf162iubs256, INTR_TYPE_1OP,
+ X86ISD::CVTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_vcvtnebf162iubs512, INTR_TYPE_1OP,
+ X86ISD::CVTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_vcvttnebf162ibs128, INTR_TYPE_1OP,
+ X86ISD::CVTTP2IBS, 0),
+ X86_INTRINSIC_DATA(avx10_vcvttnebf162ibs256, INTR_TYPE_1OP,
+ X86ISD::CVTTP2IBS, 0),
+ X86_INTRINSIC_DATA(avx10_vcvttnebf162ibs512, INTR_TYPE_1OP,
+ X86ISD::CVTTP2IBS, 0),
+ X86_INTRINSIC_DATA(avx10_vcvttnebf162iubs128, INTR_TYPE_1OP,
+ X86ISD::CVTTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_vcvttnebf162iubs256, INTR_TYPE_1OP,
+ X86ISD::CVTTP2IUBS, 0),
+ X86_INTRINSIC_DATA(avx10_vcvttnebf162iubs512, INTR_TYPE_1OP,
+ X86ISD::CVTTP2IUBS, 0),
X86_INTRINSIC_DATA(avx10_vdivpd256, INTR_TYPE_2OP, ISD::FDIV,
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx10_vdivph256, INTR_TYPE_2OP, ISD::FDIV,
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 555ede9e9540..3376367cc76b 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -45,6 +45,12 @@ static cl::opt<bool>
EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
+static cl::opt<bool>
+ DisableRegAllocNDDHints("x86-disable-regalloc-hints-for-ndd", cl::Hidden,
+ cl::init(false),
+ cl::desc("Disable two address hints for register "
+ "allocation"));
+
X86RegisterInfo::X86RegisterInfo(const Triple &TT)
: X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP),
X86_MC::getDwarfRegFlavour(TT, false),
@@ -1080,10 +1086,57 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
VirtReg, Order, Hints, MF, VRM, Matrix);
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ const TargetRegisterInfo &TRI = *ST.getRegisterInfo();
unsigned ID = RC.getID();
- if (ID != X86::TILERegClassID)
+
+ if (!VRM)
+ return BaseImplRetVal;
+
+ if (ID != X86::TILERegClassID) {
+ if (DisableRegAllocNDDHints || !ST.hasNDD() ||
+ !TRI.isGeneralPurposeRegisterClass(&RC))
+ return BaseImplRetVal;
+
+ // Add any two address hints after any copy hints.
+ SmallSet<unsigned, 4> TwoAddrHints;
+
+ auto TryAddNDDHint = [&](const MachineOperand &MO) {
+ Register Reg = MO.getReg();
+ Register PhysReg =
+ Register::isPhysicalRegister(Reg) ? Reg : Register(VRM->getPhys(Reg));
+ if (PhysReg && !MRI->isReserved(PhysReg) && !is_contained(Hints, PhysReg))
+ TwoAddrHints.insert(PhysReg);
+ };
+
+ // NDD instructions is compressible when Op0 is allocated to the same
+ // physic register as Op1 (or Op2 if it's commutable).
+ for (auto &MO : MRI->reg_nodbg_operands(VirtReg)) {
+ const MachineInstr &MI = *MO.getParent();
+ if (!X86::getNonNDVariant(MI.getOpcode()))
+ continue;
+ unsigned OpIdx = MI.getOperandNo(&MO);
+ if (OpIdx == 0) {
+ assert(MI.getOperand(1).isReg());
+ TryAddNDDHint(MI.getOperand(1));
+ if (MI.isCommutable()) {
+ assert(MI.getOperand(2).isReg());
+ TryAddNDDHint(MI.getOperand(2));
+ }
+ } else if (OpIdx == 1) {
+ TryAddNDDHint(MI.getOperand(0));
+ } else if (MI.isCommutable() && OpIdx == 2) {
+ TryAddNDDHint(MI.getOperand(0));
+ }
+ }
+
+ for (MCPhysReg OrderReg : Order)
+ if (TwoAddrHints.count(OrderReg))
+ Hints.push_back(OrderReg);
+
return BaseImplRetVal;
+ }
ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
auto AddHint = [&](MCPhysReg PhysReg) {