summary | refs | log | tree | commit | diff
path: root/llvm/lib/Target
diff options
context:
space:
mode:
author Mehdi Amini <joker.eph@gmail.com> 2025-08-14 15:36:46 +0200
committer GitHub <noreply@github.com> 2025-08-14 15:36:46 +0200
commit df57d6a01e85ca78da2febab21b268d9fd6955a0 (patch)
tree 19b0aab453e6bc7e2b15d3220024dfdacd4fa57e /llvm/lib/Target
parent df86ea61b7ed484ca797f96d7ad40fd9ada7ba30 (diff)
parent 7bda76367f19cfc19086f68d9dd5ac019a9ceccd (diff)
Merge branch 'main' into users/joker-eph-python-bindings-maintainers [users/joker-eph-python-bindings-maintainers]
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp 16
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 47
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp 21
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h 5
-rw-r--r-- llvm/lib/Target/AArch64/SMEABIPass.cpp 31
-rw-r--r-- llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp 41
-rw-r--r-- llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h 18
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.h 4
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp 119
-rw-r--r-- llvm/lib/Target/AMDGPU/CMakeLists.txt 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SIDefines.h 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp 5
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td 1
-rw-r--r-- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp 56
-rw-r--r-- llvm/lib/Target/AMDGPU/SOPInstructions.td 4
-rw-r--r-- llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp 4
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp 169
-rw-r--r-- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp 42
-rw-r--r-- llvm/lib/Target/ARM/ARMTargetTransformInfo.h 5
-rw-r--r-- llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp 4
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp 7
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h 5
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp 8
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td 18
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp 7
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h 1
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp 33
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.h 4
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td 5
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp 47
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h 1
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.td 14
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoP.td 92
-rw-r--r-- llvm/lib/Target/RISCV/RISCVSchedAndes45.td 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp 22
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp 12
-rw-r--r-- llvm/lib/Target/X86/X86TargetTransformInfo.cpp 9
-rw-r--r-- llvm/lib/Target/X86/X86TargetTransformInfo.h 5
43 files changed, 383 insertions, 525 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ba02c82b25aa..885f2a94f85f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1487,11 +1487,8 @@ bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
if (Opc == AArch64::BL) {
auto Op1 = MBBI->getOperand(0);
- auto &TLI =
- *MBBI->getMF()->getSubtarget<AArch64Subtarget>().getTargetLowering();
- char const *GetCurrentVG =
- TLI.getLibcallName(RTLIB::SMEABI_GET_CURRENT_VG);
- return Op1.isSymbol() && StringRef(Op1.getSymbolName()) == GetCurrentVG;
+ return Op1.isSymbol() &&
+ (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
}
}
@@ -3471,7 +3468,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
- auto &TLI = *MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool NeedsWinCFI = needsWinCFI(MF);
@@ -3585,11 +3581,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
.addReg(AArch64::X0, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
- RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG;
- const uint32_t *RegMask =
- TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(LC));
+ const uint32_t *RegMask = TRI->getCallPreservedMask(
+ MF,
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
- .addExternalSymbol(TLI.getLibcallName(LC))
+ .addExternalSymbol("__arm_get_current_vg")
.addRegMask(RegMask)
.addReg(AArch64::X0, RegState::ImplicitDefine)
.setMIFlag(MachineInstr::FrameSetup);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 224bbe7e38a1..2072e48914ae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3083,12 +3083,13 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
if (FuncInfo->isSMESaveBufferUsed()) {
- RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
- .addExternalSymbol(getLibcallName(LC))
+ .addExternalSymbol("__arm_sme_state_size")
.addReg(AArch64::X0, RegState::ImplicitDefine)
- .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
+ .addRegMask(TRI->getCallPreservedMask(
+ *MF, CallingConv::
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
.addReg(AArch64::X0);
@@ -3108,12 +3109,13 @@ AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
Register ResultReg = MI.getOperand(0).getReg();
if (FuncInfo->isPStateSMRegUsed()) {
- RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
- .addExternalSymbol(getLibcallName(LC))
+ .addExternalSymbol("__arm_sme_state")
.addReg(AArch64::X0, RegState::ImplicitDefine)
- .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
+ .addRegMask(TRI->getCallPreservedMask(
+ *MF, CallingConv::
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2));
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg)
.addReg(AArch64::X0);
} else {
@@ -5737,15 +5739,15 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
SDValue Chain, SDLoc DL,
EVT VT) const {
- RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
- SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
getPointerTy(DAG.getDataLayout()));
Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
Type *RetTy = StructType::get(Int64Ty, Int64Ty);
TargetLowering::CallLoweringInfo CLI(DAG);
ArgListTy Args;
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
+ RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
@@ -8598,12 +8600,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
}
static SMECallAttrs
-getSMECallAttrs(const Function &Caller, const TargetLowering &TLI,
+getSMECallAttrs(const Function &Caller,
const TargetLowering::CallLoweringInfo &CLI) {
if (CLI.CB)
- return SMECallAttrs(*CLI.CB, &TLI);
+ return SMECallAttrs(*CLI.CB);
if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
- return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
+ return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol()));
return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
}
@@ -8625,7 +8627,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// SME Streaming functions are not eligible for TCO as they may require
// the streaming mode or ZA to be restored after returning from the call.
- SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
+ SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, CLI);
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingAllZAState() ||
CallAttrs.caller().hasStreamingBody())
@@ -8919,14 +8921,14 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64);
Args.push_back(Entry);
- RTLIB::Libcall LC =
- IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
- SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Callee =
+ DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore",
+ TLI.getPointerTy(DAG.getDataLayout()));
auto *RetTy = Type::getVoidTy(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy,
+ Callee, std::move(Args));
return TLI.LowerCallTo(CLI).second;
}
@@ -9114,7 +9116,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Determine whether we need any streaming mode changes.
- SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+ SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), CLI);
auto DescribeCallsite =
[&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
@@ -9691,12 +9693,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresLazySave) {
// Conditionally restore the lazy save using a pseudo node.
- RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
SDValue RegMask = DAG.getRegisterMask(
- TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
+ TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
- getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+ "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
SDValue TPIDR2_EL0 = DAG.getNode(
ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
@@ -29035,7 +29036,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
// Checks to allow the use of SME instructions
if (auto *Base = dyn_cast<CallBase>(&Inst)) {
- auto CallAttrs = SMECallAttrs(*Base, this);
+ auto CallAttrs = SMECallAttrs(*Base);
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingZT0() ||
CallAttrs.requiresPreservingAllZAState())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index fb59c9f131fb..a55f103bff38 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5920,7 +5920,7 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
// Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
SmallString<64> Expr;
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
- assert(DwarfReg >= 0 && DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
+ assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
// Reg + NumBytes
Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4523c659dd39..3fba7e853eaf 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -220,16 +220,20 @@ static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
"enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
-static bool isSMEABIRoutineCall(const CallInst &CI, const TargetLowering &TLI) {
+static bool isSMEABIRoutineCall(const CallInst &CI) {
const auto *F = CI.getCalledFunction();
- return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
+ return F && StringSwitch<bool>(F->getName())
+ .Case("__arm_sme_state", true)
+ .Case("__arm_tpidr2_save", true)
+ .Case("__arm_tpidr2_restore", true)
+ .Case("__arm_za_disable", true)
+ .Default(false);
}
/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
-static bool hasPossibleIncompatibleOps(const Function *F,
- const TargetLowering &TLI) {
+static bool hasPossibleIncompatibleOps(const Function *F) {
for (const BasicBlock &BB : *F) {
for (const Instruction &I : BB) {
// Be conservative for now and assume that any call to inline asm or to
@@ -238,7 +242,7 @@ static bool hasPossibleIncompatibleOps(const Function *F,
// all native LLVM instructions can be lowered to compatible instructions.
if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
(cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
- isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
+ isSMEABIRoutineCall(cast<CallInst>(I))))
return true;
}
}
@@ -286,7 +290,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
CallAttrs.requiresPreservingZT0() ||
CallAttrs.requiresPreservingAllZAState()) {
- if (hasPossibleIncompatibleOps(Callee, *getTLI()))
+ if (hasPossibleIncompatibleOps(Callee))
return false;
}
@@ -353,7 +357,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
// change only once and avoid inlining of G into F.
SMEAttrs FAttrs(*F);
- SMECallAttrs CallAttrs(Call, getTLI());
+ SMECallAttrs CallAttrs(Call);
if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
if (F == Call.getCaller()) // (1)
@@ -4333,7 +4337,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
InstructionCost
AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
- const SCEV *Ptr) const {
+ const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 647b242d74fb..9c96fdd42781 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -238,8 +238,9 @@ public:
ArrayRef<const Value *> Args = {},
const Instruction *CxtI = nullptr) const override;
- InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
- const SCEV *Ptr) const override;
+ InstructionCost
+ getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const override;
InstructionCost getCmpSelInstrCost(
unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp
index 2008516885c3..4af4d4930662 100644
--- a/llvm/lib/Target/AArch64/SMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp
@@ -15,16 +15,11 @@
#include "AArch64.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/RuntimeLibcalls.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
@@ -38,13 +33,9 @@ struct SMEABI : public FunctionPass {
bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetPassConfig>();
- }
-
private:
bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder,
- SMEAttrs FnAttrs, const TargetLowering &TLI);
+ SMEAttrs FnAttrs);
};
} // end anonymous namespace
@@ -60,16 +51,14 @@ FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); }
//===----------------------------------------------------------------------===//
// Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0.
-void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI,
- bool ZT0IsUndef = false) {
+void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) {
auto &Ctx = M->getContext();
auto *TPIDR2SaveTy =
FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false);
auto Attrs =
AttributeList().addFnAttribute(Ctx, "aarch64_pstate_sm_compatible");
- RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_SAVE;
FunctionCallee Callee =
- M->getOrInsertFunction(TLI.getLibcallName(LC), TPIDR2SaveTy, Attrs);
+ M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs);
CallInst *Call = Builder.CreateCall(Callee);
// If ZT0 is undefined (i.e. we're at the entry of a "new_zt0" function), mark
@@ -78,7 +67,8 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI,
if (ZT0IsUndef)
Call->addFnAttr(Attribute::get(Ctx, "aarch64_zt0_undef"));
- Call->setCallingConv(TLI.getLibcallCallingConv(LC));
+ Call->setCallingConv(
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0);
// A save to TPIDR2 should be followed by clearing TPIDR2_EL0.
Function *WriteIntr =
@@ -108,8 +98,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI,
/// interface if it does not share ZA or ZT0.
///
bool SMEABI::updateNewStateFunctions(Module *M, Function *F,
- IRBuilder<> &Builder, SMEAttrs FnAttrs,
- const TargetLowering &TLI) {
+ IRBuilder<> &Builder, SMEAttrs FnAttrs) {
LLVMContext &Context = F->getContext();
BasicBlock *OrigBB = &F->getEntryBlock();
Builder.SetInsertPoint(&OrigBB->front());
@@ -135,7 +124,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F,
// Create a call __arm_tpidr2_save, which commits the lazy save.
Builder.SetInsertPoint(&SaveBB->back());
- emitTPIDR2Save(M, Builder, TLI, /*ZT0IsUndef=*/FnAttrs.isNewZT0());
+ emitTPIDR2Save(M, Builder, /*ZT0IsUndef=*/FnAttrs.isNewZT0());
// Enable pstate.za at the start of the function.
Builder.SetInsertPoint(&OrigBB->front());
@@ -183,14 +172,10 @@ bool SMEABI::runOnFunction(Function &F) {
if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za"))
return false;
- const TargetMachine &TM =
- getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
- const TargetLowering &TLI = *TM.getSubtargetImpl(F)->getTargetLowering();
-
bool Changed = false;
SMEAttrs FnAttrs(F);
if (FnAttrs.isNewZA() || FnAttrs.isNewZT0())
- Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs, TLI);
+ Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs);
return Changed;
}
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index 934f68b29922..271094f935e0 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -7,9 +7,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64SMEAttributes.h"
-#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/RuntimeLibcalls.h"
#include <cassert>
using namespace llvm;
@@ -79,36 +77,19 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) {
Bitmask |= encodeZT0State(StateValue::New);
}
-void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName,
- const TargetLowering &TLI) {
- RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName);
- if (Impl == RTLIB::Unsupported)
- return;
- RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName) {
unsigned KnownAttrs = SMEAttrs::Normal;
- switch (LC) {
- case RTLIB::SMEABI_SME_STATE:
- case RTLIB::SMEABI_TPIDR2_SAVE:
- case RTLIB::SMEABI_GET_CURRENT_VG:
- case RTLIB::SMEABI_SME_STATE_SIZE:
- case RTLIB::SMEABI_SME_SAVE:
- case RTLIB::SMEABI_SME_RESTORE:
- KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine;
- break;
- case RTLIB::SMEABI_ZA_DISABLE:
- case RTLIB::SMEABI_TPIDR2_RESTORE:
+ if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state")
+ KnownAttrs |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine);
+ if (FuncName == "__arm_tpidr2_restore")
KnownAttrs |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) |
SMEAttrs::SME_ABI_Routine;
- break;
- case RTLIB::SC_MEMCPY:
- case RTLIB::SC_MEMMOVE:
- case RTLIB::SC_MEMSET:
- case RTLIB::SC_MEMCHR:
+ if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" ||
+ FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr")
KnownAttrs |= SMEAttrs::SM_Compatible;
- break;
- default:
- break;
- }
+ if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" ||
+ FuncName == "__arm_sme_state_size")
+ KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine;
set(KnownAttrs);
}
@@ -129,11 +110,11 @@ bool SMECallAttrs::requiresSMChange() const {
return true;
}
-SMECallAttrs::SMECallAttrs(const CallBase &CB, const TargetLowering *TLI)
+SMECallAttrs::SMECallAttrs(const CallBase &CB)
: CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal),
Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) {
if (auto *CalledFunction = CB.getCalledFunction())
- CalledFn = SMEAttrs(*CalledFunction, TLI);
+ CalledFn = SMEAttrs(*CalledFunction, SMEAttrs::InferAttrsFromName::Yes);
// FIXME: We probably should not allow SME attributes on direct calls but
// clang duplicates streaming mode attributes at each callsite.
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
index 06376c74025f..f1be0ecbee7e 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
@@ -13,8 +13,6 @@
namespace llvm {
-class TargetLowering;
-
class Function;
class CallBase;
class AttributeList;
@@ -50,17 +48,17 @@ public:
CallSiteFlags_Mask = ZT0_Undef
};
+ enum class InferAttrsFromName { No, Yes };
+
SMEAttrs() = default;
SMEAttrs(unsigned Mask) { set(Mask); }
- SMEAttrs(const Function &F, const TargetLowering *TLI = nullptr)
+ SMEAttrs(const Function &F, InferAttrsFromName Infer = InferAttrsFromName::No)
: SMEAttrs(F.getAttributes()) {
- if (TLI)
- addKnownFunctionAttrs(F.getName(), *TLI);
+ if (Infer == InferAttrsFromName::Yes)
+ addKnownFunctionAttrs(F.getName());
}
SMEAttrs(const AttributeList &L);
- SMEAttrs(StringRef FuncName, const TargetLowering &TLI) {
- addKnownFunctionAttrs(FuncName, TLI);
- };
+ SMEAttrs(StringRef FuncName) { addKnownFunctionAttrs(FuncName); };
void set(unsigned M, bool Enable = true);
@@ -148,7 +146,7 @@ public:
}
private:
- void addKnownFunctionAttrs(StringRef FuncName, const TargetLowering &TLI);
+ void addKnownFunctionAttrs(StringRef FuncName);
};
/// SMECallAttrs is a utility class to hold the SMEAttrs for a callsite. It has
@@ -165,7 +163,7 @@ public:
SMEAttrs Callsite = SMEAttrs::Normal)
: CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {}
- SMECallAttrs(const CallBase &CB, const TargetLowering *TLI);
+ SMECallAttrs(const CallBase &CB);
SMEAttrs &caller() { return CallerFn; }
SMEAttrs &callee() { return IsIndirect ? Callsite : CalledFn; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 007b481f8496..0059a862ba9b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
-struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &);
extern char &SIOptimizeExecMaskingPreRAID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6c6d927d0e8..6ddfa386e8ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,7 +29,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
-MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 5a6ad405a026..8c56c2162112 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -724,10 +724,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
addRulesForGOpcs({G_PTR_ADD})
- .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
- .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
- .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}})
- .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}});
+ .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
+ .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
+ .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
+ .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
addRulesForGOpcs({G_INTTOPTR})
.Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c1f17033d04a..e393aa198774 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -848,8 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (Level == OptimizationLevel::O0)
return;
- PM.addPass(AMDGPUUnifyMetadataPass());
-
// We don't want to run internalization at per-module stage.
if (InternalizeSymbols && !isLTOPreLink(Phase)) {
PM.addPass(InternalizePass(mustPreserveGV));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
deleted file mode 100644
index e400491c3860..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// \file
-// This pass that unifies multiple OpenCL metadata due to linking.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace {
-
- namespace kOCLMD {
-
- const char SpirVer[] = "opencl.spir.version";
- const char OCLVer[] = "opencl.ocl.version";
- const char UsedExt[] = "opencl.used.extensions";
- const char UsedOptCoreFeat[] = "opencl.used.optional.core.features";
- const char CompilerOptions[] = "opencl.compiler.options";
- const char LLVMIdent[] = "llvm.ident";
-
- } // end namespace kOCLMD
-
- /// Unify version metadata.
- /// \return true if changes are made.
- /// Assume the named metadata has operands each of which is a pair of
- /// integer constant, e.g.
- /// !Name = {!n1, !n2}
- /// !n1 = {i32 1, i32 2}
- /// !n2 = {i32 2, i32 0}
- /// Keep the largest version as the sole operand if PickFirst is false.
- /// Otherwise pick it from the first value, representing kernel module.
- bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) {
- auto *NamedMD = M.getNamedMetadata(Name);
- if (!NamedMD || NamedMD->getNumOperands() <= 1)
- return false;
- MDNode *MaxMD = nullptr;
- auto MaxVer = 0U;
- for (auto *VersionMD : NamedMD->operands()) {
- assert(VersionMD->getNumOperands() == 2);
- auto *CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0));
- auto VersionMajor = CMajor->getZExtValue();
- auto *CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1));
- auto VersionMinor = CMinor->getZExtValue();
- auto Ver = (VersionMajor * 100) + (VersionMinor * 10);
- if (Ver > MaxVer) {
- MaxVer = Ver;
- MaxMD = VersionMD;
- }
- if (PickFirst)
- break;
- }
- NamedMD->eraseFromParent();
- NamedMD = M.getOrInsertNamedMetadata(Name);
- NamedMD->addOperand(MaxMD);
- return true;
- }
-
- /// Unify version metadata.
- /// \return true if changes are made.
- /// Assume the named metadata has operands each of which is a list e.g.
- /// !Name = {!n1, !n2}
- /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}}
- /// !n2 = !{!"cl_khr_image"}
- /// Combine it into a single list with unique operands.
- bool unifyExtensionMD(Module &M, StringRef Name) {
- auto *NamedMD = M.getNamedMetadata(Name);
- if (!NamedMD || NamedMD->getNumOperands() == 1)
- return false;
-
- SmallVector<Metadata *, 4> All;
- for (auto *MD : NamedMD->operands())
- for (const auto &Op : MD->operands())
- if (!llvm::is_contained(All, Op.get()))
- All.push_back(Op.get());
-
- NamedMD->eraseFromParent();
- NamedMD = M.getOrInsertNamedMetadata(Name);
- for (const auto &MD : All)
- NamedMD->addOperand(MDNode::get(M.getContext(), MD));
-
- return true;
- }
-
- /// Unify multiple OpenCL metadata due to linking.
- bool unifyMetadataImpl(Module &M) {
- const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer};
- const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat,
- kOCLMD::CompilerOptions, kOCLMD::LLVMIdent};
-
- bool Changed = false;
-
- for (auto &I : Vers)
- Changed |= unifyVersionMD(M, I, true);
-
- for (auto &I : Exts)
- Changed |= unifyExtensionMD(M, I);
-
- return Changed;
- }
-
- } // end anonymous namespace
-
- PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- return unifyMetadataImpl(M) ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
- }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c466f9cf0f35..dc9dd220130e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -114,7 +114,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUTargetTransformInfo.cpp
AMDGPUWaitSGPRHazards.cpp
AMDGPUUnifyDivergentExitNodes.cpp
- AMDGPUUnifyMetadata.cpp
R600MachineCFGStructurizer.cpp
GCNCreateVOPD.cpp
GCNDPPCombine.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 2d0102fffe5e..7c019031ff24 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -197,7 +197,7 @@ enum ClassFlags : unsigned {
namespace AMDGPU {
enum OperandType : unsigned {
- /// Operands with register or 32-bit immediate
+ /// Operands with register, 32-bit, or 64-bit immediate
OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
OPERAND_REG_IMM_INT64,
OPERAND_REG_IMM_INT16,
@@ -407,7 +407,7 @@ enum CPol {
SCAL = 1 << 11, // Scale offset bit
- ALL = TH | SCOPE,
+ ALL = TH | SCOPE | NV,
// Helper bits
TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy
@@ -440,6 +440,7 @@ enum Id { // Message ID, width(4) [3:0].
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
ID_GS_ALLOC_REQ = 9, // added in GFX9
ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11
+ ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250
ID_GET_DDID = 11, // added in GFX10, removed in GFX11
ID_SYSMSG = 15,
@@ -513,6 +514,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
ID_PERF_SNAPSHOT_DATA_gfx11 = 27,
+ ID_IB_STS2 = 28,
ID_SHADER_CYCLES = 29,
ID_SHADER_CYCLES_HI = 30,
ID_DVGPR_ALLOC_LO = 31,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2e76225bbc54..f58fde421f77 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16894,6 +16894,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
+ // Check if we cannot determine the bit size of the given value type. This
+ // can happen, for example, in this situation where we have an empty struct
+ // (size 0): `call void asm "", "v"({} poison)`.
+ if (VT == MVT::Other)
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
const unsigned BitWidth = VT.getSizeInBits();
switch (Constraint[0]) {
default:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9278b859a806..c425d9753dd1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2708,7 +2708,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
isModifierType<Src2VT>.ret,
HasOMod);
field bit HasNeg = HasModifiers;
- field bit HasMatrixReuse = 0;
field bit HasMatrixFMT = 0;
field bit HasMatrixScale = 0;
field bit HasMatrixReuse = 0;
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f8878f32f829..f7a9a584a6b5 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -57,6 +57,7 @@
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -76,6 +77,7 @@ private:
LiveIntervals *LIS = nullptr;
LiveVariables *LV = nullptr;
MachineDominatorTree *MDT = nullptr;
+ MachinePostDominatorTree *PDT = nullptr;
MachineRegisterInfo *MRI = nullptr;
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
@@ -138,8 +140,8 @@ private:
public:
SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV,
- MachineDominatorTree *MDT)
- : LIS(LIS), LV(LV), MDT(MDT) {}
+ MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
+ : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {}
bool run(MachineFunction &MF);
};
@@ -159,6 +161,7 @@ public:
AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
// Should preserve the same set that TwoAddressInstructions does.
AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
AU.addPreserved<SlotIndexesWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<LiveVariablesWrapperPass>();
@@ -506,13 +509,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock *SplitBB = &MBB;
if (NeedBlockSplit) {
SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
- if (MDT && SplitBB != &MBB) {
- MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
- SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
- MBBNode->end());
- MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
- for (MachineDomTreeNode *Child : Children)
- MDT->changeImmediateDominator(Child, SplitBBNode);
+ if (SplitBB != &MBB && (MDT || PDT)) {
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+ if (MDT)
+ MDT->applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->applyUpdates(DTUpdates);
}
Opcode = OrTermrOpc;
InsPt = MI;
@@ -727,26 +735,27 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
MachineBasicBlock *Succ = *MBB.succ_begin();
MachineBasicBlock *FallThrough = nullptr;
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 8> DTUpdates;
+
while (!MBB.predecessors().empty()) {
MachineBasicBlock *P = *MBB.pred_begin();
if (P->getFallThrough(false) == &MBB)
FallThrough = P;
P->ReplaceUsesOfBlockWith(&MBB, Succ);
+ DTUpdates.push_back({DomTreeT::Insert, P, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, P, &MBB});
}
MBB.removeSuccessor(Succ);
if (LIS) {
for (auto &I : MBB.instrs())
LIS->RemoveMachineInstrFromMaps(I);
}
- if (MDT) {
- // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
- // updating by changing Succ's idom to the one of MBB; otherwise, MBB must
- // be a leaf node in MDT and could be erased directly.
- if (MDT->dominates(&MBB, Succ))
- MDT->changeImmediateDominator(MDT->getNode(Succ),
- MDT->getNode(&MBB)->getIDom());
- MDT->eraseNode(&MBB);
- }
+ if (MDT)
+ MDT->applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->applyUpdates(DTUpdates);
+
MBB.clear();
MBB.eraseFromParent();
if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
@@ -875,7 +884,11 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) {
LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr;
auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
- return SILowerControlFlow(LIS, LV, MDT).run(MF);
+ auto *PDTWrapper =
+ getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
+ MachinePostDominatorTree *PDT =
+ PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
+ return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
}
PreservedAnalyses
@@ -885,13 +898,16 @@ SILowerControlFlowPass::run(MachineFunction &MF,
LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF);
MachineDominatorTree *MDT =
MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
+ MachinePostDominatorTree *PDT =
+ MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
- bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF);
+ bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
if (!Changed)
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
PA.preserve<MachineDominatorTreeAnalysis>();
+ PA.preserve<MachinePostDominatorTreeAnalysis>();
PA.preserve<SlotIndexesAnalysis>();
PA.preserve<LiveIntervalsAnalysis>();
PA.preserve<LiveVariablesAnalysis>();
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index c2f4dbfa247d..a003a46191a8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1665,7 +1665,9 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
-def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
+def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> {
+ let SubtargetPredicate = isNotGFX1250Plus;
+}
// On SI the documentation says sleep for approximately 64 * low 2
// bits, consistent with the reported maximum of 448. On VI the
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 3d9455fc51a3..c740b5e0f09d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -106,7 +106,7 @@ static constexpr CustomOperand MsgOperands[] = {
{{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10},
{{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus},
{{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus},
- {{""}},
+ {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250},
{{"MSG_SYSMSG"}, ID_SYSMSG},
{{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus},
{{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus},
@@ -195,7 +195,7 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
{{""}},
{{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11},
- {{""}},
+ {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250},
{{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11},
{{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus},
{{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus},
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d386c917a256..8ea567cfb9d3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -587,167 +587,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
}
}
- // RTLIB
- if (TM.isAAPCS_ABI() && (TT.isTargetAEABI() || TT.isTargetGNUAEABI() ||
- TT.isTargetMuslAEABI() || TT.isAndroid())) {
- // FIXME: This does not depend on the subtarget and should go directly into
- // RuntimeLibcalls. This is only here because of missing support for setting
- // the calling convention of an implementation.
- // clang-format off
- static const struct {
- const RTLIB::Libcall Op;
- const RTLIB::LibcallImpl Impl;
- } LibraryCalls[] = {
- // Double-precision floating-point arithmetic helper functions
- // RTABI chapter 4.1.2, Table 2
- { RTLIB::ADD_F64, RTLIB::__aeabi_dadd },
- { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv },
- { RTLIB::MUL_F64, RTLIB::__aeabi_dmul },
- { RTLIB::SUB_F64, RTLIB::__aeabi_dsub },
-
- // Double-precision floating-point comparison helper functions
- // RTABI chapter 4.1.2, Table 3
- { RTLIB::OEQ_F64, RTLIB::__aeabi_dcmpeq__oeq },
- { RTLIB::UNE_F64, RTLIB::__aeabi_dcmpeq__une },
- { RTLIB::OLT_F64, RTLIB::__aeabi_dcmplt },
- { RTLIB::OLE_F64, RTLIB::__aeabi_dcmple },
- { RTLIB::OGE_F64, RTLIB::__aeabi_dcmpge },
- { RTLIB::OGT_F64, RTLIB::__aeabi_dcmpgt },
- { RTLIB::UO_F64, RTLIB::__aeabi_dcmpun },
-
- // Single-precision floating-point arithmetic helper functions
- // RTABI chapter 4.1.2, Table 4
- { RTLIB::ADD_F32, RTLIB::__aeabi_fadd },
- { RTLIB::DIV_F32, RTLIB::__aeabi_fdiv },
- { RTLIB::MUL_F32, RTLIB::__aeabi_fmul },
- { RTLIB::SUB_F32, RTLIB::__aeabi_fsub },
-
- // Single-precision floating-point comparison helper functions
- // RTABI chapter 4.1.2, Table 5
- { RTLIB::OEQ_F32, RTLIB::__aeabi_fcmpeq__oeq },
- { RTLIB::UNE_F32, RTLIB::__aeabi_fcmpeq__une },
- { RTLIB::OLT_F32, RTLIB::__aeabi_fcmplt},
- { RTLIB::OLE_F32, RTLIB::__aeabi_fcmple },
- { RTLIB::OGE_F32, RTLIB::__aeabi_fcmpge },
- { RTLIB::OGT_F32, RTLIB::__aeabi_fcmpgt },
- { RTLIB::UO_F32, RTLIB::__aeabi_fcmpun },
-
- // Floating-point to integer conversions.
- // RTABI chapter 4.1.2, Table 6
- { RTLIB::FPTOSINT_F64_I32, RTLIB::__aeabi_d2iz },
- { RTLIB::FPTOUINT_F64_I32, RTLIB::__aeabi_d2uiz },
- { RTLIB::FPTOSINT_F64_I64, RTLIB::__aeabi_d2lz },
- { RTLIB::FPTOUINT_F64_I64, RTLIB::__aeabi_d2ulz },
- { RTLIB::FPTOSINT_F32_I32, RTLIB::__aeabi_f2iz },
- { RTLIB::FPTOUINT_F32_I32, RTLIB::__aeabi_f2uiz },
- { RTLIB::FPTOSINT_F32_I64, RTLIB::__aeabi_f2lz },
- { RTLIB::FPTOUINT_F32_I64, RTLIB::__aeabi_f2ulz },
-
- // Conversions between floating types.
- // RTABI chapter 4.1.2, Table 7
- { RTLIB::FPROUND_F64_F32, RTLIB::__aeabi_d2f },
- { RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h },
- { RTLIB::FPEXT_F32_F64, RTLIB::__aeabi_f2d },
-
- // Integer to floating-point conversions.
- // RTABI chapter 4.1.2, Table 8
- { RTLIB::SINTTOFP_I32_F64, RTLIB::__aeabi_i2d },
- { RTLIB::UINTTOFP_I32_F64, RTLIB::__aeabi_ui2d },
- { RTLIB::SINTTOFP_I64_F64, RTLIB::__aeabi_l2d },
- { RTLIB::UINTTOFP_I64_F64, RTLIB::__aeabi_ul2d },
- { RTLIB::SINTTOFP_I32_F32, RTLIB::__aeabi_i2f },
- { RTLIB::UINTTOFP_I32_F32, RTLIB::__aeabi_ui2f },
- { RTLIB::SINTTOFP_I64_F32, RTLIB::__aeabi_l2f },
- { RTLIB::UINTTOFP_I64_F32, RTLIB::__aeabi_ul2f },
-
- // Long long helper functions
- // RTABI chapter 4.2, Table 9
- { RTLIB::MUL_I64, RTLIB::__aeabi_lmul },
- { RTLIB::SHL_I64, RTLIB::__aeabi_llsl },
- { RTLIB::SRL_I64, RTLIB::__aeabi_llsr },
- { RTLIB::SRA_I64, RTLIB::__aeabi_lasr },
-
- // Integer division functions
- // RTABI chapter 4.3.1
- { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv },
- { RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod },
- { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv },
- { RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod },
- };
- // clang-format on
-
- for (const auto &LC : LibraryCalls)
- setLibcallImpl(LC.Op, LC.Impl);
-
- // EABI dependent RTLIB
- if (TM.Options.EABIVersion == EABI::EABI4 ||
- TM.Options.EABIVersion == EABI::EABI5) {
- static const struct {
- const RTLIB::Libcall Op;
- const RTLIB::LibcallImpl Impl;
- } MemOpsLibraryCalls[] = {
- // Memory operations
- // RTABI chapter 4.3.4
- {RTLIB::MEMCPY, RTLIB::__aeabi_memcpy},
- {RTLIB::MEMMOVE, RTLIB::__aeabi_memmove},
- {RTLIB::MEMSET, RTLIB::__aeabi_memset},
- {RTLIB::AEABI_MEMCPY4, RTLIB::__aeabi_memcpy4},
- {RTLIB::AEABI_MEMCPY8, RTLIB::__aeabi_memcpy8},
- {RTLIB::AEABI_MEMMOVE4, RTLIB::__aeabi_memmove4},
- {RTLIB::AEABI_MEMMOVE8, RTLIB::__aeabi_memmove8},
- {RTLIB::AEABI_MEMSET4, RTLIB::__aeabi_memset4},
- {RTLIB::AEABI_MEMSET8, RTLIB::__aeabi_memset8},
- {RTLIB::AEABI_MEMCLR, RTLIB::__aeabi_memclr},
- {RTLIB::AEABI_MEMCLR4, RTLIB::__aeabi_memclr4},
- {RTLIB::AEABI_MEMCLR8, RTLIB::__aeabi_memclr8},
- };
-
- for (const auto &LC : MemOpsLibraryCalls)
- setLibcallImpl(LC.Op, LC.Impl);
- }
- }
-
- // The half <-> float conversion functions are always soft-float on
- // non-watchos platforms, but are needed for some targets which use a
- // hard-float calling convention by default.
- if (!TT.isWatchABI()) {
- if (TM.isAAPCS_ABI()) {
- setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS);
- setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS);
- setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS);
- setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_AAPCS);
- setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_AAPCS);
- } else {
- setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS);
- setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS);
- setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS);
- setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_APCS);
- setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_APCS);
- }
- }
-
- // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
- // a __gnu_ prefix (which is the default).
- if (TT.isTargetAEABI()) {
- // FIXME: This does not depend on the subtarget and should go directly into
- // RuntimeLibcalls. This is only here because of missing support for setting
- // the calling convention of an implementation.
- static const struct {
- const RTLIB::Libcall Op;
- const RTLIB::LibcallImpl Impl;
- } LibraryCalls[] = {
- {RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h},
- {RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f},
- };
-
- for (const auto &LC : LibraryCalls) {
- setLibcallImpl(LC.Op, LC.Impl);
- }
- } else if (!TT.isOSBinFormatMachO()) {
- setLibcallImpl(RTLIB::FPROUND_F32_F16, RTLIB::__gnu_f2h_ieee);
- setLibcallImpl(RTLIB::FPEXT_F16_F32, RTLIB::__gnu_h2f_ieee);
- }
-
if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
else
@@ -7406,7 +7245,7 @@ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return false;
unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;
// If the mask is twice as long as the input vector then we need to check the
@@ -7438,7 +7277,7 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;
unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7541,7 +7380,7 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return false;
unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7574,7 +7413,7 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;
unsigned NumElts = VT.getVectorNumElements();
- if (M.size() != NumElts && M.size() != NumElts*2)
+ if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 393cf2d97380..6b2854171c81 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1084,9 +1084,10 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(
CostKind, Op1Info, Op2Info, I);
}
-InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy,
- ScalarEvolution *SE,
- const SCEV *Ptr) const {
+InstructionCost
+ARMTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -1103,7 +1104,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy,
// addressing mode.
return 1;
}
- return BaseT::getAddressComputationCost(PtrTy, SE, Ptr);
+ return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
}
bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
@@ -1335,6 +1336,39 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
if (!Mask.empty()) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
+ // Check for LD2/LD4 instructions, which are represented in llvm IR as
+ // deinterleaving-shuffle(load). The shuffle cost could potentially be
+ // free, but we model it with a cost derived from LT.first (at least 1,
+ // scaled by the MVE cost factor) so LD2/LD4 cost more than just the load.
+ if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
+ (LT.second.getScalarSizeInBits() == 8 ||
+ LT.second.getScalarSizeInBits() == 16 ||
+ LT.second.getScalarSizeInBits() == 32) &&
+ LT.second.getSizeInBits() == 128 &&
+ ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) ||
+ (TLI->getMaxSupportedInterleaveFactor() == 4 &&
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))))
+ return ST->getMVEVectorCostFactor(CostKind) *
+ std::max<InstructionCost>(1, LT.first / 4);
+
+ // Check for ST2/ST4 instructions, which are represented in llvm IR as
+ // store(interleaving-shuffle). The shuffle cost could potentially be
+ // free, but we model it with a cost of LT.first so that ST2/ST4 have a
+ // higher cost than just the store.
+ if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
+ (LT.second.getScalarSizeInBits() == 8 ||
+ LT.second.getScalarSizeInBits() == 16 ||
+ LT.second.getScalarSizeInBits() == 32) &&
+ LT.second.getSizeInBits() == 128 &&
+ ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
+ ShuffleVectorInst::isInterleaveMask(
+ Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
+ (TLI->getMaxSupportedInterleaveFactor() == 4 &&
+ ShuffleVectorInst::isInterleaveMask(
+ Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
+ return ST->getMVEVectorCostFactor(CostKind) * LT.first;
+
if (LT.second.isVector() &&
Mask.size() <= LT.second.getVectorNumElements() &&
(isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 522c235a90a8..cdd8bcb9f741 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -257,8 +257,9 @@ public:
unsigned Index, const Value *Op0,
const Value *Op1) const override;
- InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
- const SCEV *Ptr) const override;
+ InstructionCost
+ getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const override;
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index ece6c10e828d..0e974838a7c6 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -3373,12 +3373,12 @@ public:
void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask())));
+ Inst.addOperand(MCOperand::createImm(getMSRMask()));
}
void addBankedRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(unsigned(getBankedReg())));
+ Inst.addOperand(MCOperand::createImm(getBankedReg()));
}
void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 5c212816fbdb..171e2949366a 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -156,9 +156,10 @@ HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *PtrTy,
- ScalarEvolution *SE,
- const SCEV *S) const {
+InstructionCost
+HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *S,
+ TTI::TargetCostKind CostKind) const {
return 0;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 0a5766d1dadf..dbf16c99c314 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -111,8 +111,9 @@ public:
InstructionCost
getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) const override;
- InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
- const SCEV *S) const override;
+ InstructionCost
+ getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *S,
+ TTI::TargetCostKind CostKind) const override;
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 14472419a10f..a2a41d0062ff 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2786,7 +2786,7 @@ SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op,
EVT RetVT = Op.getValueType();
RTLIB::Libcall LC = RTLIB::getUINTTOFP(OpVT, RetVT);
MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, RetVT);
SDValue Chain = SDValue();
SDValue Result;
std::tie(Result, Chain) =
@@ -2811,7 +2811,7 @@ SDValue LoongArchTargetLowering::lowerSINT_TO_FP(SDValue Op,
EVT RetVT = Op.getValueType();
RTLIB::Libcall LC = RTLIB::getSINTTOFP(OpVT, RetVT);
MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, RetVT);
SDValue Chain = SDValue();
SDValue Result;
std::tie(Result, Chain) =
@@ -4107,7 +4107,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
LC = RTLIB::getFPTOSINT(Src.getValueType(), VT);
MakeLibCallOptions CallOptions;
EVT OpVT = Src.getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, VT, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, VT);
SDValue Chain = SDValue();
SDValue Result;
std::tie(Result, Chain) =
@@ -4360,7 +4360,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
RTLIB::Libcall LC =
OpVT == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32;
MakeLibCallOptions CallOptions;
- CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64);
SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first;
Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result);
Results.push_back(Result);
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index d8bb16fe9b94..0696b11d62ac 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1640,6 +1640,24 @@ defm : PairInsertExtractPatV8<v8f32, f32>;
defm : PairInsertExtractPatV4<v4i64, GRLenVT>;
defm : PairInsertExtractPatV4<v4f64, f64>;
+def : Pat<(vector_insert v8i32:$xd, (GRLenVT(vector_extract v8i32:$xj, 0)),
+ uimm3:$imm),
+ (XVINSVE0_W v8i32:$xd, v8i32:$xj, uimm3:$imm)>;
+
+def : Pat<(vector_insert v4i64:$xd, (GRLenVT(vector_extract v4i64:$xj, 0)),
+ uimm2:$imm),
+ (XVINSVE0_D v4i64:$xd, v4i64:$xj, uimm2:$imm)>;
+
+def : Pat<(vector_insert v8i32:$xd,
+ (GRLenVT(vector_extract v8i32:$xj, uimm3:$imm1)), uimm3:$imm2),
+ (XVINSVE0_W v8i32:$xd, (XVPICKVE_W v8i32:$xj, uimm3:$imm1),
+ uimm3:$imm2)>;
+
+def : Pat<(vector_insert v4i64:$xd,
+ (GRLenVT(vector_extract v4i64:$xj, uimm2:$imm1)), uimm2:$imm2),
+ (XVINSVE0_D v4i64:$xd, (XVPICKVE_D v4i64:$xj, uimm2:$imm1),
+ uimm2:$imm2)>;
+
// PseudoXVINSGR2VR_{B/H}
def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm),
(PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 18aeda6a7935..2445005bf98c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -70,7 +70,7 @@ NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
}
bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
- return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N);
+ return Subtarget->getTargetLowering()->usePrecSqrtF32(N);
}
bool NVPTXDAGToDAGISel::useF32FTZ() const {
@@ -82,11 +82,6 @@ bool NVPTXDAGToDAGISel::allowFMA() const {
return TL->allowFMA(*MF, OptLevel);
}
-bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
- const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
- return TL->allowUnsafeFPMath(*MF);
-}
-
bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
/// Select - Select instructions not customized! Used for
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 357e915fd077..65731722f534 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -44,7 +44,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool usePrecSqrtF32(const SDNode *N) const;
bool useF32FTZ() const;
bool allowFMA() const;
- bool allowUnsafeFPMath() const;
bool doRsqrtOpt() const;
NVPTXScopes Scopes{};
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3daf25d55152..b94cbd0bd9c1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -125,10 +125,6 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
if (UsePrecDivF32.getNumOccurrences() > 0)
return UsePrecDivF32;
- // Otherwise, use div.approx if fast math is enabled
- if (allowUnsafeFPMath(MF))
- return NVPTX::DivPrecisionLevel::Approx;
-
const SDNodeFlags Flags = N.getFlags();
if (Flags.hasApproximateFuncs())
return NVPTX::DivPrecisionLevel::Approx;
@@ -136,16 +132,11 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
return NVPTX::DivPrecisionLevel::IEEE754;
}
-bool NVPTXTargetLowering::usePrecSqrtF32(const MachineFunction &MF,
- const SDNode *N) const {
+bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const {
// If nvptx-prec-sqrtf32 is used on the command-line, always honor it
if (UsePrecSqrtF32.getNumOccurrences() > 0)
return UsePrecSqrtF32;
- // Otherwise, use sqrt.approx if fast math is enabled
- if (allowUnsafeFPMath(MF))
- return false;
-
if (N) {
const SDNodeFlags Flags = N->getFlags();
if (Flags.hasApproximateFuncs())
@@ -1193,8 +1184,7 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
bool &UseOneConst,
bool Reciprocal) const {
if (!(Enabled == ReciprocalEstimate::Enabled ||
- (Enabled == ReciprocalEstimate::Unspecified &&
- !usePrecSqrtF32(DAG.getMachineFunction()))))
+ (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
return SDValue();
if (ExtraSteps == ReciprocalEstimate::Unspecified)
@@ -2851,8 +2841,7 @@ static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
SDLoc(Op), Opcode, DAG);
}
-static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
- bool AllowUnsafeFPMath) {
+static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) {
// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
// the semantics of LLVM's frem.
@@ -2869,7 +2858,7 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
Flags | SDNodeFlags::AllowContract);
- if (AllowUnsafeFPMath || Flags.hasNoInfs())
+ if (Flags.hasNoInfs())
return Sub;
// If Y is infinite, return X
@@ -3014,7 +3003,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTLZ:
return lowerCTLZCTPOP(Op, DAG);
case ISD::FREM:
- return lowerFREM(Op, DAG, allowUnsafeFPMath(DAG.getMachineFunction()));
+ return lowerFREM(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
@@ -4868,17 +4857,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
return true;
- return allowUnsafeFPMath(MF);
-}
-
-bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const {
- // Honor TargetOptions flags that explicitly say unsafe math is okay.
- if (MF.getTarget().Options.UnsafeFPMath)
- return true;
-
- // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
- const Function &F = MF.getFunction();
- return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
+ return false;
}
static bool isConstZero(const SDValue &Operand) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 43e721a9c2a4..27f099e22097 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -206,8 +206,7 @@ public:
// Get whether we should use a precise or approximate 32-bit floating point
// sqrt instruction.
- bool usePrecSqrtF32(const MachineFunction &MF,
- const SDNode *N = nullptr) const;
+ bool usePrecSqrtF32(const SDNode *N = nullptr) const;
// Get whether we should use instructions that flush floating-point denormals
// to sign-preserving zero.
@@ -220,7 +219,6 @@ public:
unsigned combineRepeatedFPDivisors() const override { return 2; }
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const;
- bool allowUnsafeFPMath(const MachineFunction &MF) const;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT) const override {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index bd54d1db9156..ebb5e32f5e6f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1133,9 +1133,8 @@ defm FMA_F64 : FMA<F64RT, allow_ftz = false>;
// sin/cos/tanh
class UnaryOpAllowsApproxFn<SDPatternOperator operator>
- : PatFrag<(ops node:$A),
- (operator node:$A), [{
- return allowUnsafeFPMath() || N->getFlags().hasApproximateFuncs();
+ : PatFrag<(ops node:$A), (operator node:$A), [{
+ return N->getFlags().hasApproximateFuncs();
}]>;
def SIN_APPROX_f32 :
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 599865312920..9e1530a2d00f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -18,6 +18,7 @@
#include "RISCVInstrInfo.h"
#include "RISCVSelectionDAGInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
@@ -772,6 +773,49 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
return false;
}
+// (xor X, (and (xor X, C1), C2))
+// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt)
+// where C2 is a shifted mask with width=Width and shift=ShAmt
+bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) {
+
+ if (!Subtarget->hasVendorXqcibm())
+ return false;
+
+ using namespace SDPatternMatch;
+
+ SDValue X;
+ APInt CImm, CMask;
+ if (!sd_match(
+ Node,
+ m_Xor(m_Value(X),
+ m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))),
+ m_ConstInt(CMask))))))
+ return false;
+
+ unsigned Width, ShAmt;
+ if (!CMask.isShiftedMask(ShAmt, Width))
+ return false;
+
+ int64_t Imm = CImm.getSExtValue();
+ Imm >>= ShAmt;
+
+ SDLoc DL(Node);
+ SDValue ImmNode;
+ auto Opc = RISCV::QC_INSB;
+
+ if (isInt<5>(Imm)) {
+ Opc = RISCV::QC_INSBI;
+ ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32);
+ } else {
+ ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget);
+ }
+ SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32),
+ CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)};
+ ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops));
+
+ return true;
+}
+
bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node,
const SDLoc &DL, MVT VT,
SDValue X, unsigned Msb,
@@ -1349,6 +1393,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (tryShrinkShlLogicImm(Node))
return;
+ if (tryBitfieldInsertOpFromXor(Node))
+ return;
+
break;
case ISD::AND: {
auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index ee3a86e25add..9d4cd0e6e339 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -75,6 +75,7 @@ public:
bool trySignedBitfieldExtract(SDNode *Node);
bool trySignedBitfieldInsertInSign(SDNode *Node);
bool trySignedBitfieldInsertInMask(SDNode *Node);
+ bool tryBitfieldInsertOpFromXor(SDNode *Node);
bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT,
SDValue X, unsigned Msb, unsigned Lsb);
bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8bc42ad8758c..4f52f68d35aa 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14333,7 +14333,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0));
MakeLibCallOptions CallOptions;
EVT OpVT = Op0.getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
SDValue Result;
std::tie(Result, Chain) =
@@ -14368,7 +14368,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Op0.getValueType() == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32;
MakeLibCallOptions CallOptions;
EVT OpVT = Op0.getValueType();
- CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true);
+ CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64);
SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first;
Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result);
Results.push_back(Result);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 8bd383033f11..2a34a24a6ae2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1694,6 +1694,20 @@ multiclass SelectCC_GPR_riirr<DAGOperand valty, DAGOperand imm> {
valty:$truev, valty:$falsev), []>;
}
+let Predicates = [IsRV32] in {
+def : Pat<(i32 (seteq (i32 (and GPR:$rs1, 0xffffffff80000000)), 0)),
+ (XORI (i32 (SRLI GPR:$rs1, 31)), 1)>;
+def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible
+}
+let Predicates = [IsRV64] in {
+def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x8000000000000000)), 0)),
+ (XORI (i64 (SRLI GPR:$rs1, 63)), 1)>;
+def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)),
+ (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>;
+def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible
+def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>;
+}
+
/// Branches and jumps
// Match `riscv_brcc` and lower to the appropriate RISC-V branch instruction.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 8297d5050ced..d17330f9da9f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -98,6 +98,14 @@ class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
let Inst{27} = 0b0;
}
+class RVPShiftD_ri<bits<3> f, bits<3> funct3, string opcodestr>
+ : RVPShift_ri<f, funct3, opcodestr, uimm6> {
+ bits<6> shamt;
+
+ let Inst{26} = 0b1;
+ let Inst{25-20} = shamt;
+}
+
class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr>
: RVPShift_ri<f, funct3, opcodestr, uimm5> {
bits<5> shamt;
@@ -136,34 +144,36 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtP] in {
-let IsSignExtendingOpW = 1 in
-def CLS : Unary_r<0b011000000011, 0b001, "cls">;
-def ABS : Unary_r<0b011000000111, 0b001, "abs">;
+ let IsSignExtendingOpW = 1 in
+ def CLS : Unary_r<0b011000000011, 0b001, "cls">;
+ def ABS : Unary_r<0b011000000111, 0b001, "abs">;
} // Predicates = [HasStdExtP]
-let Predicates = [HasStdExtP, IsRV32] in
-def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">;
+
+let Predicates = [HasStdExtP, IsRV32] in {
+ def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">;
+} // Predicates = [HasStdExtP, IsRV32]
let Predicates = [HasStdExtP, IsRV64] in {
-def REV16 : Unary_r<0b011010110000, 0b101, "rev16">;
-def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">;
+ def REV16 : Unary_r<0b011010110000, 0b101, "rev16">;
+ def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">;
-let IsSignExtendingOpW = 1 in {
-def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">;
-def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
-}
+ let IsSignExtendingOpW = 1 in {
+ def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">;
+ def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
+ }
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
-def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
-def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
-def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
+ def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
+ def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
+ def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
} // Predicates = [HasStdExtP]
-let DecoderNamespace = "RV32Only",
- Predicates = [HasStdExtP, IsRV32] in
-def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">;
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
let Predicates = [HasStdExtP, IsRV64] in {
-def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
-def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
+ def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
+ def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
@@ -174,16 +184,50 @@ let Predicates = [HasStdExtP] in
def PLI_B : PLI_B_i<0b10110100, "pli.b">;
let Predicates = [HasStdExtP] in {
-def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
-def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
-def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
+ def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
+ def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
+ def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
} // Predicates = [HasStdExtP]
let Predicates = [HasStdExtP, IsRV64] in {
-def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
-def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
+ def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
+ def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in
def PLUI_H : PLUI_i<0b1111000, "plui.h">;
let Predicates = [HasStdExtP, IsRV64] in
def PLUI_W : PLUI_i<0b1111001, "plui.w">;
+
+let Predicates = [HasStdExtP] in {
+ def PSRLI_B : RVPShiftB_ri<0b000, 0b100, "psrli.b">;
+ def PSRLI_H : RVPShiftH_ri<0b000, 0b100, "psrli.h">;
+
+ def PUSATI_H : RVPShiftH_ri<0b010, 0b100, "pusati.h">;
+
+ def PSRAI_B : RVPShiftB_ri<0b100, 0b100, "psrai.b">;
+ def PSRAI_H : RVPShiftH_ri<0b100, 0b100, "psrai.h">;
+
+ def PSRARI_H : RVPShiftH_ri<0b101, 0b100, "psrari.h">;
+
+ def PSATI_H : RVPShiftH_ri<0b110, 0b100, "psati.h">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+ def USATI_RV32 : RVPShiftW_ri<0b010, 0b100, "usati">;
+
+ def SRARI_RV32 : RVPShiftW_ri<0b101, 0b100, "srari">;
+
+ def SATI_RV32 : RVPShiftW_ri<0b110, 0b100, "sati">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+ def PSRLI_W : RVPShiftW_ri<0b000, 0b100, "psrli.w">;
+ def PSRAI_W : RVPShiftW_ri<0b100, 0b100, "psrai.w">;
+
+ def PUSATI_W : RVPShiftW_ri<0b010, 0b100, "pusati.w">;
+ def USATI_RV64 : RVPShiftD_ri<0b010, 0b100, "usati">;
+
+ def PSRARI_W : RVPShiftW_ri<0b101, 0b100, "psrari.w">;
+ def SRARI_RV64 : RVPShiftD_ri<0b101, 0b100, "srari">;
+
+ def PSATI_W : RVPShiftW_ri<0b110, 0b100, "psati.w">;
+ def SATI_RV64 : RVPShiftD_ri<0b110, 0b100, "sati">;
+} // Predicates = [HasStdExtP, IsRV64]
diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
index 5ef858a787c7..8cf15fa26e22 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
@@ -24,7 +24,7 @@ let SchedModel = Andes45Model in {
//===----------------------------------------------------------------------===//
// Andes 45 series CPU
-// - 2 Interger Arithmetic and Logical Units (ALU)
+// - 2 Integer Arithmetic and Logical Units (ALU)
// - Multiply / Divide Unit (MDU)
// - Load Store Unit (LSU)
// - Control and Status Register Unit (CSR)
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 28c8f401321f..f013898e8520 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -497,6 +497,10 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VANDN_VX:
// Vector Reverse Bits in Elements
case RISCV::VBREV_V:
+ // Vector Reverse Bits in Bytes
+ case RISCV::VBREV8_V:
+ // Vector Reverse Bytes
+ case RISCV::VREV8_V:
// Vector Count Leading Zeros
case RISCV::VCLZ_V:
// Vector Count Trailing Zeros
@@ -510,6 +514,13 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
case RISCV::VROR_VI:
case RISCV::VROR_VV:
case RISCV::VROR_VX:
+ // Vector Carry-less Multiplication Instructions (Zvbc)
+ // Vector Carry-less Multiply
+ case RISCV::VCLMUL_VV:
+ case RISCV::VCLMUL_VX:
+ // Vector Carry-less Multiply Return High Half
+ case RISCV::VCLMULH_VV:
+ case RISCV::VCLMULH_VX:
return MILog2SEW;
// Vector Widening Shift Left Logical (Zvbb)
@@ -1046,6 +1057,10 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VANDN_VX:
// Vector Reverse Bits in Elements
case RISCV::VBREV_V:
+ // Vector Reverse Bits in Bytes
+ case RISCV::VBREV8_V:
+ // Vector Reverse Bytes
+ case RISCV::VREV8_V:
// Vector Count Leading Zeros
case RISCV::VCLZ_V:
// Vector Count Trailing Zeros
@@ -1063,6 +1078,13 @@ static bool isSupportedInstr(const MachineInstr &MI) {
case RISCV::VWSLL_VI:
case RISCV::VWSLL_VX:
case RISCV::VWSLL_VV:
+ // Vector Carry-less Multiplication Instructions (Zvbc)
+ // Vector Carry-less Multiply
+ case RISCV::VCLMUL_VV:
+ case RISCV::VCLMUL_VX:
+ // Vector Carry-less Multiply Return High Half
+ case RISCV::VCLMULH_VV:
+ case RISCV::VCLMULH_VX:
// Vector Mask Instructions
// Vector Mask-Register Logical Instructions
// vmsbf.m set-before-first mask bit
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f366094c3195..97cdf5b784bc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15419,18 +15419,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return SDValue();
}
- // Avoid returning the same shuffle operation. For example,
- // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
- // undef:v16i16
- if (CrossLaneMask == Mask || InLaneMask == Mask)
- return SDValue();
-
// Simplify CrossLaneMask based on the actual demanded elements.
if (V1.hasOneUse())
for (int i = 0; i != NumElts; ++i)
if (!DemandedCrossLane[i])
CrossLaneMask[i] = SM_SentinelUndef;
+ // Avoid returning the same shuffle operation. For example,
+ // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
+ // undef:v16i16
+ if (CrossLaneMask == Mask || InLaneMask == Mask)
+ return SDValue();
+
SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
InLaneMask);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 9ef21faea2b6..cae6bb99d963 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5488,9 +5488,10 @@ InstructionCost X86TTIImpl::getPointersChainCost(
return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
}
-InstructionCost X86TTIImpl::getAddressComputationCost(Type *PtrTy,
- ScalarEvolution *SE,
- const SCEV *Ptr) const {
+InstructionCost
+X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -5513,7 +5514,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *PtrTy,
return 1;
}
- return BaseT::getAddressComputationCost(PtrTy, SE, Ptr);
+ return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
}
InstructionCost
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index bc06c4746c3c..5718c0c9535f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -194,8 +194,9 @@ public:
getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
const TTI::PointersChainInfo &Info, Type *AccessTy,
TTI::TargetCostKind CostKind) const override;
- InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
- const SCEV *Ptr) const override;
+ InstructionCost
+ getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+ TTI::TargetCostKind CostKind) const override;
std::optional<Instruction *>
instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;