diff options
| author | Mehdi Amini <joker.eph@gmail.com> | 2025-08-14 15:36:46 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-08-14 15:36:46 +0200 |
| commit | df57d6a01e85ca78da2febab21b268d9fd6955a0 (patch) | |
| tree | 19b0aab453e6bc7e2b15d3220024dfdacd4fa57e /llvm/lib/Target | |
| parent | df86ea61b7ed484ca797f96d7ad40fd9ada7ba30 (diff) | |
| parent | 7bda76367f19cfc19086f68d9dd5ac019a9ceccd (diff) | |
Merge branch 'main' into users/joker-eph-python-bindings-maintainers (branch: users/joker-eph-python-bindings-maintainers)
Diffstat (limited to 'llvm/lib/Target')
43 files changed, 383 insertions, 525 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index ba02c82b25aa..885f2a94f85f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1487,11 +1487,8 @@ bool isVGInstruction(MachineBasicBlock::iterator MBBI) { if (Opc == AArch64::BL) { auto Op1 = MBBI->getOperand(0); - auto &TLI = - *MBBI->getMF()->getSubtarget<AArch64Subtarget>().getTargetLowering(); - char const *GetCurrentVG = - TLI.getLibcallName(RTLIB::SMEABI_GET_CURRENT_VG); - return Op1.isSymbol() && StringRef(Op1.getSymbolName()) == GetCurrentVG; + return Op1.isSymbol() && + (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg"); } } @@ -3471,7 +3468,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); - auto &TLI = *MF.getSubtarget<AArch64Subtarget>().getTargetLowering(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool NeedsWinCFI = needsWinCFI(MF); @@ -3585,11 +3581,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( .addReg(AArch64::X0, RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); - RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG; - const uint32_t *RegMask = - TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(LC)); + const uint32_t *RegMask = TRI->getCallPreservedMask( + MF, + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) - .addExternalSymbol(TLI.getLibcallName(LC)) + .addExternalSymbol("__arm_get_current_vg") .addRegMask(RegMask) .addReg(AArch64::X0, RegState::ImplicitDefine) .setMIFlag(MachineInstr::FrameSetup); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 
index 224bbe7e38a1..2072e48914ae 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3083,12 +3083,13 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI, AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); if (FuncInfo->isSMESaveBufferUsed()) { - RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) - .addExternalSymbol(getLibcallName(LC)) + .addExternalSymbol("__arm_sme_state_size") .addReg(AArch64::X0, RegState::ImplicitDefine) - .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); + .addRegMask(TRI->getCallPreservedMask( + *MF, CallingConv:: + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(AArch64::X0); @@ -3108,12 +3109,13 @@ AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget->getInstrInfo(); Register ResultReg = MI.getOperand(0).getReg(); if (FuncInfo->isPStateSMRegUsed()) { - RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) - .addExternalSymbol(getLibcallName(LC)) + .addExternalSymbol("__arm_sme_state") .addReg(AArch64::X0, RegState::ImplicitDefine) - .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); + .addRegMask(TRI->getCallPreservedMask( + *MF, CallingConv:: + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg) .addReg(AArch64::X0); } else { @@ -5737,15 +5739,15 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG 
&DAG, SDValue Chain, SDLoc DL, EVT VT) const { - RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; - SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", getPointerTy(DAG.getDataLayout())); Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); Type *RetTy = StructType::get(Int64Ty, Int64Ty); TargetLowering::CallLoweringInfo CLI(DAG); ArgListTy Args; CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, + RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64); return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0), @@ -8598,12 +8600,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, } static SMECallAttrs -getSMECallAttrs(const Function &Caller, const TargetLowering &TLI, +getSMECallAttrs(const Function &Caller, const TargetLowering::CallLoweringInfo &CLI) { if (CLI.CB) - return SMECallAttrs(*CLI.CB, &TLI); + return SMECallAttrs(*CLI.CB); if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) - return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI)); + return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol())); return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal)); } @@ -8625,7 +8627,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. 
- SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI); + SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, CLI); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState() || CallAttrs.caller().hasStreamingBody()) @@ -8919,14 +8921,14 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64); Args.push_back(Entry); - RTLIB::Libcall LC = - IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy(DAG.getDataLayout())); + SDValue Callee = + DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore", + TLI.getPointerTy(DAG.getDataLayout())); auto *RetTy = Type::getVoidTy(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy, + Callee, std::move(Args)); return TLI.LowerCallTo(CLI).second; } @@ -9114,7 +9116,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } // Determine whether we need any streaming mode changes. - SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); + SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), CLI); auto DescribeCallsite = [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { @@ -9691,12 +9693,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. 
- RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( - TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); + TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - getLibcallName(LC), getPointerTy(DAG.getDataLayout())); + "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); @@ -29035,7 +29036,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { // Checks to allow the use of SME instructions if (auto *Base = dyn_cast<CallBase>(&Inst)) { - auto CallAttrs = SMECallAttrs(*Base, this); + auto CallAttrs = SMECallAttrs(*Base); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index fb59c9f131fb..a55f103bff38 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5920,7 +5920,7 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes) SmallString<64> Expr; unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); - assert(DwarfReg >= 0 && DwarfReg <= 31 && "DwarfReg out of bounds (0..31)"); + assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)"); // Reg + NumBytes Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg); appendLEB128<LEB128Sign::Signed>(Expr, NumBytes); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 4523c659dd39..3fba7e853eaf 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -220,16 +220,20 @@ static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( static cl::opt<bool> EnableScalableAutovecInStreamingMode( "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); -static bool isSMEABIRoutineCall(const CallInst &CI, const TargetLowering &TLI) { +static bool isSMEABIRoutineCall(const CallInst &CI) { const auto *F = CI.getCalledFunction(); - return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine(); + return F && StringSwitch<bool>(F->getName()) + .Case("__arm_sme_state", true) + .Case("__arm_tpidr2_save", true) + .Case("__arm_tpidr2_restore", true) + .Case("__arm_za_disable", true) + .Default(false); } /// Returns true if the function has explicit operations that can only be /// lowered using incompatible instructions for the selected mode. This also /// returns true if the function F may use or modify ZA state. -static bool hasPossibleIncompatibleOps(const Function *F, - const TargetLowering &TLI) { +static bool hasPossibleIncompatibleOps(const Function *F) { for (const BasicBlock &BB : *F) { for (const Instruction &I : BB) { // Be conservative for now and assume that any call to inline asm or to @@ -238,7 +242,7 @@ static bool hasPossibleIncompatibleOps(const Function *F, // all native LLVM instructions can be lowered to compatible instructions. 
if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() && (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) || - isSMEABIRoutineCall(cast<CallInst>(I), TLI))) + isSMEABIRoutineCall(cast<CallInst>(I)))) return true; } } @@ -286,7 +290,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) { - if (hasPossibleIncompatibleOps(Callee, *getTLI())) + if (hasPossibleIncompatibleOps(Callee)) return false; } @@ -353,7 +357,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, // change only once and avoid inlining of G into F. SMEAttrs FAttrs(*F); - SMECallAttrs CallAttrs(Call, getTLI()); + SMECallAttrs CallAttrs(Call); if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) { if (F == Call.getCaller()) // (1) @@ -4333,7 +4337,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, - const SCEV *Ptr) const { + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. 
The resulting diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 647b242d74fb..9c96fdd42781 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -238,8 +238,9 @@ public: ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) const override; - InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; InstructionCost getCmpSelInstrCost( unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 2008516885c3..4af4d4930662 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -15,16 +15,11 @@ #include "AArch64.h" #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/IR/RuntimeLibcalls.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; @@ -38,13 +33,9 @@ struct SMEABI : public FunctionPass { bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetPassConfig>(); - } - private: bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder, - SMEAttrs FnAttrs, const TargetLowering &TLI); + SMEAttrs FnAttrs); }; } // end anonymous namespace @@ -60,16 +51,14 @@ FunctionPass *llvm::createSMEABIPass() { 
return new SMEABI(); } //===----------------------------------------------------------------------===// // Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0. -void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI, - bool ZT0IsUndef = false) { +void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { auto &Ctx = M->getContext(); auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); auto Attrs = AttributeList().addFnAttribute(Ctx, "aarch64_pstate_sm_compatible"); - RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_SAVE; FunctionCallee Callee = - M->getOrInsertFunction(TLI.getLibcallName(LC), TPIDR2SaveTy, Attrs); + M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs); CallInst *Call = Builder.CreateCall(Callee); // If ZT0 is undefined (i.e. we're at the entry of a "new_zt0" function), mark @@ -78,7 +67,8 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI, if (ZT0IsUndef) Call->addFnAttr(Attribute::get(Ctx, "aarch64_zt0_undef")); - Call->setCallingConv(TLI.getLibcallCallingConv(LC)); + Call->setCallingConv( + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); // A save to TPIDR2 should be followed by clearing TPIDR2_EL0. Function *WriteIntr = @@ -108,8 +98,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI, /// interface if it does not share ZA or ZT0. /// bool SMEABI::updateNewStateFunctions(Module *M, Function *F, - IRBuilder<> &Builder, SMEAttrs FnAttrs, - const TargetLowering &TLI) { + IRBuilder<> &Builder, SMEAttrs FnAttrs) { LLVMContext &Context = F->getContext(); BasicBlock *OrigBB = &F->getEntryBlock(); Builder.SetInsertPoint(&OrigBB->front()); @@ -135,7 +124,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Create a call __arm_tpidr2_save, which commits the lazy save. 
Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder, TLI, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); + emitTPIDR2Save(M, Builder, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); @@ -183,14 +172,10 @@ bool SMEABI::runOnFunction(Function &F) { if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za")) return false; - const TargetMachine &TM = - getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - const TargetLowering &TLI = *TM.getSubtargetImpl(F)->getTargetLowering(); - bool Changed = false; SMEAttrs FnAttrs(F); if (FnAttrs.isNewZA() || FnAttrs.isNewZT0()) - Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs, TLI); + Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs); return Changed; } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 934f68b29922..271094f935e0 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// #include "AArch64SMEAttributes.h" -#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/IR/RuntimeLibcalls.h" #include <cassert> using namespace llvm; @@ -79,36 +77,19 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= encodeZT0State(StateValue::New); } -void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName, - const TargetLowering &TLI) { - RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName); - if (Impl == RTLIB::Unsupported) - return; - RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); +void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName) { unsigned KnownAttrs = SMEAttrs::Normal; - switch (LC) { - case RTLIB::SMEABI_SME_STATE: - case RTLIB::SMEABI_TPIDR2_SAVE: - case 
RTLIB::SMEABI_GET_CURRENT_VG: - case RTLIB::SMEABI_SME_STATE_SIZE: - case RTLIB::SMEABI_SME_SAVE: - case RTLIB::SMEABI_SME_RESTORE: - KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; - break; - case RTLIB::SMEABI_ZA_DISABLE: - case RTLIB::SMEABI_TPIDR2_RESTORE: + if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") + KnownAttrs |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); + if (FuncName == "__arm_tpidr2_restore") KnownAttrs |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | SMEAttrs::SME_ABI_Routine; - break; - case RTLIB::SC_MEMCPY: - case RTLIB::SC_MEMMOVE: - case RTLIB::SC_MEMSET: - case RTLIB::SC_MEMCHR: + if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" || + FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr") KnownAttrs |= SMEAttrs::SM_Compatible; - break; - default: - break; - } + if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" || + FuncName == "__arm_sme_state_size") + KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; set(KnownAttrs); } @@ -129,11 +110,11 @@ bool SMECallAttrs::requiresSMChange() const { return true; } -SMECallAttrs::SMECallAttrs(const CallBase &CB, const TargetLowering *TLI) +SMECallAttrs::SMECallAttrs(const CallBase &CB) : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal), Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) { if (auto *CalledFunction = CB.getCalledFunction()) - CalledFn = SMEAttrs(*CalledFunction, TLI); + CalledFn = SMEAttrs(*CalledFunction, SMEAttrs::InferAttrsFromName::Yes); // FIXME: We probably should not allow SME attributes on direct calls but // clang duplicates streaming mode attributes at each callsite. 
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index 06376c74025f..f1be0ecbee7e 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -13,8 +13,6 @@ namespace llvm { -class TargetLowering; - class Function; class CallBase; class AttributeList; @@ -50,17 +48,17 @@ public: CallSiteFlags_Mask = ZT0_Undef }; + enum class InferAttrsFromName { No, Yes }; + SMEAttrs() = default; SMEAttrs(unsigned Mask) { set(Mask); } - SMEAttrs(const Function &F, const TargetLowering *TLI = nullptr) + SMEAttrs(const Function &F, InferAttrsFromName Infer = InferAttrsFromName::No) : SMEAttrs(F.getAttributes()) { - if (TLI) - addKnownFunctionAttrs(F.getName(), *TLI); + if (Infer == InferAttrsFromName::Yes) + addKnownFunctionAttrs(F.getName()); } SMEAttrs(const AttributeList &L); - SMEAttrs(StringRef FuncName, const TargetLowering &TLI) { - addKnownFunctionAttrs(FuncName, TLI); - }; + SMEAttrs(StringRef FuncName) { addKnownFunctionAttrs(FuncName); }; void set(unsigned M, bool Enable = true); @@ -148,7 +146,7 @@ public: } private: - void addKnownFunctionAttrs(StringRef FuncName, const TargetLowering &TLI); + void addKnownFunctionAttrs(StringRef FuncName); }; /// SMECallAttrs is a utility class to hold the SMEAttrs for a callsite. It has @@ -165,7 +163,7 @@ public: SMEAttrs Callsite = SMEAttrs::Normal) : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {} - SMECallAttrs(const CallBase &CB, const TargetLowering *TLI); + SMECallAttrs(const CallBase &CB); SMEAttrs &caller() { return CallerFn; } SMEAttrs &callee() { return IsIndirect ? 
Callsite : CalledFn; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 007b481f8496..0059a862ba9b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; -struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); -}; - void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &); extern char &SIOptimizeExecMaskingPreRAID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index b6c6d927d0e8..6ddfa386e8ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,7 +29,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5a6ad405a026..8c56c2162112 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -724,10 +724,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}}); addRulesForGOpcs({G_PTR_ADD}) - .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) - .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) - .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}) - .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}}); + .Any({{UniPtr32}, {{SgprPtr32}, 
{SgprPtr32, Sgpr32}}}) + .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}}) + .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}}) + .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}}); addRulesForGOpcs({G_INTTOPTR}) .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c1f17033d04a..e393aa198774 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -848,8 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (Level == OptimizationLevel::O0) return; - PM.addPass(AMDGPUUnifyMetadataPass()); - // We don't want to run internalization at per-module stage. if (InternalizeSymbols && !isLTOPreLink(Phase)) { PM.addPass(InternalizePass(mustPreserveGV)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp deleted file mode 100644 index e400491c3860..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ /dev/null @@ -1,119 +0,0 @@ -//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// This pass that unifies multiple OpenCL metadata due to linking. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" - -using namespace llvm; - -namespace { - - namespace kOCLMD { - - const char SpirVer[] = "opencl.spir.version"; - const char OCLVer[] = "opencl.ocl.version"; - const char UsedExt[] = "opencl.used.extensions"; - const char UsedOptCoreFeat[] = "opencl.used.optional.core.features"; - const char CompilerOptions[] = "opencl.compiler.options"; - const char LLVMIdent[] = "llvm.ident"; - - } // end namespace kOCLMD - - /// Unify version metadata. - /// \return true if changes are made. - /// Assume the named metadata has operands each of which is a pair of - /// integer constant, e.g. - /// !Name = {!n1, !n2} - /// !n1 = {i32 1, i32 2} - /// !n2 = {i32 2, i32 0} - /// Keep the largest version as the sole operand if PickFirst is false. - /// Otherwise pick it from the first value, representing kernel module. - bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) { - auto *NamedMD = M.getNamedMetadata(Name); - if (!NamedMD || NamedMD->getNumOperands() <= 1) - return false; - MDNode *MaxMD = nullptr; - auto MaxVer = 0U; - for (auto *VersionMD : NamedMD->operands()) { - assert(VersionMD->getNumOperands() == 2); - auto *CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0)); - auto VersionMajor = CMajor->getZExtValue(); - auto *CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1)); - auto VersionMinor = CMinor->getZExtValue(); - auto Ver = (VersionMajor * 100) + (VersionMinor * 10); - if (Ver > MaxVer) { - MaxVer = Ver; - MaxMD = VersionMD; - } - if (PickFirst) - break; - } - NamedMD->eraseFromParent(); - NamedMD = M.getOrInsertNamedMetadata(Name); - NamedMD->addOperand(MaxMD); - return true; - } - - /// Unify version metadata. - /// \return true if changes are made. 
- /// Assume the named metadata has operands each of which is a list e.g. - /// !Name = {!n1, !n2} - /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}} - /// !n2 = !{!"cl_khr_image"} - /// Combine it into a single list with unique operands. - bool unifyExtensionMD(Module &M, StringRef Name) { - auto *NamedMD = M.getNamedMetadata(Name); - if (!NamedMD || NamedMD->getNumOperands() == 1) - return false; - - SmallVector<Metadata *, 4> All; - for (auto *MD : NamedMD->operands()) - for (const auto &Op : MD->operands()) - if (!llvm::is_contained(All, Op.get())) - All.push_back(Op.get()); - - NamedMD->eraseFromParent(); - NamedMD = M.getOrInsertNamedMetadata(Name); - for (const auto &MD : All) - NamedMD->addOperand(MDNode::get(M.getContext(), MD)); - - return true; - } - - /// Unify multiple OpenCL metadata due to linking. - bool unifyMetadataImpl(Module &M) { - const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer}; - const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat, - kOCLMD::CompilerOptions, kOCLMD::LLVMIdent}; - - bool Changed = false; - - for (auto &I : Vers) - Changed |= unifyVersionMD(M, I, true); - - for (auto &I : Exts) - Changed |= unifyExtensionMD(M, I); - - return Changed; - } - - } // end anonymous namespace - - PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M, - ModuleAnalysisManager &AM) { - return unifyMetadataImpl(M) ? 
PreservedAnalyses::none() - : PreservedAnalyses::all(); - } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c466f9cf0f35..dc9dd220130e 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -114,7 +114,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetTransformInfo.cpp AMDGPUWaitSGPRHazards.cpp AMDGPUUnifyDivergentExitNodes.cpp - AMDGPUUnifyMetadata.cpp R600MachineCFGStructurizer.cpp GCNCreateVOPD.cpp GCNDPPCombine.cpp diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 2d0102fffe5e..7c019031ff24 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -197,7 +197,7 @@ enum ClassFlags : unsigned { namespace AMDGPU { enum OperandType : unsigned { - /// Operands with register or 32-bit immediate + /// Operands with register, 32-bit, or 64-bit immediate OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET, OPERAND_REG_IMM_INT64, OPERAND_REG_IMM_INT16, @@ -407,7 +407,7 @@ enum CPol { SCAL = 1 << 11, // Scale offset bit - ALL = TH | SCOPE, + ALL = TH | SCOPE | NV, // Helper bits TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy @@ -440,6 +440,7 @@ enum Id { // Message ID, width(4) [3:0]. 
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 + ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250 ID_GET_DDID = 11, // added in GFX10, removed in GFX11 ID_SYSMSG = 15, @@ -513,6 +514,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID2 = 24, ID_POPS_PACKER = 25, ID_PERF_SNAPSHOT_DATA_gfx11 = 27, + ID_IB_STS2 = 28, ID_SHADER_CYCLES = 29, ID_SHADER_CYCLES_HI = 30, ID_DVGPR_ALLOC_LO = 31, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2e76225bbc54..f58fde421f77 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16894,6 +16894,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { + // Check if we cannot determine the bit size of the given value type. This + // can happen, for example, in this situation where we have an empty struct + // (size 0): `call void asm "", "v"({} poison)`- + if (VT == MVT::Other) + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); const unsigned BitWidth = VT.getSizeInBits(); switch (Constraint[0]) { default: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 9278b859a806..c425d9753dd1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2708,7 +2708,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { isModifierType<Src2VT>.ret, HasOMod); field bit HasNeg = HasModifiers; - field bit HasMatrixReuse = 0; field bit HasMatrixFMT = 0; field bit HasMatrixScale = 0; field bit HasMatrixReuse = 0; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f8878f32f829..f7a9a584a6b5 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ 
b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -57,6 +57,7 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -76,6 +77,7 @@ private: LiveIntervals *LIS = nullptr; LiveVariables *LV = nullptr; MachineDominatorTree *MDT = nullptr; + MachinePostDominatorTree *PDT = nullptr; MachineRegisterInfo *MRI = nullptr; SetVector<MachineInstr*> LoweredEndCf; DenseSet<Register> LoweredIf; @@ -138,8 +140,8 @@ private: public: SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV, - MachineDominatorTree *MDT) - : LIS(LIS), LV(LV), MDT(MDT) {} + MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) + : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {} bool run(MachineFunction &MF); }; @@ -159,6 +161,7 @@ public: AU.addUsedIfAvailable<LiveIntervalsWrapperPass>(); // Should preserve the same set that TwoAddressInstructions does. 
AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<LiveVariablesWrapperPass>(); @@ -506,13 +509,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock *SplitBB = &MBB; if (NeedBlockSplit) { SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS); - if (MDT && SplitBB != &MBB) { - MachineDomTreeNode *MBBNode = (*MDT)[&MBB]; - SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(), - MBBNode->end()); - MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB); - for (MachineDomTreeNode *Child : Children) - MDT->changeImmediateDominator(Child, SplitBBNode); + if (SplitBB != &MBB && (MDT || PDT)) { + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 16> DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); + if (MDT) + MDT->applyUpdates(DTUpdates); + if (PDT) + PDT->applyUpdates(DTUpdates); } Opcode = OrTermrOpc; InsPt = MI; @@ -727,26 +735,27 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { MachineBasicBlock *Succ = *MBB.succ_begin(); MachineBasicBlock *FallThrough = nullptr; + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 8> DTUpdates; + while (!MBB.predecessors().empty()) { MachineBasicBlock *P = *MBB.pred_begin(); if (P->getFallThrough(false) == &MBB) FallThrough = P; P->ReplaceUsesOfBlockWith(&MBB, Succ); + DTUpdates.push_back({DomTreeT::Insert, P, Succ}); + DTUpdates.push_back({DomTreeT::Delete, P, &MBB}); } MBB.removeSuccessor(Succ); if (LIS) { for (auto &I : MBB.instrs()) LIS->RemoveMachineInstrFromMaps(I); } - if (MDT) { - // If Succ, the single successor of MBB, is 
dominated by MBB, MDT needs - // updating by changing Succ's idom to the one of MBB; otherwise, MBB must - // be a leaf node in MDT and could be erased directly. - if (MDT->dominates(&MBB, Succ)) - MDT->changeImmediateDominator(MDT->getNode(Succ), - MDT->getNode(&MBB)->getIDom()); - MDT->eraseNode(&MBB); - } + if (MDT) + MDT->applyUpdates(DTUpdates); + if (PDT) + PDT->applyUpdates(DTUpdates); + MBB.clear(); MBB.eraseFromParent(); if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { @@ -875,7 +884,11 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr; auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; - return SILowerControlFlow(LIS, LV, MDT).run(MF); + auto *PDTWrapper = + getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); + MachinePostDominatorTree *PDT = + PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr; + return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); } PreservedAnalyses @@ -885,13 +898,16 @@ SILowerControlFlowPass::run(MachineFunction &MF, LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF); MachineDominatorTree *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF); + MachinePostDominatorTree *PDT = + MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF); - bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF); + bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); if (!Changed) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); PA.preserve<MachineDominatorTreeAnalysis>(); + PA.preserve<MachinePostDominatorTreeAnalysis>(); PA.preserve<SlotIndexesAnalysis>(); PA.preserve<LiveIntervalsAnalysis>(); PA.preserve<LiveVariablesAnalysis>(); diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index c2f4dbfa247d..a003a46191a8 
100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1665,7 +1665,9 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; -def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; +def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> { + let SubtargetPredicate = isNotGFX1250Plus; +} // On SI the documentation says sleep for approximately 64 * low 2 // bits, consistent with the reported maximum of 448. On VI the diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 3d9455fc51a3..c740b5e0f09d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -106,7 +106,7 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus}, {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus}, - {{""}}, + {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250}, {{"MSG_SYSMSG"}, ID_SYSMSG}, {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus}, {{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus}, @@ -195,7 +195,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, {{""}}, {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, - {{""}}, + {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index d386c917a256..8ea567cfb9d3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ 
b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -587,167 +587,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, } } - // RTLIB - if (TM.isAAPCS_ABI() && (TT.isTargetAEABI() || TT.isTargetGNUAEABI() || - TT.isTargetMuslAEABI() || TT.isAndroid())) { - // FIXME: This does not depend on the subtarget and should go directly into - // RuntimeLibcalls. This is only here because of missing support for setting - // the calling convention of an implementation. - // clang-format off - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } LibraryCalls[] = { - // Double-precision floating-point arithmetic helper functions - // RTABI chapter 4.1.2, Table 2 - { RTLIB::ADD_F64, RTLIB::__aeabi_dadd }, - { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv }, - { RTLIB::MUL_F64, RTLIB::__aeabi_dmul }, - { RTLIB::SUB_F64, RTLIB::__aeabi_dsub }, - - // Double-precision floating-point comparison helper functions - // RTABI chapter 4.1.2, Table 3 - { RTLIB::OEQ_F64, RTLIB::__aeabi_dcmpeq__oeq }, - { RTLIB::UNE_F64, RTLIB::__aeabi_dcmpeq__une }, - { RTLIB::OLT_F64, RTLIB::__aeabi_dcmplt }, - { RTLIB::OLE_F64, RTLIB::__aeabi_dcmple }, - { RTLIB::OGE_F64, RTLIB::__aeabi_dcmpge }, - { RTLIB::OGT_F64, RTLIB::__aeabi_dcmpgt }, - { RTLIB::UO_F64, RTLIB::__aeabi_dcmpun }, - - // Single-precision floating-point arithmetic helper functions - // RTABI chapter 4.1.2, Table 4 - { RTLIB::ADD_F32, RTLIB::__aeabi_fadd }, - { RTLIB::DIV_F32, RTLIB::__aeabi_fdiv }, - { RTLIB::MUL_F32, RTLIB::__aeabi_fmul }, - { RTLIB::SUB_F32, RTLIB::__aeabi_fsub }, - - // Single-precision floating-point comparison helper functions - // RTABI chapter 4.1.2, Table 5 - { RTLIB::OEQ_F32, RTLIB::__aeabi_fcmpeq__oeq }, - { RTLIB::UNE_F32, RTLIB::__aeabi_fcmpeq__une }, - { RTLIB::OLT_F32, RTLIB::__aeabi_fcmplt}, - { RTLIB::OLE_F32, RTLIB::__aeabi_fcmple }, - { RTLIB::OGE_F32, RTLIB::__aeabi_fcmpge }, - { RTLIB::OGT_F32, RTLIB::__aeabi_fcmpgt }, - { RTLIB::UO_F32, RTLIB::__aeabi_fcmpun }, - - // 
Floating-point to integer conversions. - // RTABI chapter 4.1.2, Table 6 - { RTLIB::FPTOSINT_F64_I32, RTLIB::__aeabi_d2iz }, - { RTLIB::FPTOUINT_F64_I32, RTLIB::__aeabi_d2uiz }, - { RTLIB::FPTOSINT_F64_I64, RTLIB::__aeabi_d2lz }, - { RTLIB::FPTOUINT_F64_I64, RTLIB::__aeabi_d2ulz }, - { RTLIB::FPTOSINT_F32_I32, RTLIB::__aeabi_f2iz }, - { RTLIB::FPTOUINT_F32_I32, RTLIB::__aeabi_f2uiz }, - { RTLIB::FPTOSINT_F32_I64, RTLIB::__aeabi_f2lz }, - { RTLIB::FPTOUINT_F32_I64, RTLIB::__aeabi_f2ulz }, - - // Conversions between floating types. - // RTABI chapter 4.1.2, Table 7 - { RTLIB::FPROUND_F64_F32, RTLIB::__aeabi_d2f }, - { RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h }, - { RTLIB::FPEXT_F32_F64, RTLIB::__aeabi_f2d }, - - // Integer to floating-point conversions. - // RTABI chapter 4.1.2, Table 8 - { RTLIB::SINTTOFP_I32_F64, RTLIB::__aeabi_i2d }, - { RTLIB::UINTTOFP_I32_F64, RTLIB::__aeabi_ui2d }, - { RTLIB::SINTTOFP_I64_F64, RTLIB::__aeabi_l2d }, - { RTLIB::UINTTOFP_I64_F64, RTLIB::__aeabi_ul2d }, - { RTLIB::SINTTOFP_I32_F32, RTLIB::__aeabi_i2f }, - { RTLIB::UINTTOFP_I32_F32, RTLIB::__aeabi_ui2f }, - { RTLIB::SINTTOFP_I64_F32, RTLIB::__aeabi_l2f }, - { RTLIB::UINTTOFP_I64_F32, RTLIB::__aeabi_ul2f }, - - // Long long helper functions - // RTABI chapter 4.2, Table 9 - { RTLIB::MUL_I64, RTLIB::__aeabi_lmul }, - { RTLIB::SHL_I64, RTLIB::__aeabi_llsl }, - { RTLIB::SRL_I64, RTLIB::__aeabi_llsr }, - { RTLIB::SRA_I64, RTLIB::__aeabi_lasr }, - - // Integer division functions - // RTABI chapter 4.3.1 - { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv }, - { RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod }, - { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv }, - { RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod }, - }; - // clang-format on - - for (const auto &LC : LibraryCalls) - setLibcallImpl(LC.Op, LC.Impl); - - // EABI dependent RTLIB - if (TM.Options.EABIVersion == EABI::EABI4 || - TM.Options.EABIVersion == EABI::EABI5) { - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } 
MemOpsLibraryCalls[] = { - // Memory operations - // RTABI chapter 4.3.4 - {RTLIB::MEMCPY, RTLIB::__aeabi_memcpy}, - {RTLIB::MEMMOVE, RTLIB::__aeabi_memmove}, - {RTLIB::MEMSET, RTLIB::__aeabi_memset}, - {RTLIB::AEABI_MEMCPY4, RTLIB::__aeabi_memcpy4}, - {RTLIB::AEABI_MEMCPY8, RTLIB::__aeabi_memcpy8}, - {RTLIB::AEABI_MEMMOVE4, RTLIB::__aeabi_memmove4}, - {RTLIB::AEABI_MEMMOVE8, RTLIB::__aeabi_memmove8}, - {RTLIB::AEABI_MEMSET4, RTLIB::__aeabi_memset4}, - {RTLIB::AEABI_MEMSET8, RTLIB::__aeabi_memset8}, - {RTLIB::AEABI_MEMCLR, RTLIB::__aeabi_memclr}, - {RTLIB::AEABI_MEMCLR4, RTLIB::__aeabi_memclr4}, - {RTLIB::AEABI_MEMCLR8, RTLIB::__aeabi_memclr8}, - }; - - for (const auto &LC : MemOpsLibraryCalls) - setLibcallImpl(LC.Op, LC.Impl); - } - } - - // The half <-> float conversion functions are always soft-float on - // non-watchos platforms, but are needed for some targets which use a - // hard-float calling convention by default. - if (!TT.isWatchABI()) { - if (TM.isAAPCS_ABI()) { - setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_AAPCS); - } else { - setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_APCS); - } - } - - // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have - // a __gnu_ prefix (which is the default). 
- if (TT.isTargetAEABI()) { - // FIXME: This does not depend on the subtarget and should go directly into - // RuntimeLibcalls. This is only here because of missing support for setting - // the calling convention of an implementation. - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - } LibraryCalls[] = { - {RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h}, - {RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f}, - }; - - for (const auto &LC : LibraryCalls) { - setLibcallImpl(LC.Op, LC.Impl); - } - } else if (!TT.isOSBinFormatMachO()) { - setLibcallImpl(RTLIB::FPROUND_F32_F16, RTLIB::__gnu_f2h_ieee); - setLibcallImpl(RTLIB::FPEXT_F16_F32, RTLIB::__gnu_h2f_ieee); - } - if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else @@ -7406,7 +7245,7 @@ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; // If the mask is twice as long as the input vector then we need to check the @@ -7438,7 +7277,7 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { @@ -7541,7 +7380,7 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return false; unsigned NumElts = VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { @@ -7574,7 +7413,7 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = 
VT.getVectorNumElements(); - if (M.size() != NumElts && M.size() != NumElts*2) + if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 393cf2d97380..6b2854171c81 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1084,9 +1084,10 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost( CostKind, Op1Info, Op2Info, I); } -InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy, - ScalarEvolution *SE, - const SCEV *Ptr) const { +InstructionCost +ARMTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -1103,7 +1104,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy, // addressing mode. return 1; } - return BaseT::getAddressComputationCost(PtrTy, SE, Ptr); + return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind); } bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const { @@ -1335,6 +1336,39 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (!Mask.empty()) { std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy); + // Check for LD2/LD4 instructions, which are represented in llvm IR as + // deinterleaving-shuffle(load). The shuffle cost could potentially be + // free, but we model it with a cost of LT.first so that LD2/LD4 have a + // higher cost than just the load. 
+ if (Args.size() >= 1 && isa<LoadInst>(Args[0]) && + (LT.second.getScalarSizeInBits() == 8 || + LT.second.getScalarSizeInBits() == 16 || + LT.second.getScalarSizeInBits() == 32) && + LT.second.getSizeInBits() == 128 && + ((TLI->getMaxSupportedInterleaveFactor() >= 2 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) || + (TLI->getMaxSupportedInterleaveFactor() == 4 && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))) + return ST->getMVEVectorCostFactor(CostKind) * + std::max<InstructionCost>(1, LT.first / 4); + + // Check for ST2/ST4 instructions, which are represented in llvm IR as + // store(interleaving-shuffle). The shuffle cost could potentially be + // free, but we model it with a cost of LT.first so that ST2/ST4 have a + // higher cost than just the store. + if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) && + (LT.second.getScalarSizeInBits() == 8 || + LT.second.getScalarSizeInBits() == 16 || + LT.second.getScalarSizeInBits() == 32) && + LT.second.getSizeInBits() == 128 && + ((TLI->getMaxSupportedInterleaveFactor() >= 2 && + ShuffleVectorInst::isInterleaveMask( + Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) || + (TLI->getMaxSupportedInterleaveFactor() == 4 && + ShuffleVectorInst::isInterleaveMask( + Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2)))) + return ST->getMVEVectorCostFactor(CostKind) * LT.first; + if (LT.second.isVector() && Mask.size() <= LT.second.getVectorNumElements() && (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 522c235a90a8..cdd8bcb9f741 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -257,8 +257,9 @@ public: unsigned Index, const Value *Op0, const Value *Op1) const override; - InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, - const SCEV *Ptr) const 
override; + InstructionCost + getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index ece6c10e828d..0e974838a7c6 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -3373,12 +3373,12 @@ public: void addMSRMaskOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask()))); + Inst.addOperand(MCOperand::createImm(getMSRMask())); } void addBankedRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(unsigned(getBankedReg()))); + Inst.addOperand(MCOperand::createImm(getBankedReg())); } void addProcIFlagsOperands(MCInst &Inst, unsigned N) const { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 5c212816fbdb..171e2949366a 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -156,9 +156,10 @@ HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); } -InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, - ScalarEvolution *SE, - const SCEV *S) const { +InstructionCost +HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *S, + TTI::TargetCostKind CostKind) const { return 0; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 0a5766d1dadf..dbf16c99c314 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -111,8 +111,9 @@ public: InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override; - InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, - const SCEV *S) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *S, + TTI::TargetCostKind CostKind) const override; InstructionCost getMemoryOpCost( unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 14472419a10f..a2a41d0062ff 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2786,7 +2786,7 @@ SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op, EVT RetVT = Op.getValueType(); RTLIB::Libcall LC = RTLIB::getUINTTOFP(OpVT, RetVT); MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true); + CallOptions.setTypeListBeforeSoften(OpVT, RetVT); SDValue Chain = SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -2811,7 +2811,7 @@ SDValue LoongArchTargetLowering::lowerSINT_TO_FP(SDValue Op, EVT RetVT = Op.getValueType(); RTLIB::Libcall LC = RTLIB::getSINTTOFP(OpVT, RetVT); MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true); + CallOptions.setTypeListBeforeSoften(OpVT, RetVT); SDValue Chain = SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -4107,7 +4107,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( LC = RTLIB::getFPTOSINT(Src.getValueType(), VT); MakeLibCallOptions CallOptions; EVT OpVT = Src.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, VT, true); + CallOptions.setTypeListBeforeSoften(OpVT, VT); SDValue Chain = SDValue(); 
SDValue Result; std::tie(Result, Chain) = @@ -4360,7 +4360,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( RTLIB::Libcall LC = OpVT == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32; MakeLibCallOptions CallOptions; - CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true); + CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64); SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first; Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result); Results.push_back(Result); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index d8bb16fe9b94..0696b11d62ac 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1640,6 +1640,24 @@ defm : PairInsertExtractPatV8<v8f32, f32>; defm : PairInsertExtractPatV4<v4i64, GRLenVT>; defm : PairInsertExtractPatV4<v4f64, f64>; +def : Pat<(vector_insert v8i32:$xd, (GRLenVT(vector_extract v8i32:$xj, 0)), + uimm3:$imm), + (XVINSVE0_W v8i32:$xd, v8i32:$xj, uimm3:$imm)>; + +def : Pat<(vector_insert v4i64:$xd, (GRLenVT(vector_extract v4i64:$xj, 0)), + uimm2:$imm), + (XVINSVE0_D v4i64:$xd, v4i64:$xj, uimm2:$imm)>; + +def : Pat<(vector_insert v8i32:$xd, + (GRLenVT(vector_extract v8i32:$xj, uimm3:$imm1)), uimm3:$imm2), + (XVINSVE0_W v8i32:$xd, (XVPICKVE_W v8i32:$xj, uimm3:$imm1), + uimm3:$imm2)>; + +def : Pat<(vector_insert v4i64:$xd, + (GRLenVT(vector_extract v4i64:$xj, uimm2:$imm1)), uimm2:$imm2), + (XVINSVE0_D v4i64:$xd, (XVPICKVE_D v4i64:$xj, uimm2:$imm1), + uimm2:$imm2)>; + // PseudoXVINSGR2VR_{B/H} def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 18aeda6a7935..2445005bf98c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -70,7 +70,7 @@ 
NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const { } bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const { - return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N); + return Subtarget->getTargetLowering()->usePrecSqrtF32(N); } bool NVPTXDAGToDAGISel::useF32FTZ() const { @@ -82,11 +82,6 @@ bool NVPTXDAGToDAGISel::allowFMA() const { return TL->allowFMA(*MF, OptLevel); } -bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const { - const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); - return TL->allowUnsafeFPMath(*MF); -} - bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; } /// Select - Select instructions not customized! Used for diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 357e915fd077..65731722f534 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -44,7 +44,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool usePrecSqrtF32(const SDNode *N) const; bool useF32FTZ() const; bool allowFMA() const; - bool allowUnsafeFPMath() const; bool doRsqrtOpt() const; NVPTXScopes Scopes{}; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 3daf25d55152..b94cbd0bd9c1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -125,10 +125,6 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF, if (UsePrecDivF32.getNumOccurrences() > 0) return UsePrecDivF32; - // Otherwise, use div.approx if fast math is enabled - if (allowUnsafeFPMath(MF)) - return NVPTX::DivPrecisionLevel::Approx; - const SDNodeFlags Flags = N.getFlags(); if (Flags.hasApproximateFuncs()) return NVPTX::DivPrecisionLevel::Approx; @@ -136,16 +132,11 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF, return NVPTX::DivPrecisionLevel::IEEE754; } -bool NVPTXTargetLowering::usePrecSqrtF32(const 
MachineFunction &MF, - const SDNode *N) const { +bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const { // If nvptx-prec-sqrtf32 is used on the command-line, always honor it if (UsePrecSqrtF32.getNumOccurrences() > 0) return UsePrecSqrtF32; - // Otherwise, use sqrt.approx if fast math is enabled - if (allowUnsafeFPMath(MF)) - return false; - if (N) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasApproximateFuncs()) @@ -1193,8 +1184,7 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, bool &UseOneConst, bool Reciprocal) const { if (!(Enabled == ReciprocalEstimate::Enabled || - (Enabled == ReciprocalEstimate::Unspecified && - !usePrecSqrtF32(DAG.getMachineFunction())))) + (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) return SDValue(); if (ExtraSteps == ReciprocalEstimate::Unspecified) @@ -2851,8 +2841,7 @@ static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) { SDLoc(Op), Opcode, DAG); } -static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG, - bool AllowUnsafeFPMath) { +static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) { // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)), // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches // the semantics of LLVM's frem. 
@@ -2869,7 +2858,7 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG, SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul, Flags | SDNodeFlags::AllowContract); - if (AllowUnsafeFPMath || Flags.hasNoInfs()) + if (Flags.hasNoInfs()) return Sub; // If Y is infinite, return X @@ -3014,7 +3003,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return lowerCTLZCTPOP(Op, DAG); case ISD::FREM: - return lowerFREM(Op, DAG, allowUnsafeFPMath(DAG.getMachineFunction())); + return lowerFREM(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); @@ -4868,17 +4857,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) return true; - return allowUnsafeFPMath(MF); -} - -bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const { - // Honor TargetOptions flags that explicitly say unsafe math is okay. - if (MF.getTarget().Options.UnsafeFPMath) - return true; - - // Allow unsafe math if unsafe-fp-math attribute explicitly says so. - const Function &F = MF.getFunction(); - return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); + return false; } static bool isConstZero(const SDValue &Operand) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 43e721a9c2a4..27f099e22097 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -206,8 +206,7 @@ public: // Get whether we should use a precise or approximate 32-bit floating point // sqrt instruction. - bool usePrecSqrtF32(const MachineFunction &MF, - const SDNode *N = nullptr) const; + bool usePrecSqrtF32(const SDNode *N = nullptr) const; // Get whether we should use instructions that flush floating-point denormals // to sign-preserving zero. 
@@ -220,7 +219,6 @@ public: unsigned combineRepeatedFPDivisors() const override { return 2; } bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const; - bool allowUnsafeFPMath(const MachineFunction &MF) const; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT) const override { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index bd54d1db9156..ebb5e32f5e6f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1133,9 +1133,8 @@ defm FMA_F64 : FMA<F64RT, allow_ftz = false>; // sin/cos/tanh class UnaryOpAllowsApproxFn<SDPatternOperator operator> - : PatFrag<(ops node:$A), - (operator node:$A), [{ - return allowUnsafeFPMath() || N->getFlags().hasApproximateFuncs(); + : PatFrag<(ops node:$A), (operator node:$A), [{ + return N->getFlags().hasApproximateFuncs(); }]>; def SIN_APPROX_f32 : diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 599865312920..9e1530a2d00f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -18,6 +18,7 @@ #include "RISCVInstrInfo.h" #include "RISCVSelectionDAGInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Debug.h" @@ -772,6 +773,49 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { return false; } +// (xor X, (and (xor X, C1), C2)) +// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt) +// where C2 is a shifted mask with width=Width and shift=ShAmt +bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) { + + if (!Subtarget->hasVendorXqcibm()) + return false; + + using namespace SDPatternMatch; + + SDValue X; + APInt CImm, CMask; + if (!sd_match( + Node, + m_Xor(m_Value(X), + m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))), + 
m_ConstInt(CMask)))))) + return false; + + unsigned Width, ShAmt; + if (!CMask.isShiftedMask(ShAmt, Width)) + return false; + + int64_t Imm = CImm.getSExtValue(); + Imm >>= ShAmt; + + SDLoc DL(Node); + SDValue ImmNode; + auto Opc = RISCV::QC_INSB; + + if (isInt<5>(Imm)) { + Opc = RISCV::QC_INSBI; + ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32); + } else { + ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget); + } + SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32), + CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)}; + ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops)); + + return true; +} + bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, @@ -1349,6 +1393,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (tryShrinkShlLogicImm(Node)) return; + if (tryBitfieldInsertOpFromXor(Node)) + return; + break; case ISD::AND: { auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index ee3a86e25add..9d4cd0e6e339 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -75,6 +75,7 @@ public: bool trySignedBitfieldExtract(SDNode *Node); bool trySignedBitfieldInsertInSign(SDNode *Node); bool trySignedBitfieldInsertInMask(SDNode *Node); + bool tryBitfieldInsertOpFromXor(SDNode *Node); bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, unsigned Lsb); bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8bc42ad8758c..4f52f68d35aa 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14333,7 +14333,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, LC = 
RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0)); MakeLibCallOptions CallOptions; EVT OpVT = Op0.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true); + CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0)); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); SDValue Result; std::tie(Result, Chain) = @@ -14368,7 +14368,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Op0.getValueType() == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32; MakeLibCallOptions CallOptions; EVT OpVT = Op0.getValueType(); - CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true); + CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64); SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first; Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result); Results.push_back(Result); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 8bd383033f11..2a34a24a6ae2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1694,6 +1694,20 @@ multiclass SelectCC_GPR_riirr<DAGOperand valty, DAGOperand imm> { valty:$truev, valty:$falsev), []>; } +let Predicates = [IsRV32] in { +def : Pat<(i32 (seteq (i32 (and GPR:$rs1, 0xffffffff80000000)), 0)), + (XORI (i32 (SRLI GPR:$rs1, 31)), 1)>; +def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible +} +let Predicates = [IsRV64] in { +def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x8000000000000000)), 0)), + (XORI (i64 (SRLI GPR:$rs1, 63)), 1)>; +def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)), + (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>; +def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible +def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>; +} + /// Branches and jumps // Match `riscv_brcc` and lower to the appropriate RISC-V branch instruction. 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 8297d5050ced..d17330f9da9f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -98,6 +98,14 @@ class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType> let Inst{27} = 0b0; } +class RVPShiftD_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm6> { + bits<6> shamt; + + let Inst{26} = 0b1; + let Inst{25-20} = shamt; +} + class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr> : RVPShift_ri<f, funct3, opcodestr, uimm5> { bits<5> shamt; @@ -136,34 +144,36 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtP] in { -let IsSignExtendingOpW = 1 in -def CLS : Unary_r<0b011000000011, 0b001, "cls">; -def ABS : Unary_r<0b011000000111, 0b001, "abs">; + let IsSignExtendingOpW = 1 in + def CLS : Unary_r<0b011000000011, 0b001, "cls">; + def ABS : Unary_r<0b011000000111, 0b001, "abs">; } // Predicates = [HasStdExtP] -let Predicates = [HasStdExtP, IsRV32] in -def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">; + +let Predicates = [HasStdExtP, IsRV32] in { + def REV_RV32 : Unary_r<0b011010011111, 0b101, "rev">; +} // Predicates = [HasStdExtP, IsRV32] let Predicates = [HasStdExtP, IsRV64] in { -def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; -def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; + def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; + def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; -let IsSignExtendingOpW = 1 in { -def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; -def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; -} + let IsSignExtendingOpW = 1 in { + def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; + def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; + } } // Predicates = [HasStdExtP, IsRV64] let Predicates = 
[HasStdExtP] in { -def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">; -def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">; -def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">; + def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">; + def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">; + def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">; } // Predicates = [HasStdExtP] -let DecoderNamespace = "RV32Only", - Predicates = [HasStdExtP, IsRV32] in -def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">; +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">; +} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" let Predicates = [HasStdExtP, IsRV64] in { -def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">; -def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">; + def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">; + def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in @@ -174,16 +184,50 @@ let Predicates = [HasStdExtP] in def PLI_B : PLI_B_i<0b10110100, "pli.b">; let Predicates = [HasStdExtP] in { -def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; -def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">; -def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">; + def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; + def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">; + def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">; } // Predicates = [HasStdExtP] let Predicates = [HasStdExtP, IsRV64] in { -def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">; -def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">; + def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">; + def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in def PLUI_H : PLUI_i<0b1111000, "plui.h">; let Predicates = [HasStdExtP, IsRV64] in def PLUI_W : PLUI_i<0b1111001, 
"plui.w">; + +let Predicates = [HasStdExtP] in { + def PSRLI_B : RVPShiftB_ri<0b000, 0b100, "psrli.b">; + def PSRLI_H : RVPShiftH_ri<0b000, 0b100, "psrli.h">; + + def PUSATI_H : RVPShiftH_ri<0b010, 0b100, "pusati.h">; + + def PSRAI_B : RVPShiftB_ri<0b100, 0b100, "psrai.b">; + def PSRAI_H : RVPShiftH_ri<0b100, 0b100, "psrai.h">; + + def PSRARI_H : RVPShiftH_ri<0b101, 0b100, "psrari.h">; + + def PSATI_H : RVPShiftH_ri<0b110, 0b100, "psati.h">; +} // Predicates = [HasStdExtP] +let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in { + def USATI_RV32 : RVPShiftW_ri<0b010, 0b100, "usati">; + + def SRARI_RV32 : RVPShiftW_ri<0b101, 0b100, "srari">; + + def SATI_RV32 : RVPShiftW_ri<0b110, 0b100, "sati">; +} // Predicates = [HasStdExtP, IsRV32] +let Predicates = [HasStdExtP, IsRV64] in { + def PSRLI_W : RVPShiftW_ri<0b000, 0b100, "psrli.w">; + def PSRAI_W : RVPShiftW_ri<0b100, 0b100, "psrai.w">; + + def PUSATI_W : RVPShiftW_ri<0b010, 0b100, "pusati.w">; + def USATI_RV64 : RVPShiftD_ri<0b010, 0b100, "usati">; + + def PSRARI_W : RVPShiftW_ri<0b101, 0b100, "psrari.w">; + def SRARI_RV64 : RVPShiftD_ri<0b101, 0b100, "srari">; + + def PSATI_W : RVPShiftW_ri<0b110, 0b100, "psati.w">; + def SATI_RV64 : RVPShiftD_ri<0b110, 0b100, "sati">; +} // Predicates = [HasStdExtP, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td index 5ef858a787c7..8cf15fa26e22 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td +++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td @@ -24,7 +24,7 @@ let SchedModel = Andes45Model in { //===----------------------------------------------------------------------===// // Andes 45 series CPU -// - 2 Interger Arithmetic and Logical Units (ALU) +// - 2 Integer Arithmetic and Logical Units (ALU) // - Multiply / Divide Unit (MDU) // - Load Store Unit (LSU) // - Control and Status Register Unit (CSR) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp 
b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 28c8f401321f..f013898e8520 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -497,6 +497,10 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VANDN_VX: // Vector Reverse Bits in Elements case RISCV::VBREV_V: + // Vector Reverse Bits in Bytes + case RISCV::VBREV8_V: + // Vector Reverse Bytes + case RISCV::VREV8_V: // Vector Count Leading Zeros case RISCV::VCLZ_V: // Vector Count Trailing Zeros @@ -510,6 +514,13 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VROR_VI: case RISCV::VROR_VV: case RISCV::VROR_VX: + // Vector Carry-less Multiplication Instructions (Zvbc) + // Vector Carry-less Multiply + case RISCV::VCLMUL_VV: + case RISCV::VCLMUL_VX: + // Vector Carry-less Multiply Return High Half + case RISCV::VCLMULH_VV: + case RISCV::VCLMULH_VX: return MILog2SEW; // Vector Widening Shift Left Logical (Zvbb) @@ -1046,6 +1057,10 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VANDN_VX: // Vector Reverse Bits in Elements case RISCV::VBREV_V: + // Vector Reverse Bits in Bytes + case RISCV::VBREV8_V: + // Vector Reverse Bytes + case RISCV::VREV8_V: // Vector Count Leading Zeros case RISCV::VCLZ_V: // Vector Count Trailing Zeros @@ -1063,6 +1078,13 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VWSLL_VI: case RISCV::VWSLL_VX: case RISCV::VWSLL_VV: + // Vector Carry-less Multiplication Instructions (Zvbc) + // Vector Carry-less Multiply + case RISCV::VCLMUL_VV: + case RISCV::VCLMUL_VX: + // Vector Carry-less Multiply Return High Half + case RISCV::VCLMULH_VV: + case RISCV::VCLMULH_VX: // Vector Mask Instructions // Vector Mask-Register Logical Instructions // vmsbf.m set-before-first mask bit diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f366094c3195..97cdf5b784bc 100644 --- 
a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -15419,18 +15419,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( return SDValue(); } - // Avoid returning the same shuffle operation. For example, - // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, - // undef:v16i16 - if (CrossLaneMask == Mask || InLaneMask == Mask) - return SDValue(); - // Simplify CrossLaneMask based on the actual demanded elements. if (V1.hasOneUse()) for (int i = 0; i != NumElts; ++i) if (!DemandedCrossLane[i]) CrossLaneMask[i] = SM_SentinelUndef; + // Avoid returning the same shuffle operation. For example, + // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, + // undef:v16i16 + if (CrossLaneMask == Mask || InLaneMask == Mask) + return SDValue(); + SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), InLaneMask); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 9ef21faea2b6..cae6bb99d963 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5488,9 +5488,10 @@ InstructionCost X86TTIImpl::getPointersChainCost( return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); } -InstructionCost X86TTIImpl::getAddressComputationCost(Type *PtrTy, - ScalarEvolution *SE, - const SCEV *Ptr) const { +InstructionCost +X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. 
The resulting @@ -5513,7 +5514,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *PtrTy, return 1; } - return BaseT::getAddressComputationCost(PtrTy, SE, Ptr); + return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind); } InstructionCost diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index bc06c4746c3c..5718c0c9535f 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -194,8 +194,9 @@ public: getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override; - InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, - const SCEV *Ptr) const override; + InstructionCost + getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override; |
