diff options
| author | Koakuma <koachan@protonmail.com> | 2024-07-08 19:19:54 +0700 |
|---|---|---|
| committer | Koakuma <koachan@protonmail.com> | 2024-07-08 19:19:54 +0700 |
| commit | 5c4fdc2fd5898ebd9e89999a4f4b8aa289ca637f (patch) | |
| tree | f3b92a07f3dfc6e70f36d1000605f36a3c15af46 /llvm/lib/Target | |
| parent | dbda8e2f2cd8764e0badd983915d62a2c3377f4d (diff) | |
| parent | e9b8cd0c806db00f0981fb36717077c941426302 (diff) | |
[𝘀𝗽𝗿] changes introduced through rebase
users/koachan/spr/main.sparcias-enable-parseforallfeatures-in-matchoperandparserimpl
Created using spr 1.3.5
[skip ci]
Diffstat (limited to 'llvm/lib/Target')
340 files changed, 9891 insertions, 6289 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 5708b6173750..2c1a9cfa67a6 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -19,6 +19,7 @@ include "llvm/Target/Target.td" // Subtarget features. //===----------------------------------------------------------------------===// include "AArch64Features.td" +include "AArch64FMV.td" //===----------------------------------------------------------------------===// // Register File Description diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 0ec15d34cd4a..f2c38b09c648 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -46,6 +46,18 @@ static cl::opt<bool> GenerateThunks("arm64ec-generate-thunks", cl::Hidden, namespace { +enum ThunkArgTranslation : uint8_t { + Direct, + Bitcast, + PointerIndirection, +}; + +struct ThunkArgInfo { + Type *Arm64Ty; + Type *X64Ty; + ThunkArgTranslation Translation; +}; + class AArch64Arm64ECCallLowering : public ModulePass { public: static char ID; @@ -74,25 +86,30 @@ private: void getThunkType(FunctionType *FT, AttributeList AttrList, Arm64ECThunkType TT, raw_ostream &Out, - FunctionType *&Arm64Ty, FunctionType *&X64Ty); + FunctionType *&Arm64Ty, FunctionType *&X64Ty, + SmallVector<ThunkArgTranslation> &ArgTranslations); void getThunkRetType(FunctionType *FT, AttributeList AttrList, raw_ostream &Out, Type *&Arm64RetTy, Type *&X64RetTy, SmallVectorImpl<Type *> &Arm64ArgTypes, - SmallVectorImpl<Type *> &X64ArgTypes, bool &HasSretPtr); + SmallVectorImpl<Type *> &X64ArgTypes, + SmallVector<ThunkArgTranslation> &ArgTranslations, + bool &HasSretPtr); void getThunkArgTypes(FunctionType *FT, AttributeList AttrList, Arm64ECThunkType TT, raw_ostream &Out, SmallVectorImpl<Type *> &Arm64ArgTypes, - SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr); - void 
canonicalizeThunkType(Type *T, Align Alignment, bool Ret, - uint64_t ArgSizeBytes, raw_ostream &Out, - Type *&Arm64Ty, Type *&X64Ty); + SmallVectorImpl<Type *> &X64ArgTypes, + SmallVectorImpl<ThunkArgTranslation> &ArgTranslations, + bool HasSretPtr); + ThunkArgInfo canonicalizeThunkType(Type *T, Align Alignment, bool Ret, + uint64_t ArgSizeBytes, raw_ostream &Out); }; } // end anonymous namespace void AArch64Arm64ECCallLowering::getThunkType( FunctionType *FT, AttributeList AttrList, Arm64ECThunkType TT, - raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty) { + raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty, + SmallVector<ThunkArgTranslation> &ArgTranslations) { Out << (TT == Arm64ECThunkType::Entry ? "$ientry_thunk$cdecl$" : "$iexit_thunk$cdecl$"); @@ -111,10 +128,10 @@ void AArch64Arm64ECCallLowering::getThunkType( bool HasSretPtr = false; getThunkRetType(FT, AttrList, Out, Arm64RetTy, X64RetTy, Arm64ArgTypes, - X64ArgTypes, HasSretPtr); + X64ArgTypes, ArgTranslations, HasSretPtr); getThunkArgTypes(FT, AttrList, TT, Out, Arm64ArgTypes, X64ArgTypes, - HasSretPtr); + ArgTranslations, HasSretPtr); Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false); @@ -124,7 +141,8 @@ void AArch64Arm64ECCallLowering::getThunkType( void AArch64Arm64ECCallLowering::getThunkArgTypes( FunctionType *FT, AttributeList AttrList, Arm64ECThunkType TT, raw_ostream &Out, SmallVectorImpl<Type *> &Arm64ArgTypes, - SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr) { + SmallVectorImpl<Type *> &X64ArgTypes, + SmallVectorImpl<ThunkArgTranslation> &ArgTranslations, bool HasSretPtr) { Out << "$"; if (FT->isVarArg()) { @@ -153,17 +171,20 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes( for (int i = HasSretPtr ? 
1 : 0; i < 4; i++) { Arm64ArgTypes.push_back(I64Ty); X64ArgTypes.push_back(I64Ty); + ArgTranslations.push_back(ThunkArgTranslation::Direct); } // x4 Arm64ArgTypes.push_back(PtrTy); X64ArgTypes.push_back(PtrTy); + ArgTranslations.push_back(ThunkArgTranslation::Direct); // x5 Arm64ArgTypes.push_back(I64Ty); if (TT != Arm64ECThunkType::Entry) { // FIXME: x5 isn't actually used by the x64 side; revisit once we // have proper isel for varargs X64ArgTypes.push_back(I64Ty); + ArgTranslations.push_back(ThunkArgTranslation::Direct); } return; } @@ -187,18 +208,20 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes( uint64_t ArgSizeBytes = 0; Align ParamAlign = Align(); #endif - Type *Arm64Ty, *X64Ty; - canonicalizeThunkType(FT->getParamType(I), ParamAlign, - /*Ret*/ false, ArgSizeBytes, Out, Arm64Ty, X64Ty); + auto [Arm64Ty, X64Ty, ArgTranslation] = + canonicalizeThunkType(FT->getParamType(I), ParamAlign, + /*Ret*/ false, ArgSizeBytes, Out); Arm64ArgTypes.push_back(Arm64Ty); X64ArgTypes.push_back(X64Ty); + ArgTranslations.push_back(ArgTranslation); } } void AArch64Arm64ECCallLowering::getThunkRetType( FunctionType *FT, AttributeList AttrList, raw_ostream &Out, Type *&Arm64RetTy, Type *&X64RetTy, SmallVectorImpl<Type *> &Arm64ArgTypes, - SmallVectorImpl<Type *> &X64ArgTypes, bool &HasSretPtr) { + SmallVectorImpl<Type *> &X64ArgTypes, + SmallVector<ThunkArgTranslation> &ArgTranslations, bool &HasSretPtr) { Type *T = FT->getReturnType(); #if 0 // FIXME: Need more information about argument size; see @@ -209,35 +232,44 @@ void AArch64Arm64ECCallLowering::getThunkRetType( #endif if (T->isVoidTy()) { if (FT->getNumParams()) { - auto SRetAttr = AttrList.getParamAttr(0, Attribute::StructRet); - auto InRegAttr = AttrList.getParamAttr(0, Attribute::InReg); - if (SRetAttr.isValid() && InRegAttr.isValid()) { + Attribute SRetAttr0 = AttrList.getParamAttr(0, Attribute::StructRet); + Attribute InRegAttr0 = AttrList.getParamAttr(0, Attribute::InReg); + Attribute SRetAttr1, InRegAttr1; + 
if (FT->getNumParams() > 1) { + // Also check the second parameter (for class methods, the first + // parameter is "this", and the second parameter is the sret pointer.) + // It doesn't matter which one is sret. + SRetAttr1 = AttrList.getParamAttr(1, Attribute::StructRet); + InRegAttr1 = AttrList.getParamAttr(1, Attribute::InReg); + } + if ((SRetAttr0.isValid() && InRegAttr0.isValid()) || + (SRetAttr1.isValid() && InRegAttr1.isValid())) { // sret+inreg indicates a call that returns a C++ class value. This is // actually equivalent to just passing and returning a void* pointer - // as the first argument. Translate it that way, instead of trying - // to model "inreg" in the thunk's calling convention, to simplify - // the rest of the code. + // as the first or second argument. Translate it that way, instead of + // trying to model "inreg" in the thunk's calling convention; this + // simplifies the rest of the code, and matches MSVC mangling. Out << "i8"; Arm64RetTy = I64Ty; X64RetTy = I64Ty; return; } - if (SRetAttr.isValid()) { + if (SRetAttr0.isValid()) { // FIXME: Sanity-check the sret type; if it's an integer or pointer, // we'll get screwy mangling/codegen. // FIXME: For large struct types, mangle as an integer argument and // integer return, so we can reuse more thunks, instead of "m" syntax. // (MSVC mangles this case as an integer return with no argument, but // that's a miscompile.) 
- Type *SRetType = SRetAttr.getValueAsType(); + Type *SRetType = SRetAttr0.getValueAsType(); Align SRetAlign = AttrList.getParamAlignment(0).valueOrOne(); - Type *Arm64Ty, *X64Ty; canonicalizeThunkType(SRetType, SRetAlign, /*Ret*/ true, ArgSizeBytes, - Out, Arm64Ty, X64Ty); + Out); Arm64RetTy = VoidTy; X64RetTy = VoidTy; Arm64ArgTypes.push_back(FT->getParamType(0)); X64ArgTypes.push_back(FT->getParamType(0)); + ArgTranslations.push_back(ThunkArgTranslation::Direct); HasSretPtr = true; return; } @@ -249,8 +281,10 @@ void AArch64Arm64ECCallLowering::getThunkRetType( return; } - canonicalizeThunkType(T, Align(), /*Ret*/ true, ArgSizeBytes, Out, Arm64RetTy, - X64RetTy); + auto info = + canonicalizeThunkType(T, Align(), /*Ret*/ true, ArgSizeBytes, Out); + Arm64RetTy = info.Arm64Ty; + X64RetTy = info.X64Ty; if (X64RetTy->isPointerTy()) { // If the X64 type is canonicalized to a pointer, that means it's // passed/returned indirectly. For a return value, that means it's an @@ -260,21 +294,33 @@ void AArch64Arm64ECCallLowering::getThunkRetType( } } -void AArch64Arm64ECCallLowering::canonicalizeThunkType( - Type *T, Align Alignment, bool Ret, uint64_t ArgSizeBytes, raw_ostream &Out, - Type *&Arm64Ty, Type *&X64Ty) { +ThunkArgInfo AArch64Arm64ECCallLowering::canonicalizeThunkType( + Type *T, Align Alignment, bool Ret, uint64_t ArgSizeBytes, + raw_ostream &Out) { + + auto direct = [](Type *T) { + return ThunkArgInfo{T, T, ThunkArgTranslation::Direct}; + }; + + auto bitcast = [this](Type *Arm64Ty, uint64_t SizeInBytes) { + return ThunkArgInfo{Arm64Ty, + llvm::Type::getIntNTy(M->getContext(), SizeInBytes * 8), + ThunkArgTranslation::Bitcast}; + }; + + auto pointerIndirection = [this](Type *Arm64Ty) { + return ThunkArgInfo{Arm64Ty, PtrTy, + ThunkArgTranslation::PointerIndirection}; + }; + if (T->isFloatTy()) { Out << "f"; - Arm64Ty = T; - X64Ty = T; - return; + return direct(T); } if (T->isDoubleTy()) { Out << "d"; - Arm64Ty = T; - X64Ty = T; - return; + return direct(T); } if 
(T->isFloatingPointTy()) { @@ -297,16 +343,14 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType( Out << (ElementTy->isFloatTy() ? "F" : "D") << TotalSizeBytes; if (Alignment.value() >= 16 && !Ret) Out << "a" << Alignment.value(); - Arm64Ty = T; if (TotalSizeBytes <= 8) { // Arm64 returns small structs of float/double in float registers; // X64 uses RAX. - X64Ty = llvm::Type::getIntNTy(M->getContext(), TotalSizeBytes * 8); + return bitcast(T, TotalSizeBytes); } else { // Struct is passed directly on Arm64, but indirectly on X64. - X64Ty = PtrTy; + return pointerIndirection(T); } - return; } else if (T->isFloatingPointTy()) { report_fatal_error("Only 32 and 64 bit floating points are supported for " "ARM64EC thunks"); @@ -315,9 +359,7 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType( if ((T->isIntegerTy() || T->isPointerTy()) && DL.getTypeSizeInBits(T) <= 64) { Out << "i8"; - Arm64Ty = I64Ty; - X64Ty = I64Ty; - return; + return direct(I64Ty); } unsigned TypeSize = ArgSizeBytes; @@ -329,13 +371,12 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType( if (Alignment.value() >= 16 && !Ret) Out << "a" << Alignment.value(); // FIXME: Try to canonicalize Arm64Ty more thoroughly? - Arm64Ty = T; if (TypeSize == 1 || TypeSize == 2 || TypeSize == 4 || TypeSize == 8) { // Pass directly in an integer register - X64Ty = llvm::Type::getIntNTy(M->getContext(), TypeSize * 8); + return bitcast(T, TypeSize); } else { // Passed directly on Arm64, but indirectly on X64. 
- X64Ty = PtrTy; + return pointerIndirection(T); } } @@ -346,8 +387,9 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT, SmallString<256> ExitThunkName; llvm::raw_svector_ostream ExitThunkStream(ExitThunkName); FunctionType *Arm64Ty, *X64Ty; + SmallVector<ThunkArgTranslation> ArgTranslations; getThunkType(FT, Attrs, Arm64ECThunkType::Exit, ExitThunkStream, Arm64Ty, - X64Ty); + X64Ty, ArgTranslations); if (Function *F = M->getFunction(ExitThunkName)) return F; @@ -378,6 +420,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT, SmallVector<Value *> Args; // Pass the called function in x9. + auto X64TyOffset = 1; Args.push_back(F->arg_begin()); Type *RetTy = Arm64Ty->getReturnType(); @@ -387,10 +430,14 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT, // pointer. if (DL.getTypeStoreSize(RetTy) > 8) { Args.push_back(IRB.CreateAlloca(RetTy)); + X64TyOffset++; } } - for (auto &Arg : make_range(F->arg_begin() + 1, F->arg_end())) { + for (auto [Arg, X64ArgType, ArgTranslation] : llvm::zip_equal( + make_range(F->arg_begin() + 1, F->arg_end()), + make_range(X64Ty->param_begin() + X64TyOffset, X64Ty->param_end()), + ArgTranslations)) { // Translate arguments from AArch64 calling convention to x86 calling // convention. // @@ -405,18 +452,20 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT, // with an attribute.) // // The first argument is the called function, stored in x9. 
- if (Arg.getType()->isArrayTy() || Arg.getType()->isStructTy() || - DL.getTypeStoreSize(Arg.getType()) > 8) { + if (ArgTranslation != ThunkArgTranslation::Direct) { Value *Mem = IRB.CreateAlloca(Arg.getType()); IRB.CreateStore(&Arg, Mem); - if (DL.getTypeStoreSize(Arg.getType()) <= 8) { + if (ArgTranslation == ThunkArgTranslation::Bitcast) { Type *IntTy = IRB.getIntNTy(DL.getTypeStoreSizeInBits(Arg.getType())); Args.push_back(IRB.CreateLoad(IntTy, IRB.CreateBitCast(Mem, PtrTy))); - } else + } else { + assert(ArgTranslation == ThunkArgTranslation::PointerIndirection); Args.push_back(Mem); + } } else { Args.push_back(&Arg); } + assert(Args.back()->getType() == X64ArgType); } // FIXME: Transfer necessary attributes? sret? anything else? @@ -450,8 +499,10 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { SmallString<256> EntryThunkName; llvm::raw_svector_ostream EntryThunkStream(EntryThunkName); FunctionType *Arm64Ty, *X64Ty; + SmallVector<ThunkArgTranslation> ArgTranslations; getThunkType(F->getFunctionType(), F->getAttributes(), - Arm64ECThunkType::Entry, EntryThunkStream, Arm64Ty, X64Ty); + Arm64ECThunkType::Entry, EntryThunkStream, Arm64Ty, X64Ty, + ArgTranslations); if (Function *F = M->getFunction(EntryThunkName)) return F; @@ -463,7 +514,6 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { // Copy MSVC, and always set up a frame pointer. (Maybe this isn't necessary.) Thunk->addFnAttr("frame-pointer", "all"); - auto &DL = M->getDataLayout(); BasicBlock *BB = BasicBlock::Create(M->getContext(), "", Thunk); IRBuilder<> IRB(BB); @@ -472,24 +522,28 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { bool TransformDirectToSRet = X64RetType->isVoidTy() && !RetTy->isVoidTy(); unsigned ThunkArgOffset = TransformDirectToSRet ? 2 : 1; - unsigned PassthroughArgSize = F->isVarArg() ? 5 : Thunk->arg_size(); + unsigned PassthroughArgSize = + (F->isVarArg() ? 
5 : Thunk->arg_size()) - ThunkArgOffset; + assert(ArgTranslations.size() == F->isVarArg() ? 5 : PassthroughArgSize); // Translate arguments to call. SmallVector<Value *> Args; - for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) { - Value *Arg = Thunk->getArg(i); - Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset); - if (ArgTy->isArrayTy() || ArgTy->isStructTy() || - DL.getTypeStoreSize(ArgTy) > 8) { + for (unsigned i = 0; i != PassthroughArgSize; ++i) { + Value *Arg = Thunk->getArg(i + ThunkArgOffset); + Type *ArgTy = Arm64Ty->getParamType(i); + ThunkArgTranslation ArgTranslation = ArgTranslations[i]; + if (ArgTranslation != ThunkArgTranslation::Direct) { // Translate array/struct arguments to the expected type. - if (DL.getTypeStoreSize(ArgTy) <= 8) { + if (ArgTranslation == ThunkArgTranslation::Bitcast) { Value *CastAlloca = IRB.CreateAlloca(ArgTy); IRB.CreateStore(Arg, IRB.CreateBitCast(CastAlloca, PtrTy)); Arg = IRB.CreateLoad(ArgTy, CastAlloca); } else { + assert(ArgTranslation == ThunkArgTranslation::PointerIndirection); Arg = IRB.CreateLoad(ArgTy, IRB.CreateBitCast(Arg, PtrTy)); } } + assert(Arg->getType() == ArgTy); Args.push_back(Arg); } @@ -549,8 +603,10 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { llvm::raw_null_ostream NullThunkName; FunctionType *Arm64Ty, *X64Ty; + SmallVector<ThunkArgTranslation> ArgTranslations; getThunkType(F->getFunctionType(), F->getAttributes(), - Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty); + Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty, + ArgTranslations); auto MangledName = getArm64ECMangledFunctionName(F->getName().str()); assert(MangledName && "Can't guest exit to function that's already native"); std::string ThunkName = *MangledName; diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 
7da540f8ef8e..da11539eab34 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -90,6 +90,8 @@ public: return MCInstLowering.lowerOperand(MO, MCOp); } + const MCExpr *lowerConstantPtrAuth(const ConstantPtrAuth &CPA) override; + void emitStartOfAsmFile(Module &M) override; void emitJumpTableInfo() override; std::tuple<const MCSymbol *, uint64_t, const MCSymbol *, @@ -1575,6 +1577,52 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) { assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4); } +const MCExpr * +AArch64AsmPrinter::lowerConstantPtrAuth(const ConstantPtrAuth &CPA) { + MCContext &Ctx = OutContext; + + // Figure out the base symbol and the addend, if any. + APInt Offset(64, 0); + const Value *BaseGV = CPA.getPointer()->stripAndAccumulateConstantOffsets( + getDataLayout(), Offset, /*AllowNonInbounds=*/true); + + auto *BaseGVB = dyn_cast<GlobalValue>(BaseGV); + + // If we can't understand the referenced ConstantExpr, there's nothing + // else we can do: emit an error. + if (!BaseGVB) { + BaseGV->getContext().emitError( + "cannot resolve target base/addend of ptrauth constant"); + return nullptr; + } + + // If there is an addend, turn that into the appropriate MCExpr. + const MCExpr *Sym = MCSymbolRefExpr::create(getSymbol(BaseGVB), Ctx); + if (Offset.sgt(0)) + Sym = MCBinaryExpr::createAdd( + Sym, MCConstantExpr::create(Offset.getSExtValue(), Ctx), Ctx); + else if (Offset.slt(0)) + Sym = MCBinaryExpr::createSub( + Sym, MCConstantExpr::create((-Offset).getSExtValue(), Ctx), Ctx); + + uint64_t KeyID = CPA.getKey()->getZExtValue(); + // We later rely on valid KeyID value in AArch64PACKeyIDToString call from + // AArch64AuthMCExpr::printImpl, so fail fast. 
+ if (KeyID > AArch64PACKey::LAST) + report_fatal_error("AArch64 PAC Key ID '" + Twine(KeyID) + + "' out of range [0, " + + Twine((unsigned)AArch64PACKey::LAST) + "]"); + + uint64_t Disc = CPA.getDiscriminator()->getZExtValue(); + if (!isUInt<16>(Disc)) + report_fatal_error("AArch64 PAC Discriminator '" + Twine(Disc) + + "' out of range [0, 0xFFFF]"); + + // Finally build the complete @AUTH expr. + return AArch64AuthMCExpr::create(Sym, Disc, AArch64PACKey::ID(KeyID), + CPA.hasAddressDiscriminator(), Ctx); +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. #include "AArch64GenMCPseudoLowering.inc" diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 941990c53c4a..2f7e226fd09b 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -500,23 +500,31 @@ def CC_AArch64_Preserve_None : CallingConv<[ // - X8, used for sret // - X16/X17, used by the linker as IP0/IP1 // - X18, the platform register + // - X19, the base pointer // - X29, the frame pointer // - X30, the link register // General registers are not preserved with the exception of // FP, LR, and X18 // Non-volatile registers are used first, so functions may call // normal functions without saving and reloading arguments. - CCIfType<[i32], CCAssignToReg<[W19, W20, W21, W22, W23, + // X9 is assigned last as it is used in FrameLowering as the first + // choice for a scratch register. 
+ CCIfType<[i32], CCAssignToReg<[W20, W21, W22, W23, W24, W25, W26, W27, W28, W0, W1, W2, W3, W4, W5, - W6, W7, W9, W10, W11, - W12, W13, W14, W15]>>, - CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, + W6, W7, W10, W11, + W12, W13, W14, W9]>>, + CCIfType<[i64], CCAssignToReg<[X20, X21, X22, X23, X24, X25, X26, X27, X28, X0, X1, X2, X3, X4, X5, - X6, X7, X9, X10, X11, - X12, X13, X14, X15]>>, - + X6, X7, X10, X11, + X12, X13, X14, X9]>>, + + // Windows uses X15 for stack allocation + CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()", + CCIfType<[i32], CCAssignToReg<[W15]>>>, + CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()", + CCIfType<[i64], CCAssignToReg<[X15]>>>, CCDelegateTo<CC_AArch64_AAPCS> ]>; diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 3f244ba10102..154ae43b29d5 100644 --- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -50,7 +50,8 @@ struct LDTLSCleanup : public MachineFunctionPass { return false; } - MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *DT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); return VisitNode(DT->getRootNode(), 0); } @@ -138,7 +139,7 @@ struct LDTLSCleanup : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 2a4a3c0df08f..68243258a68f 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -126,7 +126,7 @@ char 
AArch64ConditionOptimizer::ID = 0; INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt", "AArch64 CondOpt Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt", "AArch64 CondOpt Pass", false, false) @@ -135,8 +135,8 @@ FunctionPass *llvm::createAArch64ConditionOptimizerPass() { } void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -332,7 +332,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { return false; TII = MF.getSubtarget().getInstrInfo(); - DomTree = &getAnalysis<MachineDominatorTree>(); + DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MRI = &MF.getRegInfo(); bool Changed = false; diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 8c16a88a13a4..9a788123b1ff 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -795,7 +795,7 @@ char AArch64ConditionalCompares::ID = 0; INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) @@ -806,8 +806,8 @@ FunctionPass *llvm::createAArch64ConditionalCompares() { void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { 
AU.addRequired<MachineBranchProbabilityInfo>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); AU.addRequired<MachineTraceMetrics>(); @@ -933,7 +933,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); SchedModel = MF.getSubtarget().getSchedModel(); MRI = &MF.getRegInfo(); - DomTree = &getAnalysis<MachineDominatorTree>(); + DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); Loops = &getAnalysis<MachineLoopInfo>(); MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td new file mode 100644 index 000000000000..7a40c83b2bb2 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64FMV.td @@ -0,0 +1,99 @@ +//=------ AArch64FMV.td - Describe AArch64 FMV Features ------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Function MultiVersioning (FMV) properties. FMV features are accepted by the +// attributes target_version and target_clones, and they correspond to a mapping +// from the FMV feature name to: +// - A bit in the FMV ABI, as defined by the ACLE. +// - The FMV priority, as defined by the ACLE. +// - A list of backend features. +// +// The list of backend features is not a set of dependencies; it is specific to +// LLVM and indicates how to do codegen when the FMV feature is present. 
+// +// Therefore FMVExtensions are separated from regular AArch64 Extensions, which +// encode dependencies between themselves and other SubtargetFeatures. +//===----------------------------------------------------------------------===// + + +// Something you can add to target_version or target_clones. +class FMVExtension<string n, string b, string f, int p> { + // Name, as spelled in target_version or target_clones. e.g. "memtag". + string Name = n; + + // A C++ expression giving the number of the bit in the FMV ABI. + // Currently this is given as a value from the enum "CPUFeatures". + string Bit = b; + + // SubtargetFeatures enabled for codegen when this FMV feature is present. + string BackendFeatures = f; + + // The FMV priority. + int Priority = p; +} + +def : FMVExtension<"aes", "FEAT_AES", "+fp-armv8,+neon", 150>; +def : FMVExtension<"bf16", "FEAT_BF16", "+bf16", 280>; +def : FMVExtension<"bti", "FEAT_BTI", "+bti", 510>; +def : FMVExtension<"crc", "FEAT_CRC", "+crc", 110>; +def : FMVExtension<"dgh", "FEAT_DGH", "", 260>; +def : FMVExtension<"dit", "FEAT_DIT", "+dit", 180>; +def : FMVExtension<"dotprod", "FEAT_DOTPROD", "+dotprod,+fp-armv8,+neon", 104>; +def : FMVExtension<"dpb", "FEAT_DPB", "+ccpp", 190>; +def : FMVExtension<"dpb2", "FEAT_DPB2", "+ccpp,+ccdp", 200>; +def : FMVExtension<"ebf16", "FEAT_EBF16", "+bf16", 290>; +def : FMVExtension<"f32mm", "FEAT_SVE_F32MM", "+sve,+f32mm,+fullfp16,+fp-armv8,+neon", 350>; +def : FMVExtension<"f64mm", "FEAT_SVE_F64MM", "+sve,+f64mm,+fullfp16,+fp-armv8,+neon", 360>; +def : FMVExtension<"fcma", "FEAT_FCMA", "+fp-armv8,+neon,+complxnum", 220>; +def : FMVExtension<"flagm", "FEAT_FLAGM", "+flagm", 20>; +def : FMVExtension<"flagm2", "FEAT_FLAGM2", "+flagm,+altnzcv", 30>; +def : FMVExtension<"fp", "FEAT_FP", "+fp-armv8,+neon", 90>; +def : FMVExtension<"fp16", "FEAT_FP16", "+fullfp16,+fp-armv8,+neon", 170>; +def : FMVExtension<"fp16fml", "FEAT_FP16FML", "+fp16fml,+fullfp16,+fp-armv8,+neon", 175>; +def : 
FMVExtension<"frintts", "FEAT_FRINTTS", "+fptoint", 250>; +def : FMVExtension<"i8mm", "FEAT_I8MM", "+i8mm", 270>; +def : FMVExtension<"jscvt", "FEAT_JSCVT", "+fp-armv8,+neon,+jsconv", 210>; +def : FMVExtension<"ls64", "FEAT_LS64", "", 520>; +def : FMVExtension<"ls64_accdata", "FEAT_LS64_ACCDATA", "+ls64", 540>; +def : FMVExtension<"ls64_v", "FEAT_LS64_V", "", 530>; +def : FMVExtension<"lse", "FEAT_LSE", "+lse", 80>; +def : FMVExtension<"memtag", "FEAT_MEMTAG", "", 440>; +def : FMVExtension<"memtag2", "FEAT_MEMTAG2", "+mte", 450>; +def : FMVExtension<"memtag3", "FEAT_MEMTAG3", "+mte", 460>; +def : FMVExtension<"mops", "FEAT_MOPS", "+mops", 650>; +def : FMVExtension<"pmull", "FEAT_PMULL", "+aes,+fp-armv8,+neon", 160>; +def : FMVExtension<"predres", "FEAT_PREDRES", "+predres", 480>; +def : FMVExtension<"rcpc", "FEAT_RCPC", "+rcpc", 230>; +def : FMVExtension<"rcpc2", "FEAT_RCPC2", "+rcpc", 240>; +def : FMVExtension<"rcpc3", "FEAT_RCPC3", "+rcpc,+rcpc3", 241>; +def : FMVExtension<"rdm", "FEAT_RDM", "+rdm,+fp-armv8,+neon", 108>; +def : FMVExtension<"rng", "FEAT_RNG", "+rand", 10>; +def : FMVExtension<"rpres", "FEAT_RPRES", "", 300>; +def : FMVExtension<"sb", "FEAT_SB", "+sb", 470>; +def : FMVExtension<"sha1", "FEAT_SHA1", "+fp-armv8,+neon", 120>; +def : FMVExtension<"sha2", "FEAT_SHA2", "+sha2,+fp-armv8,+neon", 130>; +def : FMVExtension<"sha3", "FEAT_SHA3", "+sha3,+sha2,+fp-armv8,+neon", 140>; +def : FMVExtension<"simd", "FEAT_SIMD", "+fp-armv8,+neon", 100>; +def : FMVExtension<"sm4", "FEAT_SM4", "+sm4,+fp-armv8,+neon", 106>; +def : FMVExtension<"sme", "FEAT_SME", "+sme,+bf16", 430>; +def : FMVExtension<"sme-f64f64", "FEAT_SME_F64", "+sme,+sme-f64f64,+bf16", 560>; +def : FMVExtension<"sme-i16i64", "FEAT_SME_I64", "+sme,+sme-i16i64,+bf16", 570>; +def : FMVExtension<"sme2", "FEAT_SME2", "+sme2,+sme,+bf16", 580>; +def : FMVExtension<"ssbs", "FEAT_SSBS", "", 490>; +def : FMVExtension<"ssbs2", "FEAT_SSBS2", "+ssbs", 500>; +def : FMVExtension<"sve", "FEAT_SVE", 
"+sve,+fullfp16,+fp-armv8,+neon", 310>; +def : FMVExtension<"sve-bf16", "FEAT_SVE_BF16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320>; +def : FMVExtension<"sve-ebf16", "FEAT_SVE_EBF16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 330>; +def : FMVExtension<"sve-i8mm", "FEAT_SVE_I8MM", "+sve,+i8mm,+fullfp16,+fp-armv8,+neon", 340>; +def : FMVExtension<"sve2", "FEAT_SVE2", "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370>; +def : FMVExtension<"sve2-aes", "FEAT_SVE_AES", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380>; +def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400>; +def : FMVExtension<"sve2-pmull128", "FEAT_SVE_PMULL128", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 390>; +def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410>; +def : FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420>; +def : FMVExtension<"wfxt", "FEAT_WFXT", "+wfxt", 550>; diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index ffb899a30145..8c1003c085e6 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -11,24 +11,11 @@ // A SubtargetFeature that can be toggled from the command line, and therefore // has an AEK_* entry in ArmExtKind. -// -// If Function MultiVersioning (FMV) properties are left at their defaults -// (FEAT_INIT, no dependencies, priority 0) it indiates that this extension is -// not an FMV feature, but can be enabled via the command line (-march, -mcpu, -// etc). -// -// Conversely if the ArchExtKindSpelling is set to AEK_NONE, this indicates -// that a feature is FMV-only, and can not be selected on the command line. -// Such extensions should be added via FMVOnlyExtension. class Extension< string TargetFeatureName, // String used for -target-feature and -march, unless overridden. 
string Spelling, // The XYZ in HasXYZ and AEK_XYZ. string Desc, // Description. - list<SubtargetFeature> Implies = [], // List of dependent features. - // FMV properties - string _FMVBit = "FEAT_INIT", // FEAT_INIT is repurposed to indicate "not an FMV feature" - string _FMVDependencies = "", - int _FMVPriority = 0 + list<SubtargetFeature> Implies = [] // List of dependent features. > : SubtargetFeature<TargetFeatureName, "Has" # Spelling, "true", Desc, Implies> { string ArchExtKindSpelling = "AEK_" # Spelling; // ArchExtKind enum name. @@ -42,57 +29,9 @@ class Extension< // An alias that can be used on the command line, if the extension has one. // Used for correcting historical names while remaining backwards compatible. string MArchAlias = ""; - - // Function MultiVersioning (FMV) properties - - // A C++ expression giving the number of the bit in the FMV ABI. - // Currently this is given as a value from the enum "CPUFeatures". - // If this is not set, it indicates that this is not an FMV extension. - string FMVBit = _FMVBit; - - // List of features that this feature depends on. - // FIXME generate this from Implies. - string FMVDependencies = _FMVDependencies; - - // The FMV priority - int FMVPriority = _FMVPriority; } -// Some extensions are available for FMV but can not be controlled via the -// command line. These entries: -// - are SubtargetFeatures, so they have (unused) FieldNames on the subtarget -// e.g. HasFMVOnlyFEAT_XYZ -// - have incorrect (empty) Implies fields, because the code that handles FMV -// ignores these dependencies and looks only at FMVDependencies. -// - have no description. -// -// In the generated data structures for extensions (ExtensionInfo), AEK_NONE is -// used to indicate that a feature is FMV only. Therefore ArchExtKindSpelling is -// manually overridden here. 
-class FMVOnlyExtension<string FMVBit, string Name, string Deps, int Priority> - : Extension<Name, "FMVOnly"#FMVBit, "", [], FMVBit, Deps, Priority> { - let ArchExtKindSpelling = "AEK_NONE"; // AEK_NONE indicates FMV-only feature -} -def : FMVOnlyExtension<"FEAT_DGH", "dgh", "", 260>; -def : FMVOnlyExtension<"FEAT_DPB", "dpb", "+ccpp", 190>; -def : FMVOnlyExtension<"FEAT_DPB2", "dpb2", "+ccpp,+ccdp", 200>; -def : FMVOnlyExtension<"FEAT_EBF16", "ebf16", "+bf16", 290>; -def : FMVOnlyExtension<"FEAT_FLAGM2", "flagm2", "+flagm,+altnzcv", 30>; -def : FMVOnlyExtension<"FEAT_FRINTTS", "frintts", "+fptoint", 250>; -def : FMVOnlyExtension<"FEAT_LS64_ACCDATA", "ls64_accdata", "+ls64", 540>; -def : FMVOnlyExtension<"FEAT_LS64_V", "ls64_v", "", 530>; -def : FMVOnlyExtension<"FEAT_MEMTAG2", "memtag2", "+mte", 450>; -def : FMVOnlyExtension<"FEAT_MEMTAG3", "memtag3", "+mte", 460>; -def : FMVOnlyExtension<"FEAT_PMULL", "pmull", "+aes,+fp-armv8,+neon", 160>; -def : FMVOnlyExtension<"FEAT_RCPC2", "rcpc2", "+rcpc", 240>; -def : FMVOnlyExtension<"FEAT_RPRES", "rpres", "", 300>; -def : FMVOnlyExtension<"FEAT_SHA1", "sha1", "+fp-armv8,+neon", 120>; -def : FMVOnlyExtension<"FEAT_SSBS2", "ssbs2", "+ssbs", 500>; -def : FMVOnlyExtension<"FEAT_SVE_BF16", "sve-bf16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320>; -def : FMVOnlyExtension<"FEAT_SVE_EBF16", "sve-ebf16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 330>; -def : FMVOnlyExtension<"FEAT_SVE_I8MM", "sve-i8mm", "+sve,+i8mm,+fullfp16,+fp-armv8,+neon", 340>; -def : FMVOnlyExtension<"FEAT_SVE_PMULL128", "sve2-pmull128", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 390>; // Each SubtargetFeature which corresponds to an Arm Architecture feature should @@ -101,35 +40,26 @@ def : FMVOnlyExtension<"FEAT_SVE_PMULL128", "sve2-pmull128", "+sve2,+sve,+sve2-a // Arm Architecture Features, it should list all the relevant features. Not all // FEAT_ features have a corresponding SubtargetFeature. 
+ +//===----------------------------------------------------------------------===// +// Armv8.0 Architecture Extensions +//===----------------------------------------------------------------------===// + let ArchExtKindSpelling = "AEK_FP", MArchName = "fp" in def FeatureFPARMv8 : Extension<"fp-armv8", "FPARMv8", - "Enable ARMv8 (FEAT_FP)", [], - "FEAT_FP", "+fp-armv8,+neon", 90>; + "Enable ARMv8 (FEAT_FP)">; let ArchExtKindSpelling = "AEK_SIMD", MArchName = "simd" in def FeatureNEON : Extension<"neon", "NEON", - "Enable Advanced SIMD instructions (FEAT_AdvSIMD)", [FeatureFPARMv8], - "FEAT_SIMD", "+fp-armv8,+neon", 100>; - -def FeatureSM4 : Extension< - "sm4", "SM4", - "Enable SM3 and SM4 support (FEAT_SM4, FEAT_SM3)", [FeatureNEON], - "FEAT_SM4", "+sm4,+fp-armv8,+neon", 106>; + "Enable Advanced SIMD instructions (FEAT_AdvSIMD)", [FeatureFPARMv8]>; def FeatureSHA2 : Extension< "sha2", "SHA2", - "Enable SHA1 and SHA256 support (FEAT_SHA1, FEAT_SHA256)", [FeatureNEON], - "FEAT_SHA2", "+sha2,+fp-armv8,+neon", 130>; - -def FeatureSHA3 : Extension< - "sha3", "SHA3", - "Enable SHA512 and SHA3 support (FEAT_SHA3, FEAT_SHA512)", [FeatureNEON, FeatureSHA2], - "FEAT_SHA3", "+sha3,+sha2,+fp-armv8,+neon", 140>; + "Enable SHA1 and SHA256 support (FEAT_SHA1, FEAT_SHA256)", [FeatureNEON]>; def FeatureAES : Extension< "aes", "AES", - "Enable AES support (FEAT_AES, FEAT_PMULL)", [FeatureNEON], - "FEAT_AES", "+fp-armv8,+neon", 150>; + "Enable AES support (FEAT_AES, FEAT_PMULL)", [FeatureNEON]>; // Crypto has been split up and any combination is now valid (see the // crypto definitions above). Also, crypto is now context sensitive: @@ -139,39 +69,33 @@ def FeatureAES : Extension< // meaning anymore. We kept the Crypto definition here for backward // compatibility, and now imply features SHA2 and AES, which was the // "traditional" meaning of Crypto. 
-let FMVDependencies = "+aes,+sha2" in def FeatureCrypto : Extension<"crypto", "Crypto", "Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>; def FeatureCRC : Extension<"crc", "CRC", - "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)", [], - "FEAT_CRC", "+crc", 110>; + "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)">; -def FeatureRAS : Extension<"ras", "RAS", - "Enable ARMv8 Reliability, Availability and Serviceability Extensions (FEAT_RAS, FEAT_RASv1p1)">; - -def FeatureRASv2 : Extension<"rasv2", "RASv2", - "Enable ARMv8.9-A Reliability, Availability and Serviceability Extensions (FEAT_RASv2)", - [FeatureRAS]>; - -def FeatureLSE : Extension<"lse", "LSE", - "Enable ARMv8.1 Large System Extension (LSE) atomic instructions (FEAT_LSE)", [], - "FEAT_LSE", "+lse", 80>; +// This SubtargetFeature is special. It controls only whether codegen will turn +// `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The +// `FEAT_PMUv3*` system registers are always available for assembly/disassembly. 
+let MArchName = "pmuv3" in +def FeaturePerfMon : Extension<"perfmon", "PerfMon", + "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension (FEAT_PMUv3)">; -def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true", - "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules (FEAT_LSE2)">; +def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict", + "true", "Enable architectural speculation restriction (FEAT_CSV2_2)">; -def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", - "Enable out of line atomics to support LSE instructions">; +//===----------------------------------------------------------------------===// +// Armv8.1 Architecture Extensions +//===----------------------------------------------------------------------===// -def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true", - "Enable Function Multi Versioning support.">; +def FeatureLSE : Extension<"lse", "LSE", + "Enable ARMv8.1 Large System Extension (LSE) atomic instructions (FEAT_LSE)">; let MArchAlias = "rdma" in def FeatureRDM : Extension<"rdm", "RDM", "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions (FEAT_RDM)", - [FeatureNEON], - "FEAT_RDM", "+rdm,+fp-armv8,+neon", 108>; + [FeatureNEON]>; def FeaturePAN : SubtargetFeature< "pan", "HasPAN", "true", @@ -187,21 +111,24 @@ def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", "HasCONTEXTIDREL2", def FeatureVH : SubtargetFeature<"vh", "HasVH", "true", "Enables ARM v8.1 Virtual Host extension (FEAT_VHE)", [FeatureCONTEXTIDREL2] >; -// This SubtargetFeature is special. It controls only whether codegen will turn -// `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The -// `FEAT_PMUv3*` system registers are always available for assembly/disassembly. 
-let MArchName = "pmuv3" in -def FeaturePerfMon : Extension<"perfmon", "PerfMon", - "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension (FEAT_PMUv3)">; +//===----------------------------------------------------------------------===// +// Armv8.2 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureSM4 : Extension< + "sm4", "SM4", + "Enable SM3 and SM4 support (FEAT_SM4, FEAT_SM3)", [FeatureNEON]>; + +def FeatureSHA3 : Extension< + "sha3", "SHA3", + "Enable SHA512 and SHA3 support (FEAT_SHA3, FEAT_SHA512)", [FeatureNEON, FeatureSHA2]>; + +def FeatureRAS : Extension<"ras", "RAS", + "Enable ARMv8 Reliability, Availability and Serviceability Extensions (FEAT_RAS, FEAT_RASv1p1)">; let ArchExtKindSpelling = "AEK_FP16", MArchName = "fp16" in def FeatureFullFP16 : Extension<"fullfp16", "FullFP16", - "Full FP16 (FEAT_FP16)", [FeatureFPARMv8], - "FEAT_FP16", "+fullfp16,+fp-armv8,+neon", 170>; - -def FeatureFP16FML : Extension<"fp16fml", "FP16FML", - "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16], - "FEAT_FP16FML", "+fp16fml,+fullfp16,+fp-armv8,+neon", 175>; + "Full FP16 (FEAT_FP16)", [FeatureFPARMv8]>; let ArchExtKindSpelling = "AEK_PROFILE", MArchName = "profile" in def FeatureSPE : Extension<"spe", "SPE", @@ -212,7 +139,6 @@ def FeaturePAN_RWV : SubtargetFeature< "Enable v8.2 PAN s1e1R and s1e1W Variants (FEAT_PAN2)", [FeaturePAN]>; -// UAO PState def FeaturePsUAO : SubtargetFeature< "uaops", "HasPsUAO", "true", "Enable v8.2 UAO PState (FEAT_UAO)">; @@ -220,66 +146,400 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP", "true", "Enable v8.2 data Cache Clean to Point of Persistence (FEAT_DPB)" >; def FeatureSVE : Extension<"sve", "SVE", - "Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16], - "FEAT_SVE", "+sve,+fullfp16,+fp-armv8,+neon", 310>; + "Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16]>; -// This 
flag is currently still labeled as Experimental, but when fully -// implemented this should tell the compiler to use the zeroing pseudos to -// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive -// lanes are known to be zero. The pseudos will then be expanded using the -// MOVPRFX instruction to zero the inactive lanes. This feature should only be -// enabled if MOVPRFX instructions are known to merge with the destructive -// operations they prefix. -// -// This feature could similarly be extended to support cheap merging of _any_ -// value into the inactive lanes using the MOVPRFX instruction that uses -// merging-predication. -def FeatureExperimentalZeroingPseudos - : SubtargetFeature<"use-experimental-zeroing-pseudos", - "UseExperimentalZeroingPseudos", "true", - "Hint to the compiler that the MOVPRFX instruction is " - "merged with destructive operations", - []>; +let ArchExtKindSpelling = "AEK_I8MM" in +def FeatureMatMulInt8 : Extension<"i8mm", "MatMulInt8", + "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)", []>; -def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", - "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">; +let ArchExtKindSpelling = "AEK_F32MM" in +def FeatureMatMulFP32 : Extension<"f32mm", "MatMulFP32", + "Enable Matrix Multiply FP32 Extension (FEAT_F32MM)", [FeatureSVE]>; + +let ArchExtKindSpelling = "AEK_F64MM" in +def FeatureMatMulFP64 : Extension<"f64mm", "MatMulFP64", + "Enable Matrix Multiply FP64 Extension (FEAT_F64MM)", [FeatureSVE]>; + +//===----------------------------------------------------------------------===// +// Armv8.3 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureRCPC : Extension<"rcpc", "RCPC", + "Enable support for RCPC extension (FEAT_LRCPC)", []>; + +def FeaturePAuth : Extension< + "pauth", "PAuth", + "Enable v8.3-A Pointer Authentication extension (FEAT_PAuth)">; + +let ArchExtKindSpelling = "AEK_JSCVT", 
MArchName = "jscvt" in +def FeatureJS : Extension< + "jsconv", "JS", + "Enable v8.3-A JavaScript FP conversion instructions (FEAT_JSCVT)", + [FeatureFPARMv8]>; + +def FeatureCCIDX : SubtargetFeature< + "ccidx", "HasCCIDX", "true", + "Enable v8.3-A Extend of the CCSIDR number of sets (FEAT_CCIDX)">; + +let ArchExtKindSpelling = "AEK_FCMA", MArchName = "fcma" in +def FeatureComplxNum : Extension< + "complxnum", "ComplxNum", + "Enable v8.3-A Floating-point complex number support (FEAT_FCMA)", + [FeatureNEON]>; + +def FeatureNV : SubtargetFeature< + "nv", "HasNV", "true", + "Enable v8.4-A Nested Virtualization Enchancement (FEAT_NV, FEAT_NV2)">; + +//===----------------------------------------------------------------------===// +// Armv8.4 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true", + "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules (FEAT_LSE2)">; + +def FeatureFP16FML : Extension<"fp16fml", "FP16FML", + "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16]>; + +def FeatureDotProd : Extension< + "dotprod", "DotProd", + "Enable dot product support (FEAT_DotProd)", [FeatureNEON]>; + +def FeatureMPAM : SubtargetFeature< + "mpam", "HasMPAM", "true", + "Enable v8.4-A Memory system Partitioning and Monitoring extension (FEAT_MPAM)">; + +def FeatureDIT : SubtargetFeature< + "dit", "HasDIT", "true", + "Enable v8.4-A Data Independent Timing instructions (FEAT_DIT)">; + +def FeatureTRACEV8_4 : SubtargetFeature< + "tracev8.4", "HasTRACEV8_4", "true", + "Enable v8.4-A Trace extension (FEAT_TRF)">; + +def FeatureAM : SubtargetFeature< + "am", "HasAM", "true", + "Enable v8.4-A Activity Monitors extension (FEAT_AMUv1)">; + +def FeatureSEL2 : SubtargetFeature< + "sel2", "HasSEL2", "true", + "Enable v8.4-A Secure Exception Level 2 extension (FEAT_SEL2)">; + +def FeatureTLB_RMI : SubtargetFeature< + "tlb-rmi", "HasTLB_RMI", "true", + 
"Enable v8.4-A TLB Range and Maintenance Instructions (FEAT_TLBIOS, FEAT_TLBIRANGE)">; + +def FeatureFlagM : Extension< + "flagm", "FlagM", + "Enable v8.4-A Flag Manipulation Instructions (FEAT_FlagM)", []>; + +def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true", + "Enable v8.4-A RCPC instructions with Immediate Offsets (FEAT_LRCPC2)", + [FeatureRCPC]>; + +//===----------------------------------------------------------------------===// +// Armv8.5 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureAltFPCmp : SubtargetFeature<"altnzcv", "HasAlternativeNZCV", "true", + "Enable alternative NZCV format for floating point comparisons (FEAT_FlagM2)">; + +def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true", + "Enable FRInt[32|64][Z|X] instructions that round a floating-point number to " + "an integer (in FP format) forcing it to fit into a 32- or 64-bit int (FEAT_FRINTTS)" >; + +def FeatureSB : Extension<"sb", "SB", + "Enable v8.5 Speculation Barrier (FEAT_SB)", []>; + +def FeatureSSBS : Extension<"ssbs", "SSBS", + "Enable Speculative Store Bypass Safe bit (FEAT_SSBS, FEAT_SSBS2)", []>; + +def FeaturePredRes : Extension<"predres", "PredRes", + "Enable v8.5a execution and data prediction invalidation instructions (FEAT_SPECRES)", []>; + +def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "CCDP", "true", + "Enable v8.5 Cache Clean to Point of Deep Persistence (FEAT_DPB2)" >; + +def FeatureBranchTargetId : SubtargetFeature<"bti", "BTI", "true", + "Enable Branch Target Identification (FEAT_BTI)">; + +let ArchExtKindSpelling = "AEK_RAND", MArchName = "rng" in +def FeatureRandGen : Extension<"rand", "RandGen", + "Enable Random Number generation instructions (FEAT_RNG)", []>; + +// NOTE: "memtag" means FEAT_MTE + FEAT_MTE2 for -march or +// __attribute((target(...))), but only FEAT_MTE for FMV. 
+let MArchName = "memtag" in +def FeatureMTE : Extension<"mte", "MTE", + "Enable Memory Tagging Extension (FEAT_MTE, FEAT_MTE2)", []>; + +//===----------------------------------------------------------------------===// +// Armv8.6 Architecture Extensions +//===----------------------------------------------------------------------===// def FeatureBF16 : Extension<"bf16", "BF16", - "Enable BFloat16 Extension (FEAT_BF16)", [], - "FEAT_BF16", "+bf16", 280>; + "Enable BFloat16 Extension (FEAT_BF16)">; -def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r", - "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">; +def FeatureAMVS : SubtargetFeature< + "amvs", "HasAMVS", "true", + "Enable v8.6-A Activity Monitors Virtualization support (FEAT_AMUv1p1)", + [FeatureAM]>; + +def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", + "true", "Enable fine grained virtualization traps extension (FEAT_FGT)">; + +def FeatureEnhancedCounterVirtualization : + SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization", + "true", "Enable enhanced counter virtualization extension (FEAT_ECV)">; + +//===----------------------------------------------------------------------===// +// Armv8.7 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureXS : SubtargetFeature<"xs", "HasXS", + "true", "Enable Armv8.7-A limited-TLB-maintenance instruction (FEAT_XS)">; + +def FeatureWFxT : SubtargetFeature<"wfxt", "WFxT", "true", + "Enable Armv8.7-A WFET and WFIT instruction (FEAT_WFxT)">; + +def FeatureHCX : SubtargetFeature< + "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register (FEAT_HCX)">; + +def FeatureLS64 : Extension<"ls64", "LS64", + "Enable Armv8.7-A LD64B/ST64B Accelerator Extension (FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA)", []>; + +def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF", + "true", "Enable extra register in the Statistical Profiling Extension 
(FEAT_SPEv1p2)">; + +//===----------------------------------------------------------------------===// +// Armv8.8 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureHBC : Extension<"hbc", "HBC", + "Enable Armv8.8-A Hinted Conditional Branches Extension (FEAT_HBC)">; + +def FeatureMOPS : Extension<"mops", "MOPS", + "Enable Armv8.8-A memcpy and memset acceleration instructions (FEAT_MOPS)", []>; + +def FeatureNMI : SubtargetFeature<"nmi", "HasNMI", + "true", "Enable Armv8.8-A Non-maskable Interrupts (FEAT_NMI, FEAT_GICv3_NMI)">; + +//===----------------------------------------------------------------------===// +// Armv8.9 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureRASv2 : Extension<"rasv2", "RASv2", + "Enable ARMv8.9-A Reliability, Availability and Serviceability Extensions (FEAT_RASv2)", + [FeatureRAS]>; + +def FeatureCSSC : Extension<"cssc", "CSSC", + "Enable Common Short Sequence Compression (CSSC) instructions (FEAT_CSSC)">; + +def FeatureCLRBHB : SubtargetFeature<"clrbhb", "HasCLRBHB", + "true", "Enable Clear BHB instruction (FEAT_CLRBHB)">; + +def FeaturePRFM_SLC : SubtargetFeature<"prfm-slc-target", "HasPRFM_SLC", + "true", "Enable SLC target for PRFM instruction">; + +let MArchName = "predres2" in +def FeatureSPECRES2 : Extension<"specres2", "SPECRES2", + "Enable Speculation Restriction Instruction (FEAT_SPECRES2)", + [FeaturePredRes]>; + +def FeatureRCPC3 : Extension<"rcpc3", "RCPC3", + "Enable Armv8.9-A RCPC instructions for A64 and Advanced SIMD and floating-point instruction set (FEAT_LRCPC3)", + [FeatureRCPC_IMMO]>; + +def FeatureTHE : Extension<"the", "THE", + "Enable Armv8.9-A Translation Hardening Extension (FEAT_THE)">; + +//===----------------------------------------------------------------------===// +// Armv9.0 Architecture Extensions 
+//===----------------------------------------------------------------------===// + +def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", + "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">; def FeatureSVE2 : Extension<"sve2", "SVE2", "Enable Scalable Vector Extension 2 (SVE2) instructions (FEAT_SVE2)", - [FeatureSVE, FeatureUseScalarIncVL], - "FEAT_SVE2", "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370>; + [FeatureSVE, FeatureUseScalarIncVL]>; def FeatureSVE2AES : Extension<"sve2-aes", "SVE2AES", "Enable AES SVE2 instructions (FEAT_SVE_AES, FEAT_SVE_PMULL128)", - [FeatureSVE2, FeatureAES], - "FEAT_SVE_AES", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380>; + [FeatureSVE2, FeatureAES]>; def FeatureSVE2SM4 : Extension<"sve2-sm4", "SVE2SM4", - "Enable SM4 SVE2 instructions (FEAT_SVE_SM4)", [FeatureSVE2, FeatureSM4], - "FEAT_SVE_SM4", "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420>; + "Enable SM4 SVE2 instructions (FEAT_SVE_SM4)", [FeatureSVE2, FeatureSM4]>; def FeatureSVE2SHA3 : Extension<"sve2-sha3", "SVE2SHA3", - "Enable SHA3 SVE2 instructions (FEAT_SVE_SHA3)", [FeatureSVE2, FeatureSHA3], - "FEAT_SVE_SHA3", "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410>; + "Enable SHA3 SVE2 instructions (FEAT_SVE_SHA3)", [FeatureSVE2, FeatureSHA3]>; def FeatureSVE2BitPerm : Extension<"sve2-bitperm", "SVE2BitPerm", - "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2], - "FEAT_SVE_BITPERM", "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400>; + "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2]>; + +def FeatureTRBE : SubtargetFeature<"trbe", "TRBE", "true", + "Enable Trace Buffer Extension (FEAT_TRBE)">; + +def FeatureETE : SubtargetFeature<"ete", "ETE", "true", + "Enable Embedded Trace Extension (FEAT_ETE)", + [FeatureTRBE]>; + +def FeatureTME : Extension<"tme", "TME", + "Enable Transactional Memory Extension (FEAT_TME)" >; + 
+//===----------------------------------------------------------------------===// +// Armv9.1 Architecture Extensions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Armv9.2 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureBRBE : Extension<"brbe", "BRBE", + "Enable Branch Record Buffer Extension (FEAT_BRBE)">; + +def FeatureRME : SubtargetFeature<"rme", "HasRME", + "true", "Enable Realm Management Extension (FEAT_RME)">; + +def FeatureSME : Extension<"sme", "SME", + "Enable Scalable Matrix Extension (SME) (FEAT_SME)", [FeatureBF16, FeatureUseScalarIncVL]>; + +def FeatureSMEF64F64 : Extension<"sme-f64f64", "SMEF64F64", + "Enable Scalable Matrix Extension (SME) F64F64 instructions (FEAT_SME_F64F64)", [FeatureSME]>; + +def FeatureSMEI16I64 : Extension<"sme-i16i64", "SMEI16I64", + "Enable Scalable Matrix Extension (SME) I16I64 instructions (FEAT_SME_I16I64)", [FeatureSME]>; + +def FeatureSMEFA64 : Extension<"sme-fa64", "SMEFA64", + "Enable the full A64 instruction set in streaming SVE mode (FEAT_SME_FA64)", [FeatureSME, FeatureSVE2]>; + +//===----------------------------------------------------------------------===// +// Armv9.3 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureSME2 : Extension<"sme2", "SME2", + "Enable Scalable Matrix Extension 2 (SME2) instructions", [FeatureSME]>; + +def FeatureMEC : SubtargetFeature<"mec", "HasMEC", + "true", "Enable Memory Encryption Contexts Extension", [FeatureRME]>; + +//===----------------------------------------------------------------------===// +// Armv9.4 Architecture Extensions +//===----------------------------------------------------------------------===// -let FMVDependencies = "+sve2p1,+sve2,+sve,+fullfp16,+fp-armv8,+neon" in def FeatureSVE2p1: 
Extension<"sve2p1", "SVE2p1", "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>; def FeatureB16B16 : Extension<"b16b16", "B16B16", "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions (FEAT_B16B16)", [FeatureBF16]>; +def FeatureSMEF16F16 : Extension<"sme-f16f16", "SMEF16F16", + "Enable SME non-widening Float16 instructions (FEAT_SME_F16F16)", [FeatureSME2]>; + +def FeatureSME2p1 : Extension<"sme2p1", "SME2p1", + "Enable Scalable Matrix Extension 2.1 (FEAT_SME2p1) instructions", [FeatureSME2]>; + +def FeatureCHK : SubtargetFeature<"chk", "HasCHK", + "true", "Enable Armv8.0-A Check Feature Status Extension (FEAT_CHK)">; + +def FeatureGCS : Extension<"gcs", "GCS", + "Enable Armv9.4-A Guarded Call Stack Extension", [FeatureCHK]>; + +def FeatureITE : Extension<"ite", "ITE", + "Enable Armv9.4-A Instrumentation Extension FEAT_ITE", [FeatureETE, + FeatureTRBE]>; + +def FeatureLSE128 : Extension<"lse128", "LSE128", + "Enable Armv9.4-A 128-bit Atomic Instructions (FEAT_LSE128)", + [FeatureLSE]>; + +// FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, and FEAT_SYSINSTR128 are mutually implicit. 
+// Therefore group them all under a single feature flag, d128: +def FeatureD128 : Extension<"d128", "D128", + "Enable Armv9.4-A 128-bit Page Table Descriptors, System Registers " + "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)", + [FeatureLSE128]>; + +//===----------------------------------------------------------------------===// +// Armv9.5 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureFAMINMAX: Extension<"faminmax", "FAMINMAX", + "Enable FAMIN and FAMAX instructions (FEAT_FAMINMAX)">; + +def FeatureLUT: Extension<"lut", "LUT", + "Enable Lookup Table instructions (FEAT_LUT)">; + +def FeatureFP8 : Extension<"fp8", "FP8", + "Enable FP8 instructions (FEAT_FP8)", [FeatureFAMINMAX, FeatureLUT, FeatureBF16]>; + +def FeatureFP8FMA : Extension<"fp8fma", "FP8FMA", + "Enable fp8 multiply-add instructions (FEAT_FP8FMA)", [FeatureFP8]>; + +def FeatureSSVE_FP8FMA : Extension<"ssve-fp8fma", "SSVE_FP8FMA", + "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2, FeatureFP8]>; + +def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4", + "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)", [FeatureFP8FMA]>; + +def FeatureFP8DOT2: Extension<"fp8dot2", "FP8DOT2", + "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)", [FeatureFP8DOT4]>; + +def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4", + "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSSVE_FP8FMA]>; + +def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2", + "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSSVE_FP8DOT4]>; + +def FeatureSME_LUTv2 : Extension<"sme-lutv2", "SME_LUTv2", + "Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">; + +def FeatureSMEF8F32 : Extension<"sme-f8f32", "SMEF8F32", + "Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, 
FeatureFP8]>; + +def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16", + "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSMEF8F32]>; + +def FeatureCPA : Extension<"cpa", "CPA", + "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">; + +def FeaturePAuthLR : Extension<"pauth-lr", "PAuthLR", + "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">; + +def FeatureTLBIW : Extension<"tlbiw", "TLBIW", + "Enable ARMv9.5-A TLBI VMALL for Dirty State (FEAT_TLBIW)">; + +//===----------------------------------------------------------------------===// +// Other Features +//===----------------------------------------------------------------------===// + +def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", + "Enable out of line atomics to support LSE instructions">; + +def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true", + "Enable Function Multi Versioning support.">; + +// This flag is currently still labeled as Experimental, but when fully +// implemented this should tell the compiler to use the zeroing pseudos to +// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive +// lanes are known to be zero. The pseudos will then be expanded using the +// MOVPRFX instruction to zero the inactive lanes. This feature should only be +// enabled if MOVPRFX instructions are known to merge with the destructive +// operations they prefix. +// +// This feature could similarly be extended to support cheap merging of _any_ +// value into the inactive lanes using the MOVPRFX instruction that uses +// merging-predication. 
+def FeatureExperimentalZeroingPseudos + : SubtargetFeature<"use-experimental-zeroing-pseudos", + "UseExperimentalZeroingPseudos", "true", + "Hint to the compiler that the MOVPRFX instruction is " + "merged with destructive operations", + []>; + +def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r", + "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">; + def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -409,85 +669,10 @@ def FeatureForce32BitJumpTables : SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true", "Force jump table entries to be 32-bits wide except at MinSize">; -def FeatureRCPC : Extension<"rcpc", "RCPC", - "Enable support for RCPC extension (FEAT_LRCPC)", [], - "FEAT_RCPC", "+rcpc", 230>; - def FeatureUseRSqrt : SubtargetFeature< "use-reciprocal-square-root", "UseRSqrt", "true", "Use the reciprocal square root approximation">; -def FeatureDotProd : Extension< - "dotprod", "DotProd", - "Enable dot product support (FEAT_DotProd)", [FeatureNEON], - "FEAT_DOTPROD", "+dotprod,+fp-armv8,+neon", 104>; - -def FeaturePAuth : Extension< - "pauth", "PAuth", - "Enable v8.3-A Pointer Authentication extension (FEAT_PAuth)">; - -let ArchExtKindSpelling = "AEK_JSCVT", MArchName = "jscvt" in -def FeatureJS : Extension< - "jsconv", "JS", - "Enable v8.3-A JavaScript FP conversion instructions (FEAT_JSCVT)", - [FeatureFPARMv8], - "FEAT_JSCVT", "+fp-armv8,+neon,+jsconv", 210>; - -def FeatureCCIDX : SubtargetFeature< - "ccidx", "HasCCIDX", "true", - "Enable v8.3-A Extend of the CCSIDR number of sets (FEAT_CCIDX)">; - -let ArchExtKindSpelling = "AEK_FCMA", MArchName = "fcma" in -def FeatureComplxNum : Extension< - "complxnum", "ComplxNum", - "Enable v8.3-A Floating-point complex number support (FEAT_FCMA)", - [FeatureNEON], - "FEAT_FCMA", "+fp-armv8,+neon,+complxnum", 220>; - -def FeatureNV : SubtargetFeature< - "nv", "HasNV", "true", - "Enable v8.4-A Nested 
Virtualization Enchancement (FEAT_NV, FEAT_NV2)">; - -def FeatureMPAM : SubtargetFeature< - "mpam", "HasMPAM", "true", - "Enable v8.4-A Memory system Partitioning and Monitoring extension (FEAT_MPAM)">; - -def FeatureDIT : Extension< - "dit", "DIT", - "Enable v8.4-A Data Independent Timing instructions (FEAT_DIT)", [], - "FEAT_DIT", "+dit", 180>; - -def FeatureTRACEV8_4 : SubtargetFeature< - "tracev8.4", "HasTRACEV8_4", "true", - "Enable v8.4-A Trace extension (FEAT_TRF)">; - -def FeatureAM : SubtargetFeature< - "am", "HasAM", "true", - "Enable v8.4-A Activity Monitors extension (FEAT_AMUv1)">; - -def FeatureAMVS : SubtargetFeature< - "amvs", "HasAMVS", "true", - "Enable v8.6-A Activity Monitors Virtualization support (FEAT_AMUv1p1)", - [FeatureAM]>; - -def FeatureSEL2 : SubtargetFeature< - "sel2", "HasSEL2", "true", - "Enable v8.4-A Secure Exception Level 2 extension (FEAT_SEL2)">; - -def FeatureTLB_RMI : SubtargetFeature< - "tlb-rmi", "HasTLB_RMI", "true", - "Enable v8.4-A TLB Range and Maintenance Instructions (FEAT_TLBIOS, FEAT_TLBIRANGE)">; - -def FeatureFlagM : Extension< - "flagm", "FlagM", - "Enable v8.4-A Flag Manipulation Instructions (FEAT_FlagM)", [], - "FEAT_FLAGM", "+flagm", 20>; - -// 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset -def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true", - "Enable v8.4-A RCPC instructions with Immediate Offsets (FEAT_LRCPC2)", - [FeatureRCPC]>; - def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", "NegativeImmediates", "false", "Convert immediates and instructions " @@ -518,186 +703,11 @@ def FeatureAggressiveFMA : "true", "Enable Aggressive FMA for floating-point.">; -def FeatureAltFPCmp : SubtargetFeature<"altnzcv", "HasAlternativeNZCV", "true", - "Enable alternative NZCV format for floating point comparisons (FEAT_FlagM2)">; - -def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true", - "Enable FRInt[32|64][Z|X] instructions that 
round a floating-point number to " - "an integer (in FP format) forcing it to fit into a 32- or 64-bit int (FEAT_FRINTTS)" >; - -def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict", - "true", "Enable architectural speculation restriction (FEAT_CSV2_2)">; - -def FeatureSB : Extension<"sb", "SB", - "Enable v8.5 Speculation Barrier (FEAT_SB)", [], - "FEAT_SB", "+sb", 470>; - -def FeatureSSBS : Extension<"ssbs", "SSBS", - "Enable Speculative Store Bypass Safe bit (FEAT_SSBS, FEAT_SSBS2)", [], - "FEAT_SSBS", "", 490>; - -def FeaturePredRes : Extension<"predres", "PredRes", - "Enable v8.5a execution and data prediction invalidation instructions (FEAT_SPECRES)", [], - "FEAT_PREDRES", "+predres", 480>; - -def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "CCDP", "true", - "Enable v8.5 Cache Clean to Point of Deep Persistence (FEAT_DPB2)" >; - -let ArchExtKindSpelling = "AEK_NONE" in -def FeatureBranchTargetId : Extension<"bti", "BTI", - "Enable Branch Target Identification (FEAT_BTI)", [], - "FEAT_BTI", "+bti", 510>; - -let ArchExtKindSpelling = "AEK_RAND", MArchName = "rng" in -def FeatureRandGen : Extension<"rand", "RandGen", - "Enable Random Number generation instructions (FEAT_RNG)", [], - "FEAT_RNG", "+rand", 10>; - -// NOTE: "memtag" means FEAT_MTE + FEAT_MTE2 for -march or -// __attribute((target(...))), but only FEAT_MTE for FMV. 
-let MArchName = "memtag" in -def FeatureMTE : Extension<"mte", "MTE", - "Enable Memory Tagging Extension (FEAT_MTE, FEAT_MTE2)", [], - "FEAT_MEMTAG", "", 440>; - -def FeatureTRBE : SubtargetFeature<"trbe", "TRBE", "true", - "Enable Trace Buffer Extension (FEAT_TRBE)">; - -def FeatureETE : SubtargetFeature<"ete", "ETE", "true", - "Enable Embedded Trace Extension (FEAT_ETE)", - [FeatureTRBE]>; - -def FeatureTME : Extension<"tme", "TME", - "Enable Transactional Memory Extension (FEAT_TME)" >; - def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", "AllowTaggedGlobals", "true", "Use an instruction sequence for taking the address of a global " "that allows a memory tag in the upper address bits">; -let ArchExtKindSpelling = "AEK_I8MM" in -def FeatureMatMulInt8 : Extension<"i8mm", "MatMulInt8", - "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)", [], - "FEAT_I8MM", "+i8mm", 270>; - -let ArchExtKindSpelling = "AEK_F32MM" in -def FeatureMatMulFP32 : Extension<"f32mm", "MatMulFP32", - "Enable Matrix Multiply FP32 Extension (FEAT_F32MM)", [FeatureSVE], - "FEAT_SVE_F32MM", "+sve,+f32mm,+fullfp16,+fp-armv8,+neon", 350>; - -let ArchExtKindSpelling = "AEK_F64MM" in -def FeatureMatMulFP64 : Extension<"f64mm", "MatMulFP64", - "Enable Matrix Multiply FP64 Extension (FEAT_F64MM)", [FeatureSVE], - "FEAT_SVE_F64MM", "+sve,+f64mm,+fullfp16,+fp-armv8,+neon", 360>; - -def FeatureXS : SubtargetFeature<"xs", "HasXS", - "true", "Enable Armv8.7-A limited-TLB-maintenance instruction (FEAT_XS)">; - -def FeatureWFxT : Extension<"wfxt", "WFxT", - "Enable Armv8.7-A WFET and WFIT instruction (FEAT_WFxT)", [], - "FEAT_WFXT", "+wfxt", 550>; - -def FeatureHCX : SubtargetFeature< - "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register (FEAT_HCX)">; - -def FeatureLS64 : Extension<"ls64", "LS64", - "Enable Armv8.7-A LD64B/ST64B Accelerator Extension (FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA)", [], - "FEAT_LS64", "", 520>; - -def FeatureHBC : Extension<"hbc", "HBC", - "Enable 
Armv8.8-A Hinted Conditional Branches Extension (FEAT_HBC)">; - -def FeatureMOPS : Extension<"mops", "MOPS", - "Enable Armv8.8-A memcpy and memset acceleration instructions (FEAT_MOPS)", [], - "FEAT_MOPS", "+mops", 650>; - -def FeatureNMI : SubtargetFeature<"nmi", "HasNMI", - "true", "Enable Armv8.8-A Non-maskable Interrupts (FEAT_NMI, FEAT_GICv3_NMI)">; - -def FeatureBRBE : Extension<"brbe", "BRBE", - "Enable Branch Record Buffer Extension (FEAT_BRBE)">; - -def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF", - "true", "Enable extra register in the Statistical Profiling Extension (FEAT_SPEv1p2)">; - -def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", - "true", "Enable fine grained virtualization traps extension (FEAT_FGT)">; - -def FeatureEnhancedCounterVirtualization : - SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization", - "true", "Enable enhanced counter virtualization extension (FEAT_ECV)">; - -def FeatureRME : SubtargetFeature<"rme", "HasRME", - "true", "Enable Realm Management Extension (FEAT_RME)">; - -def FeatureSME : Extension<"sme", "SME", - "Enable Scalable Matrix Extension (SME) (FEAT_SME)", [FeatureBF16, FeatureUseScalarIncVL], - "FEAT_SME", "+sme,+bf16", 430>; - -def FeatureSMEF64F64 : Extension<"sme-f64f64", "SMEF64F64", - "Enable Scalable Matrix Extension (SME) F64F64 instructions (FEAT_SME_F64F64)", [FeatureSME], - "FEAT_SME_F64", "+sme,+sme-f64f64,+bf16", 560>; - -def FeatureSMEI16I64 : Extension<"sme-i16i64", "SMEI16I64", - "Enable Scalable Matrix Extension (SME) I16I64 instructions (FEAT_SME_I16I64)", [FeatureSME], - "FEAT_SME_I64", "+sme,+sme-i16i64,+bf16", 570>; - -def FeatureSMEFA64 : Extension<"sme-fa64", "SMEFA64", - "Enable the full A64 instruction set in streaming SVE mode (FEAT_SME_FA64)", [FeatureSME, FeatureSVE2]>; - -def FeatureSME2 : Extension<"sme2", "SME2", - "Enable Scalable Matrix Extension 2 (SME2) instructions", [FeatureSME], - "FEAT_SME2", "+sme2,+sme,+bf16", 580>; - -let 
FMVDependencies = "+sme2,+sme-f16f16" in -def FeatureSMEF16F16 : Extension<"sme-f16f16", "SMEF16F16", - "Enable SME non-widening Float16 instructions (FEAT_SME_F16F16)", [FeatureSME2]>; - -let FMVDependencies = "+sme2p1,+sme2,+sme,+bf16" in -def FeatureSME2p1 : Extension<"sme2p1", "SME2p1", - "Enable Scalable Matrix Extension 2.1 (FEAT_SME2p1) instructions", [FeatureSME2]>; - -def FeatureFAMINMAX: Extension<"faminmax", "FAMINMAX", - "Enable FAMIN and FAMAX instructions (FEAT_FAMINMAX)">; - -def FeatureLUT: Extension<"lut", "LUT", - "Enable Lookup Table instructions (FEAT_LUT)">; - -def FeatureFP8 : Extension<"fp8", "FP8", - "Enable FP8 instructions (FEAT_FP8)", [FeatureFAMINMAX, FeatureLUT, FeatureBF16]>; - -def FeatureFP8FMA : Extension<"fp8fma", "FP8FMA", - "Enable fp8 multiply-add instructions (FEAT_FP8FMA)", [FeatureFP8]>; - -let FMVDependencies = "+sme2" in -def FeatureSSVE_FP8FMA : Extension<"ssve-fp8fma", "SSVE_FP8FMA", - "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2, FeatureFP8]>; - -def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4", - "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)", [FeatureFP8FMA]>; - -def FeatureFP8DOT2: Extension<"fp8dot2", "FP8DOT2", - "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)", [FeatureFP8DOT4]>; - -let FMVDependencies = "+sme2" in -def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4", - "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSSVE_FP8FMA]>; - -let FMVDependencies = "+sme2" in -def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2", - "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSSVE_FP8DOT4]>; - -def FeatureSME_LUTv2 : Extension<"sme-lutv2", "SME_LUTv2", - "Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">; - -let FMVDependencies = "+sme2,+fp8" in -def FeatureSMEF8F32 : Extension<"sme-f8f32", "SMEF8F32", - "Enable Scalable Matrix Extension (SME) F8F32 instructions 
(FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>; - -let FMVDependencies = "+fp8,+sme2" in -def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16", - "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSMEF8F32]>; - def FeatureAppleA7SysReg : SubtargetFeature<"apple-a7-sysreg", "HasAppleA7SysReg", "true", "Apple A7 (the CPU formerly known as Cyclone)">; @@ -707,9 +717,6 @@ def FeatureEL2VMSA : SubtargetFeature<"el2vmsa", "HasEL2VMSA", "true", def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true", "Enable Exception Level 3">; -def FeatureCSSC : Extension<"cssc", "CSSC", - "Enable Common Short Sequence Compression (CSSC) instructions (FEAT_CSSC)">; - def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769", "FixCortexA53_835769", "true", "Mitigate Cortex-A53 Erratum 835769">; @@ -718,49 +725,6 @@ def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice", "Don't place a BTI instruction " "after a return-twice">; -def FeatureCHK : SubtargetFeature<"chk", "HasCHK", - "true", "Enable Armv8.0-A Check Feature Status Extension (FEAT_CHK)">; - -def FeatureGCS : Extension<"gcs", "GCS", - "Enable Armv9.4-A Guarded Call Stack Extension", [FeatureCHK]>; - -def FeatureCLRBHB : SubtargetFeature<"clrbhb", "HasCLRBHB", - "true", "Enable Clear BHB instruction (FEAT_CLRBHB)">; - -def FeaturePRFM_SLC : SubtargetFeature<"prfm-slc-target", "HasPRFM_SLC", - "true", "Enable SLC target for PRFM instruction">; - -let MArchName = "predres2" in -def FeatureSPECRES2 : Extension<"specres2", "SPECRES2", - "Enable Speculation Restriction Instruction (FEAT_SPECRES2)", - [FeaturePredRes]>; - -def FeatureMEC : SubtargetFeature<"mec", "HasMEC", - "true", "Enable Memory Encryption Contexts Extension", [FeatureRME]>; - -def FeatureITE : Extension<"ite", "ITE", - "Enable Armv9.4-A Instrumentation Extension FEAT_ITE", [FeatureETE, - FeatureTRBE]>; - -def FeatureRCPC3 : Extension<"rcpc3", "RCPC3", - "Enable Armv8.9-A RCPC instructions for 
A64 and Advanced SIMD and floating-point instruction set (FEAT_LRCPC3)", - [FeatureRCPC_IMMO], - "FEAT_RCPC3", "+rcpc,+rcpc3", 241>; - -def FeatureTHE : Extension<"the", "THE", - "Enable Armv8.9-A Translation Hardening Extension (FEAT_THE)">; - -def FeatureLSE128 : Extension<"lse128", "LSE128", - "Enable Armv9.4-A 128-bit Atomic Instructions (FEAT_LSE128)", - [FeatureLSE]>; - -// FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, and FEAT_SYSINSTR128 are mutually implicit. -// Therefore group them all under a single feature flag, d128: -def FeatureD128 : Extension<"d128", "D128", - "Enable Armv9.4-A 128-bit Page Table Descriptors, System Registers " - "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)", - [FeatureLSE128]>; - def FeatureDisableLdp : SubtargetFeature<"disable-ldp", "HasDisableLdp", "true", "Do not emit ldp">; @@ -773,18 +737,6 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly", "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">; -// AArch64 2023 Architecture Extensions (v9.5-A) - -def FeatureCPA : Extension<"cpa", "CPA", - "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">; - -def FeaturePAuthLR : Extension<"pauth-lr", "PAuthLR", - "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">; - -def FeatureTLBIW : Extension<"tlbiw", "TLBIW", - "Enable ARMv9.5-A TLBI VMALL for Dirty State (FEAT_TLBIW)">; - - //===----------------------------------------------------------------------===// // Architectures. 
// @@ -847,7 +799,7 @@ def HasV8_9aOps : Architecture64<8, 9, "a", "v8.9a", !listconcat(HasV8_8aOps.DefaultExts, [FeatureSPECRES2, FeatureCSSC, FeatureRASv2])>; def HasV9_0aOps : Architecture64<9, 0, "a", "v9a", - [HasV8_5aOps, FeatureMEC, FeatureSVE2], + [HasV8_5aOps, FeatureMEC], !listconcat(HasV8_5aOps.DefaultExts, [FeatureFullFP16, FeatureSVE, FeatureSVE2])>; def HasV9_1aOps : Architecture64<9, 1, "a", "v9.1a", @@ -914,4 +866,4 @@ def FeatureHardenSlsNoComdat : SubtargetFeature<"harden-sls-nocomdat", // Only intended to be used by disassemblers. def FeatureAll - : SubtargetFeature<"all", "IsAll", "true", "Enable all instructions", []>; + : SubtargetFeature<"all", "IsAll", "true", "Enable all instructions">; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index cd532671f501..8216fa7db822 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog( return false; auto *AFI = MF.getInfo<AArch64FunctionInfo>(); - if (AFI->hasSwiftAsyncContext()) + if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges()) return false; // If there are an odd number of GPRs before LR and FP in the CSRs list, @@ -431,8 +431,16 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); uint64_t NumBytes = AFI->getLocalStackSize(); + // If neither NEON or SVE are available, a COPY from one Q-reg to + // another requires a spill -> reload sequence. We can do that + // using a pre-decrementing store/post-decrementing load, but + // if we do so, we can't use the Red Zone. 
+ bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() && + !Subtarget.isNeonAvailable() && + !Subtarget.hasSVE(); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize || - getSVEStackSize(MF)); + getSVEStackSize(MF) || LowerQRegCopyThroughMem); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -550,6 +558,10 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + SMEAttrs Attrs(MF.getFunction()); + bool LocallyStreaming = + Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface(); const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); if (CSI.empty()) @@ -561,14 +573,22 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations( DebugLoc DL = MBB.findDebugLoc(MBBI); for (const auto &Info : CSI) { - if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) + unsigned FrameIdx = Info.getFrameIdx(); + if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) continue; assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); - unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true); + int64_t DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true); + int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea(); + + // The location of VG will be emitted before each streaming-mode change in + // the function. Only locally-streaming functions require emitting the + // non-streaming VG location here. 
+ if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) || + (!LocallyStreaming && + DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true))) + continue; - int64_t Offset = - MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -691,6 +711,9 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB, !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg)) continue; + if (!Info.isRestored()) + continue; + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( nullptr, TRI.getDwarfRegNum(Info.getReg(), true))); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -1013,7 +1036,10 @@ static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { MachineFunction *MF = MBB->getParent(); // If MBB is an entry block, use X9 as the scratch register - if (&MF->front() == MBB) + // preserve_none functions may be using X9 to pass arguments, + // so prefer to pick an available register below. 
+ if (&MF->front() == MBB && + MF->getFunction().getCallingConv() != CallingConv::PreserveNone) return AArch64::X9; const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); @@ -1334,6 +1360,32 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize); } +bool requiresGetVGCall(MachineFunction &MF) { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return AFI->hasStreamingModeChanges() && + !MF.getSubtarget<AArch64Subtarget>().hasSVE(); +} + +bool isVGInstruction(MachineBasicBlock::iterator MBBI) { + unsigned Opc = MBBI->getOpcode(); + if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI || + Opc == AArch64::UBFMXri) + return true; + + if (requiresGetVGCall(*MBBI->getMF())) { + if (Opc == AArch64::ORRXrr) + return true; + + if (Opc == AArch64::BL) { + auto Op1 = MBBI->getOperand(0); + return Op1.isSymbol() && + (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg"); + } + } + + return false; +} + // Convert callee-save register save/restore instruction to do stack pointer // decrement/increment to allocate/deallocate the callee-save stack area by // converting store/load to use pre/post increment version. @@ -1344,6 +1396,17 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup, int CFAOffset = 0) { unsigned NewOpc; + + // If the function contains streaming mode changes, we expect instructions + // to calculate the value of VG before spilling. For locally-streaming + // functions, we need to do this for both the streaming and non-streaming + // vector length. Move past these instructions if necessary. 
+ MachineFunction &MF = *MBB.getParent(); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + if (AFI->hasStreamingModeChanges()) + while (isVGInstruction(MBBI)) + ++MBBI; + switch (MBBI->getOpcode()) { default: llvm_unreachable("Unexpected callee-save save/restore opcode!"); @@ -1400,7 +1463,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // If the first store isn't right where we want SP then we can't fold the // update in so create a normal arithmetic instruction instead. - MachineFunction &MF = *MBB.getParent(); if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 || CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, @@ -1652,6 +1714,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, LiveRegs.removeReg(AArch64::X19); LiveRegs.removeReg(AArch64::FP); LiveRegs.removeReg(AArch64::LR); + + // X0 will be clobbered by a call to __arm_get_current_vg in the prologue. + // This is necessary to spill VG if required where SVE is unavailable, but + // X0 is preserved around this call. + if (requiresGetVGCall(MF)) + LiveRegs.removeReg(AArch64::X0); } auto VerifyClobberOnExit = make_scope_exit([&]() { @@ -1838,6 +1906,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // pointer bump above. 
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) && !IsSVECalleeSave(MBBI)) { + // Move past instructions generated to calculate VG + if (AFI->hasStreamingModeChanges()) + while (isVGInstruction(MBBI)) + ++MBBI; + if (CombineSPBump) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); @@ -2760,7 +2833,7 @@ struct RegPairInfo { unsigned Reg2 = AArch64::NoRegister; int FrameIdx; int Offset; - enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type; + enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; RegPairInfo() = default; @@ -2772,6 +2845,7 @@ struct RegPairInfo { return 2; case GPR: case FPR64: + case VG: return 8; case ZPR: case FPR128: @@ -2847,6 +2921,8 @@ static void computeCalleeSaveRegisterPairs( RPI.Type = RegPairInfo::ZPR; else if (AArch64::PPRRegClass.contains(RPI.Reg1)) RPI.Type = RegPairInfo::PPR; + else if (RPI.Reg1 == AArch64::VG) + RPI.Type = RegPairInfo::VG; else llvm_unreachable("Unsupported register class."); @@ -2879,6 +2955,8 @@ static void computeCalleeSaveRegisterPairs( if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) RPI.Reg2 = NextReg; break; + case RegPairInfo::VG: + break; } } @@ -2995,6 +3073,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool NeedsWinCFI = needsWinCFI(MF); DebugLoc DL; SmallVector<RegPairInfo, 8> RegPairs; @@ -3062,7 +3141,70 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( Size = 2; Alignment = Align(2); break; + case RegPairInfo::VG: + StrOpc = AArch64::STRXui; + Size = 8; + Alignment = Align(8); + break; + } + + unsigned X0Scratch = AArch64::NoRegister; + if (Reg1 == AArch64::VG) { + // Find an available register to store value of VG to. 
+ Reg1 = findScratchNonCalleeSaveRegister(&MBB); + assert(Reg1 != AArch64::NoRegister); + SMEAttrs Attrs(MF.getFunction()); + + if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() && + AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) { + // For locally-streaming functions, we need to store both the streaming + // & non-streaming VG. Spill the streaming value first. + BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1) + .addImm(1) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1) + .addReg(Reg1) + .addImm(3) + .addImm(63) + .setMIFlag(MachineInstr::FrameSetup); + + AFI->setStreamingVGIdx(RPI.FrameIdx); + } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) { + BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1) + .addImm(31) + .addImm(1) + .setMIFlag(MachineInstr::FrameSetup); + AFI->setVGIdx(RPI.FrameIdx); + } else { + const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); + if (llvm::any_of( + MBB.liveins(), + [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { + return STI.getRegisterInfo()->isSuperOrSubRegisterEq( + AArch64::X0, LiveIn.PhysReg); + })) + X0Scratch = Reg1; + + if (X0Scratch != AArch64::NoRegister) + BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1) + .addReg(AArch64::XZR) + .addReg(AArch64::X0, RegState::Undef) + .addReg(AArch64::X0, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + + const uint32_t *RegMask = TRI->getCallPreservedMask( + MF, + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); + BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) + .addExternalSymbol("__arm_get_current_vg") + .addRegMask(RegMask) + .addReg(AArch64::X0, RegState::ImplicitDefine) + .setMIFlag(MachineInstr::FrameSetup); + Reg1 = AArch64::X0; + AFI->setVGIdx(RPI.FrameIdx); + } } + LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); dbgs() << ") -> fi#(" << RPI.FrameIdx; @@ 
-3154,6 +3296,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( if (RPI.isPaired()) MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector); } + + if (X0Scratch != AArch64::NoRegister) + BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0) + .addReg(AArch64::XZR) + .addReg(X0Scratch, RegState::Undef) + .addReg(X0Scratch, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); } return true; } @@ -3233,6 +3382,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( Size = 2; Alignment = Align(2); break; + case RegPairInfo::VG: + continue; } LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -3432,6 +3583,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, CSStackSize += RegSize; } + // Increase the callee-saved stack size if the function has streaming mode + // changes, as we will need to spill the value of the VG register. + // For locally streaming functions, we spill both the streaming and + // non-streaming VG value. + const Function &F = MF.getFunction(); + SMEAttrs Attrs(F); + if (AFI->hasStreamingModeChanges()) { + if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface()) + CSStackSize += 16; + else + CSStackSize += 8; + } + // Save number of saved regs, so we can easily update CSStackSize later. unsigned NumSavedRegs = SavedRegs.count(); @@ -3568,6 +3732,33 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } + // Insert VG into the list of CSRs, immediately before LR if saved. + if (AFI->hasStreamingModeChanges()) { + std::vector<CalleeSavedInfo> VGSaves; + SMEAttrs Attrs(MF.getFunction()); + + auto VGInfo = CalleeSavedInfo(AArch64::VG); + VGInfo.setRestored(false); + VGSaves.push_back(VGInfo); + + // Add VG again if the function is locally-streaming, as we will spill two + // values. 
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface()) + VGSaves.push_back(VGInfo); + + bool InsertBeforeLR = false; + + for (unsigned I = 0; I < CSI.size(); I++) + if (CSI[I].getReg() == AArch64::LR) { + InsertBeforeLR = true; + CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end()); + break; + } + + if (!InsertBeforeLR) + CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end()); + } + for (auto &CS : CSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); @@ -4183,12 +4374,58 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, } } // namespace +MachineBasicBlock::iterator emitVGSaveRestore(MachineBasicBlock::iterator II, + const AArch64FrameLowering *TFI) { + MachineInstr &MI = *II; + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); + + if (MI.getOpcode() != AArch64::VGSavePseudo && + MI.getOpcode() != AArch64::VGRestorePseudo) + return II; + + SMEAttrs FuncAttrs(MF->getFunction()); + bool LocallyStreaming = + FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface(); + const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + const AArch64InstrInfo *TII = + MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); + + int64_t VGFrameIdx = + LocallyStreaming ? 
AFI->getStreamingVGIdx() : AFI->getVGIdx(); + assert(VGFrameIdx != std::numeric_limits<int>::max() && + "Expected FrameIdx for VG"); + + unsigned CFIIndex; + if (MI.getOpcode() == AArch64::VGSavePseudo) { + const MachineFrameInfo &MFI = MF->getFrameInfo(); + int64_t Offset = + MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea(); + CFIIndex = MF->addFrameInst(MCCFIInstruction::createOffset( + nullptr, TRI->getDwarfRegNum(AArch64::VG, true), Offset)); + } else + CFIIndex = MF->addFrameInst(MCCFIInstruction::createRestore( + nullptr, TRI->getDwarfRegNum(AArch64::VG, true))); + + MachineInstr *UnwindInst = BuildMI(*MBB, II, II->getDebugLoc(), + TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + MI.eraseFromParent(); + return UnwindInst->getIterator(); +} + void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( MachineFunction &MF, RegScavenger *RS = nullptr) const { - if (StackTaggingMergeSetTag) - for (auto &BB : MF) - for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + for (auto &BB : MF) + for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) { + if (AFI->hasStreamingModeChanges()) + II = emitVGSaveRestore(II, this); + if (StackTaggingMergeSetTag) II = tryMergeAdjacentSTG(II, this, RS); + } } /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP diff --git a/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index b87421e5ee46..82066b48c84b 100644 --- a/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -137,7 +137,9 @@ bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx, unsigned Offset) { unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min; const ValueMapping &Map = - AArch64GenRegisterBankInfo::getValueMapping((PartialMappingIdx)FirstInBank, Size)[Offset]; + 
AArch64GenRegisterBankInfo::getValueMapping( + (PartialMappingIdx)FirstInBank, + TypeSize::getFixed(Size))[Offset]; return Map.BreakDown == &PartMappings[PartialMapBaseIdx] && Map.NumBreakDowns == 1; } @@ -167,7 +169,7 @@ bool AArch64GenRegisterBankInfo::checkPartialMappingIdx( } unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx, - unsigned Size) { + TypeSize Size) { if (RBIdx == PMI_FirstGPR) { if (Size <= 32) return 0; @@ -178,17 +180,20 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx, return -1; } if (RBIdx == PMI_FirstFPR) { - if (Size <= 16) + const unsigned MinSize = Size.getKnownMinValue(); + assert((!Size.isScalable() || MinSize >= 128) && + "Scalable vector types should have size of at least 128 bits"); + if (MinSize <= 16) return 0; - if (Size <= 32) + if (MinSize <= 32) return 1; - if (Size <= 64) + if (MinSize <= 64) return 2; - if (Size <= 128) + if (MinSize <= 128) return 3; - if (Size <= 256) + if (MinSize <= 256) return 4; - if (Size <= 512) + if (MinSize <= 512) return 5; return -1; } @@ -197,7 +202,7 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx, const RegisterBankInfo::ValueMapping * AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx, - unsigned Size) { + const TypeSize Size) { assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that"); unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size); if (BaseIdxOffset == -1u) @@ -221,7 +226,8 @@ const AArch64GenRegisterBankInfo::PartialMappingIdx const RegisterBankInfo::ValueMapping * AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID, - unsigned SrcBankID, unsigned Size) { + unsigned SrcBankID, + const TypeSize Size) { assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID"); assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID"); PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID]; diff --git 
a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 248778f98f4c..544eec3ab9ce 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -395,7 +395,8 @@ public: template <unsigned MaxIdx, unsigned Scale> void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg, unsigned Op); - + void SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, unsigned Op, + unsigned MaxIdx, unsigned Scale); bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); /// SVE Reg+Imm addressing mode. template <int64_t Min, int64_t Max> @@ -2003,6 +2004,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } +void AArch64DAGToDAGISel::SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, + unsigned Op, unsigned MaxIdx, + unsigned Scale) { + + SDValue SliceBase = N->getOperand(3); + SDValue Base, Offset; + if (!SelectSMETileSlice(SliceBase, MaxIdx, Base, Offset, Scale)) + return; + // The correct Za tile number is computed in Machine Instruction + // See EmitZAInstr + // DAG cannot select Za tile as an output register with ZReg + SDLoc DL(N); + SDValue Ops[] = {/*TileNum*/ N->getOperand(2), Base, Offset, + /*Chain*/ N->getOperand(0)}; + SDNode *Mov = CurDAG->getMachineNode(Op, DL, {MVT::Untyped, MVT::Other}, Ops); + + EVT VT = N->getValueType(0); + for (unsigned I = 0; I < NumVecs; ++I) + ReplaceUses(SDValue(N, I), + CurDAG->getTargetExtractSubreg(AArch64::zsub0 + I, DL, VT, + SDValue(Mov, 0))); + + // Copy chain + unsigned ChainIdx = NumVecs; + ReplaceUses(SDValue(N, ChainIdx), SDValue(Mov, 1)); + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs, bool IsTupleInput, @@ -4359,7 +4388,9 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) { // N1 = SRL_PRED true, V, splat(imm) --> rotr amount // N0 = SHL_PRED true, V, 
splat(bits-imm) // V = (xor x, y) - if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) { + if (VT.isScalableVector() && + (Subtarget->hasSVE2() || + (Subtarget->hasSME() && Subtarget->isStreaming()))) { if (N0.getOpcode() != AArch64ISD::SHL_PRED || N1.getOpcode() != AArch64ISD::SRL_PRED) std::swap(N0, N1); @@ -5243,6 +5274,74 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::MOVA_VG4_4ZMXI); return; } + case Intrinsic::aarch64_sme_readz_horiz_x2: { + if (VT == MVT::nxv16i8) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_B_PSEUDO, 14, 2); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_H_PSEUDO, 6, 2); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_S_PSEUDO, 2, 2); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_D_PSEUDO, 0, 2); + return; + } + break; + } + case Intrinsic::aarch64_sme_readz_vert_x2: { + if (VT == MVT::nxv16i8) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_B_PSEUDO, 14, 2); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_H_PSEUDO, 6, 2); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_S_PSEUDO, 2, 2); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_D_PSEUDO, 0, 2); + return; + } + break; + } + case Intrinsic::aarch64_sme_readz_horiz_x4: { + if (VT == MVT::nxv16i8) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_B_PSEUDO, 12, 4); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_H_PSEUDO, 4, 4); + return; + } else 
if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_S_PSEUDO, 0, 4); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_D_PSEUDO, 0, 4); + return; + } + break; + } + case Intrinsic::aarch64_sme_readz_vert_x4: { + if (VT == MVT::nxv16i8) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_B_PSEUDO, 12, 4); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_H_PSEUDO, 4, 4); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_S_PSEUDO, 0, 4); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_D_PSEUDO, 0, 4); + return; + } + break; + } case Intrinsic::swift_async_context_addr: { SDLoc DL(Node); SDValue Chain = Node->getOperand(0); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 48bf648b0052..81132572e820 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -149,7 +149,7 @@ static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, // scalable vector types for all instruction, even if SVE is not yet supported // with some instructions. // See [AArch64TargetLowering::fallbackToDAGISel] for implementation details. 
-static cl::opt<bool> EnableSVEGISel( +cl::opt<bool> EnableSVEGISel( "aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false)); @@ -423,7 +423,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRType(MVT::v8bf16); } - if (Subtarget->hasSVEorSME()) { + if (Subtarget->isSVEorStreamingSVEAvailable()) { // Add legal sve predicate types addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); @@ -728,14 +728,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote); } - for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, - ISD::FCOS, ISD::FSIN, ISD::FSINCOS, - ISD::FTAN, ISD::FEXP, ISD::FEXP2, - ISD::FEXP10, ISD::FLOG, ISD::FLOG2, - ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, - ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, - ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG, - ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) { + for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, + ISD::FCOS, ISD::FSIN, ISD::FSINCOS, + ISD::FTAN, ISD::FEXP, ISD::FEXP2, + ISD::FEXP10, ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, + ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, + ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG, + ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::v4f16, Expand); setOperationAction(Op, MVT::v8f16, Expand); @@ -1408,7 +1408,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // FIXME: Move lowering for more nodes here if those are common between // SVE and SME. 
- if (Subtarget->hasSVEorSME()) { + if (Subtarget->isSVEorStreamingSVEAvailable()) { for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); @@ -1418,7 +1418,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - if (Subtarget->hasSVEorSME()) { + if (Subtarget->isSVEorStreamingSVEAvailable()) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); setOperationAction(ISD::BSWAP, VT, Custom); @@ -1430,8 +1430,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); @@ -1486,7 +1484,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (!Subtarget->isLittleEndian()) setOperationAction(ISD::BITCAST, VT, Expand); - if (Subtarget->hasSVE2orSME()) + if (Subtarget->hasSVE2() || + (Subtarget->hasSME() && Subtarget->isStreaming())) // For SLI/SRI. setOperationAction(ISD::OR, VT, Custom); } @@ -1528,14 +1527,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does - for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, - MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, - MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { + // NEON doesn't support masked loads/stores, but SME and SVE do. 
+ for (auto VT : + {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, + MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, + MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); } // Firstly, exclude all scalable vector extending loads/truncating stores, @@ -1576,8 +1574,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MVT::nxv4f32, MVT::nxv2f64}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); @@ -1611,8 +1607,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom); setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom); - if (Subtarget->isSVEAvailable()) - setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); @@ -1650,8 +1644,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); @@ -1675,18 +1667,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 
setOperationAction(ISD::MUL, MVT::v1i64, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); - if (Subtarget->isSVEAvailable()) { - // NEON doesn't support across-vector reductions, but SVE does. - for (auto VT : - {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) - setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); - } - - // Histcnt is SVE2 only - if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable()) - setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other, - Custom); - // NOTE: Currently this has to happen after computeRegisterProperties rather // than the preferred option of combining it with the addRegisterClass call. if (Subtarget->useSVEForFixedLengthVectors()) { @@ -1762,6 +1742,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom); } + // Handle operations that are only available in non-streaming SVE mode. + if (Subtarget->isSVEAvailable()) { + for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64, + MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, + MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16, + MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32, + MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8, + MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, + MVT::v4i32, MVT::v1i64, MVT::v2i64}) { + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); + } + + for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, + MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16, + MVT::v2f32, MVT::v4f32, MVT::v2f64}) + setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); + + // Histcnt is SVE2 only + if (Subtarget->hasSVE2()) + setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other, + Custom); + } + + if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { // Only required for llvm.aarch64.mops.memset.tag setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, 
Custom); @@ -1781,6 +1786,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, IsStrictFPEnabled = true; setMaxAtomicSizeInBitsSupported(128); + // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has + // it, but it's just a wrapper around ldexp. + if (Subtarget->isTargetWindows()) { + for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) + if (isOperationExpand(Op, MVT::f32)) + setOperationAction(Op, MVT::f32, Promote); + } + + // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16 + // isn't legal. + for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) + if (isOperationExpand(Op, MVT::f16)) + setOperationAction(Op, MVT::f16, Promote); + if (Subtarget->isWindowsArm64EC()) { // FIXME: are there intrinsics we need to exclude from this? for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { @@ -1933,7 +1952,7 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, } bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const { - if (!Subtarget->hasSVEorSME()) + if (!Subtarget->isSVEorStreamingSVEAvailable()) return true; // We can only use the BRKB + CNTP sequence with legal predicate types. 
We can @@ -2492,7 +2511,11 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; + MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER) + MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ) MAKE_CASE(AArch64ISD::COALESCER_BARRIER) + MAKE_CASE(AArch64ISD::VG_SAVE) + MAKE_CASE(AArch64ISD::VG_RESTORE) MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) @@ -2953,18 +2976,25 @@ MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI, MachineBasicBlock * AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, - MachineBasicBlock *BB, bool HasTile) const { + MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); unsigned StartIdx = 0; + bool HasTile = BaseReg != AArch64::ZA; + bool HasZPROut = HasTile && MI.getOperand(0).isReg(); + if (HasZPROut) { + MIB.add(MI.getOperand(0)); // Output ZPR + ++StartIdx; + } if (HasTile) { - MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); - MIB.addReg(BaseReg + MI.getOperand(0).getImm()); - StartIdx = 1; - } else + MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(), + RegState::Define); // Output ZA Tile + MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile + StartIdx++; + } else { MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg); - + } for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); @@ -2989,6 +3019,80 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + TPIDR2Object 
&TPIDR2 = FuncInfo->getTPIDR2Obj(); + if (TPIDR2.Uses > 0) { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + // Store the buffer pointer to the TPIDR2 stack object. + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + .addReg(MI.getOperand(0).getReg()) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(0); + // Set the reserved bytes (10-15) to zero + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) + .addReg(AArch64::WZR) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(5); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) + .addReg(AArch64::WZR) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(3); + } else + MFI.RemoveStackObject(TPIDR2.FrameIndex); + + BB->remove_instr(&MI); + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + // TODO This function grows the stack with a subtraction, which doesn't work + // on Windows. 
Some refactoring to share the functionality in + // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI + // supports SME + assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() && + "Lazy ZA save is not yet supported on Windows"); + + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + + if (TPIDR2.Uses > 0) { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // The SUBXrs below won't always be emitted in a form that accepts SP + // directly + Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) + .addReg(AArch64::SP); + + // Allocate a lazy-save buffer object of the size given, normally SVL * SVL + auto Size = MI.getOperand(1).getReg(); + auto Dest = MI.getOperand(0).getReg(); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest) + .addReg(Size) + .addReg(Size) + .addReg(SP); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), + AArch64::SP) + .addReg(Dest); + + // We have just allocated a variable sized object, tell this to PEI. 
+ MFI.CreateVariableSizedObject(Align(16), nullptr); + } + + BB->remove_instr(&MI); + return BB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -2999,17 +3103,17 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask; switch (SMEMatrixType) { case (AArch64::SMEMatrixArray): - return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false); + return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB); case (AArch64::SMEMatrixTileB): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB); case (AArch64::SMEMatrixTileH): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB); case (AArch64::SMEMatrixTileS): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB); case (AArch64::SMEMatrixTileD): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB); case (AArch64::SMEMatrixTileQ): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB); } } @@ -3019,7 +3123,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); - + case AArch64::InitTPIDR2Obj: + return EmitInitTPIDR2Object(MI, BB); + case AArch64::AllocateZABuffer: + return EmitAllocateZABuffer(MI, BB); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STATEPOINT: @@ -3434,6 +3541,12 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); } Opcode = AArch64ISD::FCCMP; + } 
else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) { + APInt Imm = Const->getAPIntValue(); + if (Imm.isNegative() && Imm.sgt(-32)) { + Opcode = AArch64ISD::CCMN; + RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0)); + } } else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { @@ -6410,6 +6523,10 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, if (LoadNode->getMemoryVT() != MVT::v4i8) return SDValue(); + // Avoid generating unaligned loads. + if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4)) + return SDValue(); + unsigned ExtType; if (LoadNode->getExtensionType() == ISD::SEXTLOAD) ExtType = ISD::SIGN_EXTEND; @@ -6901,7 +7018,7 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( // NEON-sized vectors can be emulated using SVE instructions. if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) - return Subtarget->hasSVEorSME(); + return Subtarget->isSVEorStreamingSVEAvailable(); // Ensure NEON MVTs only belong to a single register class. 
if (VT.getFixedSizeInBits() <= 128) @@ -7027,47 +7144,6 @@ AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { } } - -unsigned -AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - - // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) - SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); - SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)}; - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); - SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops); - Chain = Buffer.getValue(1); - MFI.CreateVariableSizedObject(Align(1), nullptr); - - // Allocate an additional TPIDR2 object on the stack (16 bytes) - unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false); - - // Store the buffer pointer to the TPIDR2 stack object. 
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); - SDValue Ptr = DAG.getFrameIndex( - TPIDR2Obj, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); - - // Set the reserved bytes (10-15) to zero - EVT PtrTy = Ptr.getValueType(); - SDValue ReservedPtr = - DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy)); - Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr, - MPI); - ReservedPtr = - DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy)); - Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr, - MPI); - - return TPIDR2Obj; -} - static bool isPassedInFPR(EVT VT) { return VT.isFixedLengthVector() || (VT.isFloatingPoint() && !VT.isScalableVector()); @@ -7483,10 +7559,28 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - // Conservatively assume the function requires the lazy-save mechanism. + // Create a 16 Byte TPIDR2 object. The dynamic buffer + // will be expanded and stored in the static object later using a pseudonode. 
if (SMEAttrs(MF.getFunction()).hasZAState()) { - unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); - FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); + SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); + } else { + SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + Chain = DAG.getNode( + AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); } if (CallConv == CallingConv::PreserveNone) { @@ -8172,9 +8266,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); if (RequiresLazySave) { - unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); - MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); - SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, + const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + MachinePointerInfo MPI = + MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex); + SDValue TPIDR2ObjAddr = DAG.getFrameIndex( + TPIDR2.FrameIndex, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); SDValue NumZaSaveSlicesAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, @@ -8514,6 +8610,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue InGlue; if (RequiresSMChange) { + + Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Chain); + InGlue 
= Chain.getValue(1); + SDValue NewChain = changeStreamingMode( DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue, getSMCondition(CallerAttrs, CalleeAttrs), PStateSM); @@ -8691,6 +8792,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Result = changeStreamingMode( DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue, getSMCondition(CallerAttrs, CalleeAttrs), PStateSM); + InGlue = Result.getValue(1); + + Result = + DAG.getNode(AArch64ISD::VG_RESTORE, DL, + DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue}); } if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs)) @@ -8707,7 +8813,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. - unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( @@ -8720,7 +8826,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // RESTORE_ZA pseudo. 
SDValue Glue; SDValue TPIDR2Block = DAG.getFrameIndex( - FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + TPIDR2.FrameIndex, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, @@ -8732,6 +8839,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, ISD::INTRINSIC_VOID, DL, MVT::Other, Result, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); + TPIDR2.Uses++; } if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) { @@ -9735,9 +9843,7 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, EltSize *= 2; NumElts /= 2; MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); - Val = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, - DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); + Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val); } return Val; @@ -14443,7 +14549,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: - if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) { + if (VT.isScalableVector() && + (Subtarget->hasSVE2() || + (Subtarget->hasSME() && Subtarget->isStreaming()))) { SDValue RShOperand; unsigned ShiftValue; if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand)) @@ -14995,55 +15103,13 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return SDValue(); } -// When x and y are extended, lower: -// avgfloor(x, y) -> (x + y) >> 1 -// avgceil(x, y) -> (x + y + 1) >> 1 - -// Otherwise, lower to: -// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1) -// avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x || y) & 1) SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const { if (Subtarget->hasSVE2()) return LowerToPredicatedOp(Op, 
DAG, NewOp); - SDLoc dl(Op); - SDValue OpA = Op->getOperand(0); - SDValue OpB = Op->getOperand(1); - EVT VT = Op.getValueType(); - bool IsCeil = - (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU); - bool IsSigned = - (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS); - unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL; - - assert(VT.isScalableVector() && "Only expect to lower scalable vector op!"); - - auto IsZeroExtended = [&DAG](SDValue &Node) { - KnownBits Known = DAG.computeKnownBits(Node, 0); - return Known.Zero.isSignBitSet(); - }; - - auto IsSignExtended = [&DAG](SDValue &Node) { - return (DAG.ComputeNumSignBits(Node, 0) > 1); - }; - - SDValue ConstantOne = DAG.getConstant(1, dl, VT); - if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) || - (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) { - SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB); - if (IsCeil) - Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne); - return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne); - } - - SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne); - SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne); - - SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB); - tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne); - SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB); - return DAG.getNode(ISD::ADD, dl, VT, Add, tmp); + // Default to expand. 
+ return SDValue(); } SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, @@ -15854,48 +15920,49 @@ bool AArch64TargetLowering::shouldSinkOperands( return false; } -static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, - bool IsLittleEndian) { - Value *Op = ZExt->getOperand(0); - auto *SrcTy = cast<FixedVectorType>(Op->getType()); - auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth(); - auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth(); +static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, + unsigned NumElts, bool IsLittleEndian, + SmallVectorImpl<int> &Mask) { if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64) return false; assert(DstWidth % SrcWidth == 0 && - "TBL lowering is not supported for a ZExt instruction with this " - "source & destination element type."); - unsigned ZExtFactor = DstWidth / SrcWidth; + "TBL lowering is not supported for a conversion instruction with this " + "source and destination element type."); + + unsigned Factor = DstWidth / SrcWidth; + unsigned MaskLen = NumElts * Factor; + + Mask.clear(); + Mask.resize(MaskLen, NumElts); + + unsigned SrcIndex = 0; + for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor) + Mask[I] = SrcIndex++; + + return true; +} + +static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, + FixedVectorType *ZExtTy, + FixedVectorType *DstTy, + bool IsLittleEndian) { + auto *SrcTy = cast<FixedVectorType>(Op->getType()); unsigned NumElts = SrcTy->getNumElements(); - IRBuilder<> Builder(ZExt); + auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth(); + auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth(); + SmallVector<int> Mask; - // Create a mask that selects <0,...,Op[i]> for each lane of the destination - // vector to replace the original ZExt. This can later be lowered to a set of - // tbl instructions. 
- for (unsigned i = 0; i < NumElts * ZExtFactor; i++) { - if (IsLittleEndian) { - if (i % ZExtFactor == 0) - Mask.push_back(i / ZExtFactor); - else - Mask.push_back(NumElts); - } else { - if ((i + 1) % ZExtFactor == 0) - Mask.push_back((i - ZExtFactor + 1) / ZExtFactor); - else - Mask.push_back(NumElts); - } - } + if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask)) + return nullptr; auto *FirstEltZero = Builder.CreateInsertElement( PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); Result = Builder.CreateBitCast(Result, DstTy); - if (DstTy != ZExt->getType()) - Result = Builder.CreateZExt(Result, ZExt->getType()); - ZExt->replaceAllUsesWith(Result); - ZExt->eraseFromParent(); - return true; + if (DstTy != ZExtTy) + Result = Builder.CreateZExt(Result, ZExtTy); + return Result; } static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { @@ -16060,21 +16127,30 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( DstTy = TruncDstType; } - - return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian()); + IRBuilder<> Builder(ZExt); + Value *Result = createTblShuffleForZExt( + Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()), + DstTy, Subtarget->isLittleEndian()); + if (!Result) + return false; + ZExt->replaceAllUsesWith(Result); + ZExt->eraseFromParent(); + return true; } auto *UIToFP = dyn_cast<UIToFPInst>(I); if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) && DstTy->getElementType()->isFloatTy()) { IRBuilder<> Builder(I); - auto *ZExt = cast<ZExtInst>( - Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy))); + Value *ZExt = createTblShuffleForZExt( + Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy), + FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian()); + if (!ZExt) + return false; auto *UI = Builder.CreateUIToFP(ZExt, DstTy); I->replaceAllUsesWith(UI); 
I->eraseFromParent(); - return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()), - Subtarget->isLittleEndian()); + return true; } // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui @@ -16149,15 +16225,13 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType( UseScalable = false; - if (!VecTy->isScalableTy() && !Subtarget->isNeonAvailable() && - !Subtarget->useSVEForFixedLengthVectors()) - return false; - - if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME()) + if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() && + (!Subtarget->useSVEForFixedLengthVectors() || + !getSVEPredPatternFromNumElements(MinElts))) return false; - // Ensure that the predicate for this number of elements is available. - if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts)) + if (isa<ScalableVectorType>(VecTy) && + !Subtarget->isSVEorStreamingSVEAvailable()) return false; // Ensure the number of vector elements is greater than 1. @@ -17720,6 +17794,47 @@ static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM); } +// Transform vector add(zext i8 to i32, zext i8 to i32) +// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32) +// This allows extra uses of saddl/uaddl at the lower vector widths, and less +// extends. 
+static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 || + (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) || + (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) || + N->getOperand(0).getOperand(0).getValueType() != + N->getOperand(1).getOperand(0).getValueType()) + return SDValue(); + + if (N->getOpcode() == ISD::MUL && + N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode()) + return SDValue(); + + SDValue N0 = N->getOperand(0).getOperand(0); + SDValue N1 = N->getOperand(1).getOperand(0); + EVT InVT = N0.getValueType(); + + EVT S1 = InVT.getScalarType(); + EVT S2 = VT.getScalarType(); + if ((S2 == MVT::i32 && S1 == MVT::i8) || + (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) { + SDLoc DL(N); + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), + S2.getHalfSizedIntegerVT(*DAG.getContext()), + VT.getVectorElementCount()); + SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0); + SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1); + SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1); + return DAG.getNode(N->getOpcode() == ISD::MUL ? 
N->getOperand(0).getOpcode() + : (unsigned)ISD::SIGN_EXTEND, + DL, VT, NewOp); + } + return SDValue(); +} + static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -17728,6 +17843,8 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, return Ext; if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG)) return Ext; + if (SDValue Ext = performVectorExtCombine(N, DAG)) + return Ext; if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -18124,9 +18241,11 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, if (!VT.isVector()) return SDValue(); - // The combining code works for NEON, SVE2 and SME. - if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) || - (VT.isScalableVector() && !Subtarget.hasSVE2())) + if (VT.isScalableVector() && !Subtarget.hasSVE2()) + return SDValue(); + + if (VT.isFixedLengthVector() && + (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) return SDValue(); SDValue N0 = N->getOperand(0); @@ -19604,41 +19723,6 @@ static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond); } -// Transform vector add(zext i8 to i32, zext i8 to i32) -// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32) -// This allows extra uses of saddl/uaddl at the lower vector widths, and less -// extends. 
-static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 || - (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && - N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) || - (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && - N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) || - N->getOperand(0).getOperand(0).getValueType() != - N->getOperand(1).getOperand(0).getValueType()) - return SDValue(); - - SDValue N0 = N->getOperand(0).getOperand(0); - SDValue N1 = N->getOperand(1).getOperand(0); - EVT InVT = N0.getValueType(); - - EVT S1 = InVT.getScalarType(); - EVT S2 = VT.getScalarType(); - if ((S2 == MVT::i32 && S1 == MVT::i8) || - (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) { - SDLoc DL(N); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), - S2.getHalfSizedIntegerVT(*DAG.getContext()), - VT.getVectorElementCount()); - SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0); - SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1); - SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp); - } - return SDValue(); -} - static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -20260,7 +20344,7 @@ static SDValue performAddSubCombine(SDNode *N, return Val; if (SDValue Val = performNegCSelCombine(N, DCI.DAG)) return Val; - if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG)) + if (SDValue Val = performVectorExtCombine(N, DCI.DAG)) return Val; if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG)) return Val; @@ -22283,7 +22367,8 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT); SmallVector<SDValue, 16> MaskConstants; - if (VecVT == MVT::v16i8) { + if 
(DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() && + VecVT == MVT::v16i8) { // v16i8 is a special case, as we have 16 entries but only 8 positional bits // per entry. We split it into two halves, apply the mask, zip the halves to // create 8x 16-bit values, and the perform the vector reduce. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 48a4ea91c278..5200b24d1388 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -70,6 +70,9 @@ enum NodeType : unsigned { COALESCER_BARRIER, + VG_SAVE, + VG_RESTORE, + SMSTART, SMSTOP, RESTORE_ZA, @@ -454,6 +457,8 @@ enum NodeType : unsigned { // SME RDSVL, REVD_MERGE_PASSTHRU, + ALLOCATE_ZA_BUFFER, + INIT_TPIDR2OBJ, // Asserts that a function argument (i32) is zero-extended to i8 by // the caller @@ -650,11 +655,14 @@ public: MachineBasicBlock *BB) const; MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg, - MachineInstr &MI, MachineBasicBlock *BB, - bool HasTile) const; + MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -1034,9 +1042,6 @@ private: bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override; - unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, - SelectionDAG &DAG) const; - SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, diff --git 
a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 1f437d0ed6f8..e1ecc5a57dd2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7602,13 +7602,12 @@ multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm, } let mayRaiseFPException = 1, Uses = [FPCR] in -multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm, - Predicate pred = HasNEON> { - let Predicates = [pred] in { +multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> { + let Predicates = [HasNEONandIsStreamingSafe] in { def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>; def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>; } - let Predicates = [pred, HasFullFP16] in { + let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>; } } @@ -7616,11 +7615,13 @@ multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm, let mayRaiseFPException = 1, Uses = [FPCR] in multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { + let Predicates = [HasNEONandIsStreamingSafe] in { def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm, [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>; def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm, [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; - let Predicates = [HasNEON, HasFullFP16] in { + } + let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm, [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>; } @@ -7880,7 +7881,7 @@ class SIMDMovAlias<string asm, string size, Instruction inst, multiclass SMov { // SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME) // streaming mode. 
- let Predicates = [HasNEONorSME] in { + let Predicates = [HasNEONandIsStreamingSafe] in { def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> { let Inst{20-16} = 0b00001; } @@ -7927,7 +7928,7 @@ multiclass SMov { multiclass UMov { // UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME) // streaming mode. - let Predicates = [HasNEONorSME] in { + let Predicates = [HasNEONandIsStreamingSafe] in { def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> { let Inst{20-16} = 0b00001; } @@ -11887,79 +11888,79 @@ multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel, // complex DAG for DstRHS. let Predicates = [HasLSE] in multiclass LDOPregister_patterns_ord_dag<string inst, string suffix, string op, - string size, dag SrcRHS, dag DstRHS> { - def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS), + ValueType vt, dag SrcRHS, dag DstRHS> { + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_monotonic") GPR64sp:$Rn, SrcRHS), (!cast<Instruction>(inst # suffix) DstRHS, GPR64sp:$Rn)>; - def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS), + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_acquire") GPR64sp:$Rn, SrcRHS), (!cast<Instruction>(inst # "A" # suffix) DstRHS, GPR64sp:$Rn)>; - def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS), + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_release") GPR64sp:$Rn, SrcRHS), (!cast<Instruction>(inst # "L" # suffix) DstRHS, GPR64sp:$Rn)>; - def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS), + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_acq_rel") GPR64sp:$Rn, SrcRHS), (!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>; - def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS), + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_seq_cst") GPR64sp:$Rn, SrcRHS), (!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>; } multiclass LDOPregister_patterns_ord<string inst, string suffix, string op, - string size, dag RHS> { - defm : 
LDOPregister_patterns_ord_dag<inst, suffix, op, size, RHS, RHS>; + ValueType vt, dag RHS> { + defm : LDOPregister_patterns_ord_dag<inst, suffix, op, vt, RHS, RHS>; } multiclass LDOPregister_patterns_ord_mod<string inst, string suffix, string op, - string size, dag LHS, dag RHS> { - defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, LHS, RHS>; + ValueType vt, dag LHS, dag RHS> { + defm : LDOPregister_patterns_ord_dag<inst, suffix, op, vt, LHS, RHS>; } multiclass LDOPregister_patterns<string inst, string op> { - defm : LDOPregister_patterns_ord<inst, "X", op, "64", (i64 GPR64:$Rm)>; - defm : LDOPregister_patterns_ord<inst, "W", op, "32", (i32 GPR32:$Rm)>; - defm : LDOPregister_patterns_ord<inst, "H", op, "16", (i32 GPR32:$Rm)>; - defm : LDOPregister_patterns_ord<inst, "B", op, "8", (i32 GPR32:$Rm)>; + defm : LDOPregister_patterns_ord<inst, "X", op, i64, (i64 GPR64:$Rm)>; + defm : LDOPregister_patterns_ord<inst, "W", op, i32, (i32 GPR32:$Rm)>; + defm : LDOPregister_patterns_ord<inst, "H", op, i16, (i32 GPR32:$Rm)>; + defm : LDOPregister_patterns_ord<inst, "B", op, i8, (i32 GPR32:$Rm)>; } multiclass LDOPregister_patterns_mod<string inst, string op, string mod> { - defm : LDOPregister_patterns_ord_mod<inst, "X", op, "64", + defm : LDOPregister_patterns_ord_mod<inst, "X", op, i64, (i64 GPR64:$Rm), (i64 (!cast<Instruction>(mod#Xrr) XZR, GPR64:$Rm))>; - defm : LDOPregister_patterns_ord_mod<inst, "W", op, "32", + defm : LDOPregister_patterns_ord_mod<inst, "W", op, i32, (i32 GPR32:$Rm), (i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>; - defm : LDOPregister_patterns_ord_mod<inst, "H", op, "16", + defm : LDOPregister_patterns_ord_mod<inst, "H", op, i16, (i32 GPR32:$Rm), (i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>; - defm : LDOPregister_patterns_ord_mod<inst, "B", op, "8", + defm : LDOPregister_patterns_ord_mod<inst, "B", op, i8, (i32 GPR32:$Rm), (i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>; } let Predicates = [HasLSE] in multiclass 
CASregister_patterns_ord_dag<string inst, string suffix, string op, - string size, dag OLD, dag NEW> { - def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW), + ValueType vt, dag OLD, dag NEW> { + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_monotonic") GPR64sp:$Rn, OLD, NEW), (!cast<Instruction>(inst # suffix) OLD, NEW, GPR64sp:$Rn)>; - def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW), + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_acquire") GPR64sp:$Rn, OLD, NEW), (!cast<Instruction>(inst # "A" # suffix) OLD, NEW, GPR64sp:$Rn)>; - def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW), + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_release") GPR64sp:$Rn, OLD, NEW), (!cast<Instruction>(inst # "L" # suffix) OLD, NEW, GPR64sp:$Rn)>; - def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW), + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_acq_rel") GPR64sp:$Rn, OLD, NEW), (!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>; - def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW), + def : Pat<(!cast<PatFrag>(op#"_"#vt#"_seq_cst") GPR64sp:$Rn, OLD, NEW), (!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>; } multiclass CASregister_patterns_ord<string inst, string suffix, string op, - string size, dag OLD, dag NEW> { - defm : CASregister_patterns_ord_dag<inst, suffix, op, size, OLD, NEW>; + ValueType vt, dag OLD, dag NEW> { + defm : CASregister_patterns_ord_dag<inst, suffix, op, vt, OLD, NEW>; } multiclass CASregister_patterns<string inst, string op> { - defm : CASregister_patterns_ord<inst, "X", op, "64", + defm : CASregister_patterns_ord<inst, "X", op, i64, (i64 GPR64:$Rold), (i64 GPR64:$Rnew)>; - defm : CASregister_patterns_ord<inst, "W", op, "32", + defm : CASregister_patterns_ord<inst, "W", op, i32, (i32 GPR32:$Rold), (i32 GPR32:$Rnew)>; - defm : CASregister_patterns_ord<inst, "H", op, "16", + defm : CASregister_patterns_ord<inst, "H", op, i16, (i32 GPR32:$Rold), (i32 
GPR32:$Rnew)>; - defm : CASregister_patterns_ord<inst, "B", op, "8", + defm : CASregister_patterns_ord<inst, "B", op, i8, (i32 GPR32:$Rold), (i32 GPR32:$Rnew)>; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index 58ca52f37b63..2d2b2bee99ec 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -346,16 +346,16 @@ let Predicates = [HasNEON] in { } let Predicates = [HasNoLSE] in { -def : Pat<(atomic_cmp_swap_8 GPR64:$addr, GPR32:$desired, GPR32:$new), +def : Pat<(atomic_cmp_swap_i8 GPR64:$addr, GPR32:$desired, GPR32:$new), (CMP_SWAP_8 GPR64:$addr, GPR32:$desired, GPR32:$new)>; -def : Pat<(atomic_cmp_swap_16 GPR64:$addr, GPR32:$desired, GPR32:$new), +def : Pat<(atomic_cmp_swap_i16 GPR64:$addr, GPR32:$desired, GPR32:$new), (CMP_SWAP_16 GPR64:$addr, GPR32:$desired, GPR32:$new)>; -def : Pat<(atomic_cmp_swap_32 GPR64:$addr, GPR32:$desired, GPR32:$new), +def : Pat<(atomic_cmp_swap_i32 GPR64:$addr, GPR32:$desired, GPR32:$new), (CMP_SWAP_32 GPR64:$addr, GPR32:$desired, GPR32:$new)>; -def : Pat<(atomic_cmp_swap_64 GPR64:$addr, GPR64:$desired, GPR64:$new), +def : Pat<(atomic_cmp_swap_i64 GPR64:$addr, GPR64:$desired, GPR64:$new), (CMP_SWAP_64 GPR64:$addr, GPR64:$desired, GPR64:$new)>; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 7d540efe2b41..ee397db3fba6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -17,6 +17,7 @@ #include "AArch64PointerAuth.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -1354,48 +1355,52 @@ static bool areCFlagsAccessedBetweenInstrs( return false; } -/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating -/// 
operation which could set the flags in an identical manner -bool AArch64InstrInfo::optimizePTestInstr( - MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, - const MachineRegisterInfo *MRI) const { - auto *Mask = MRI->getUniqueVRegDef(MaskReg); - auto *Pred = MRI->getUniqueVRegDef(PredReg); - auto NewOp = Pred->getOpcode(); - bool OpChanged = false; - +std::optional<unsigned> +AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask, + MachineInstr *Pred, + const MachineRegisterInfo *MRI) const { unsigned MaskOpcode = Mask->getOpcode(); unsigned PredOpcode = Pred->getOpcode(); bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode); bool PredIsWhileLike = isWhileOpcode(PredOpcode); - if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) && - getElementSizeForOpcode(MaskOpcode) == - getElementSizeForOpcode(PredOpcode) && - Mask->getOperand(1).getImm() == 31) { + if (PredIsWhileLike) { + // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc + // instruction and the condition is "any" since WHILcc does an implicit + // PTEST(ALL, PG) check and PG is always a subset of ALL. + if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY) + return PredOpcode; + // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is // redundant since WHILE performs an implicit PTEST with an all active - // mask. Must be an all active predicate of matching element size. + // mask. + if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 && + getElementSizeForOpcode(MaskOpcode) == + getElementSizeForOpcode(PredOpcode)) + return PredOpcode; + + return {}; + } + + if (PredIsPTestLike) { + // For PTEST(PG, PG), PTEST is redundant when PG is the result of an + // instruction that sets the flags as PTEST would and the condition is + // "any" since PG is always a subset of the governing predicate of the + // ptest-like instruction. 
+ if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY) + return PredOpcode; // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the - // PTEST_LIKE instruction uses the same all active mask and the element - // size matches. If the PTEST has a condition of any then it is always - // redundant. - if (PredIsPTestLike) { + // the element size matches and either the PTEST_LIKE instruction uses + // the same all active mask or the condition is "any". + if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 && + getElementSizeForOpcode(MaskOpcode) == + getElementSizeForOpcode(PredOpcode)) { auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); - if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY) - return false; + if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY) + return PredOpcode; } - // Fallthough to simply remove the PTEST. - } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) && - PTest->getOpcode() == AArch64::PTEST_PP_ANY) { - // For PTEST(PG, PG), PTEST is redundant when PG is the result of an - // instruction that sets the flags as PTEST would. This is only valid when - // the condition is any. - - // Fallthough to simply remove the PTEST. - } else if (PredIsPTestLike) { // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the // flags are set based on the same mask 'PG', but PTEST_LIKE must operate // on 8-bit predicates like the PTEST. Otherwise, for instructions like @@ -1420,56 +1425,67 @@ bool AArch64InstrInfo::optimizePTestInstr( // identical regardless of element size. 
auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode); - if ((Mask != PTestLikeMask) || - (PredElementSize != AArch64::ElementSizeB && - PTest->getOpcode() != AArch64::PTEST_PP_ANY)) - return false; + if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB || + PTest->getOpcode() == AArch64::PTEST_PP_ANY)) + return PredOpcode; - // Fallthough to simply remove the PTEST. - } else { - // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the - // opcode so the PTEST becomes redundant. - switch (PredOpcode) { - case AArch64::AND_PPzPP: - case AArch64::BIC_PPzPP: - case AArch64::EOR_PPzPP: - case AArch64::NAND_PPzPP: - case AArch64::NOR_PPzPP: - case AArch64::ORN_PPzPP: - case AArch64::ORR_PPzPP: - case AArch64::BRKA_PPzP: - case AArch64::BRKPA_PPzPP: - case AArch64::BRKB_PPzP: - case AArch64::BRKPB_PPzPP: - case AArch64::RDFFR_PPz: { - // Check to see if our mask is the same. If not the resulting flag bits - // may be different and we can't remove the ptest. - auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); - if (Mask != PredMask) - return false; - break; - } - case AArch64::BRKN_PPzP: { - // BRKN uses an all active implicit mask to set flags unlike the other - // flag-setting instructions. - // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B). - if ((MaskOpcode != AArch64::PTRUE_B) || - (Mask->getOperand(1).getImm() != 31)) - return false; - break; - } - case AArch64::PTRUE_B: - // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A) - break; - default: - // Bail out if we don't recognize the input - return false; - } + return {}; + } - NewOp = convertToFlagSettingOpc(PredOpcode); - OpChanged = true; + // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the + // opcode so the PTEST becomes redundant. 
+ switch (PredOpcode) { + case AArch64::AND_PPzPP: + case AArch64::BIC_PPzPP: + case AArch64::EOR_PPzPP: + case AArch64::NAND_PPzPP: + case AArch64::NOR_PPzPP: + case AArch64::ORN_PPzPP: + case AArch64::ORR_PPzPP: + case AArch64::BRKA_PPzP: + case AArch64::BRKPA_PPzPP: + case AArch64::BRKB_PPzP: + case AArch64::BRKPB_PPzPP: + case AArch64::RDFFR_PPz: { + // Check to see if our mask is the same. If not the resulting flag bits + // may be different and we can't remove the ptest. + auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); + if (Mask != PredMask) + return {}; + break; + } + case AArch64::BRKN_PPzP: { + // BRKN uses an all active implicit mask to set flags unlike the other + // flag-setting instructions. + // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B). + if ((MaskOpcode != AArch64::PTRUE_B) || + (Mask->getOperand(1).getImm() != 31)) + return {}; + break; + } + case AArch64::PTRUE_B: + // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A) + break; + default: + // Bail out if we don't recognize the input + return {}; } + return convertToFlagSettingOpc(PredOpcode); +} + +/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating +/// operation which could set the flags in an identical manner +bool AArch64InstrInfo::optimizePTestInstr( + MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, + const MachineRegisterInfo *MRI) const { + auto *Mask = MRI->getUniqueVRegDef(MaskReg); + auto *Pred = MRI->getUniqueVRegDef(PredReg); + unsigned PredOpcode = Pred->getOpcode(); + auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI); + if (!NewOp) + return false; + const TargetRegisterInfo *TRI = &getRegisterInfo(); // If another instruction between Pred and PTest accesses flags, don't remove @@ -1481,9 +1497,9 @@ bool AArch64InstrInfo::optimizePTestInstr( // as they are prior to PTEST. Sometimes this requires the tested PTEST // operand to be replaced with an equivalent instruction that also sets the // flags. 
- Pred->setDesc(get(NewOp)); PTest->eraseFromParent(); - if (OpChanged) { + if (*NewOp != PredOpcode) { + Pred->setDesc(get(*NewOp)); bool succeeded = UpdateOperandRegClass(*Pred); (void)succeeded; assert(succeeded && "Operands have incompatible register classes!"); @@ -4481,7 +4497,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Predicate register by ORRing with itself. if (AArch64::PPRRegClass.contains(DestReg) && AArch64::PPRRegClass.contains(SrcReg)) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) .addReg(SrcReg) // Pg .addReg(SrcReg) @@ -4494,8 +4511,6 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg); bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg); if (DestIsPNR || SrcIsPNR) { - assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && - "Unexpected predicate-as-counter register."); auto ToPPR = [](MCRegister R) -> MCRegister { return (R - AArch64::PN0) + AArch64::P0; }; @@ -4516,7 +4531,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register by ORRing with itself. 
if (AArch64::ZPRRegClass.contains(DestReg) && AArch64::ZPRRegClass.contains(SrcReg)) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -4528,7 +4544,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) && (AArch64::ZPR2RegClass.contains(SrcReg) || AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, Indices); @@ -4538,7 +4555,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register triple by copying the individual sub-registers. 
if (AArch64::ZPR3RegClass.contains(DestReg) && AArch64::ZPR3RegClass.contains(SrcReg)) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -4551,7 +4569,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) && (AArch64::ZPR4RegClass.contains(SrcReg) || AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2, AArch64::zsub3}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -4656,7 +4675,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR128RegClass.contains(DestReg) && AArch64::FPR128RegClass.contains(SrcReg)) { - if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable()) + if (Subtarget.isSVEorStreamingSVEAvailable() && + !Subtarget.isNeonAvailable()) BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) @@ -4814,14 +4834,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Opc = AArch64::STRBui; break; case 2: { - bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC); if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::STRHui; - else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + else if (AArch64::PNRRegClass.hasSubClassEq(RC) || + AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store 
instructions"); - assert((!IsPNR || Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && - "Unexpected register store without SVE2p1 or SME2"); Opc = AArch64::STR_PXI; StackID = TargetStackID::ScalableVector; } @@ -4870,7 +4888,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, AArch64::sube64, AArch64::subo64, FI, MMO); return; } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZXI; StackID = TargetStackID::ScalableVector; @@ -4894,7 +4912,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Offset = false; } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZXI; StackID = TargetStackID::ScalableVector; @@ -4906,7 +4924,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Opc = AArch64::ST1Threev2d; Offset = false; } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZZXI; StackID = TargetStackID::ScalableVector; @@ -4919,7 +4937,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Offset = false; } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZZZXI; StackID = TargetStackID::ScalableVector; @@ -4992,10 +5010,8 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if 
(AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRHui; else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); - assert((!IsPNR || Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && - "Unexpected register load without SVE2p1 or SME2"); if (IsPNR) PNRReg = DestReg; Opc = AArch64::LDR_PXI; @@ -5046,7 +5062,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, AArch64::subo64, FI, MMO); return; } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZXI; StackID = TargetStackID::ScalableVector; @@ -5070,7 +5086,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Offset = false; } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZXI; StackID = TargetStackID::ScalableVector; @@ -5082,7 +5098,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Opc = AArch64::LD1Threev2d; Offset = false; } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZZXI; StackID = TargetStackID::ScalableVector; @@ -5095,7 +5111,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Offset = false; } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && 
"Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZZZXI; StackID = TargetStackID::ScalableVector; @@ -8555,6 +8571,8 @@ AArch64InstrInfo::getOutliningCandidateInfo( NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { RepeatedSequenceLocs = CandidatesWithoutStackFixups; FrameID = MachineOutlinerNoLRSave; + if (RepeatedSequenceLocs.size() < 2) + return std::nullopt; } else { SetCandidateCallInfo(MachineOutlinerDefault, 12); @@ -8700,6 +8718,13 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( if (!AFI || AFI->hasRedZone().value_or(true)) return false; + // FIXME: Determine whether it is safe to outline from functions which contain + // streaming-mode changes. We may need to ensure any smstart/smstop pairs are + // outlined together and ensure it is safe to outline with async unwind info, + // required for saving & restoring VG around calls. + if (AFI->hasStreamingModeChanges()) + return false; + // FIXME: Teach the outliner to generate/handle Windows unwind info. 
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) return false; @@ -9582,18 +9607,49 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, namespace { class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { - MachineInstr *PredBranch; + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo &MRI; + + /// The block of the loop + MachineBasicBlock *LoopBB; + /// The conditional branch of the loop + MachineInstr *CondBranch; + /// The compare instruction for loop control + MachineInstr *Comp; + /// The number of the operand of the loop counter value in Comp + unsigned CompCounterOprNum; + /// The instruction that updates the loop counter value + MachineInstr *Update; + /// The number of the operand of the loop counter value in Update + unsigned UpdateCounterOprNum; + /// The initial value of the loop counter + Register Init; + /// True iff Update is a predecessor of Comp + bool IsUpdatePriorComp; + + /// The normalized condition used by createTripCountGreaterCondition() SmallVector<MachineOperand, 4> Cond; public: - AArch64PipelinerLoopInfo(MachineInstr *PredBranch, + AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch, + MachineInstr *Comp, unsigned CompCounterOprNum, + MachineInstr *Update, unsigned UpdateCounterOprNum, + Register Init, bool IsUpdatePriorComp, const SmallVectorImpl<MachineOperand> &Cond) - : PredBranch(PredBranch), Cond(Cond.begin(), Cond.end()) {} + : MF(Comp->getParent()->getParent()), + TII(MF->getSubtarget().getInstrInfo()), + TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()), + LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp), + CompCounterOprNum(CompCounterOprNum), Update(Update), + UpdateCounterOprNum(UpdateCounterOprNum), Init(Init), + IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {} bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { // Make the instructions for 
loop control be placed in stage 0. - // The predecessors of PredBranch are considered by the caller. - return MI == PredBranch; + // The predecessors of Comp are considered by the caller. + return MI == Comp; } std::optional<bool> createTripCountGreaterCondition( @@ -9606,31 +9662,277 @@ public: return {}; } + void createRemainingIterationsGreaterCondition( + int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond, + DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override; + void setPreheader(MachineBasicBlock *NewPreheader) override {} void adjustTripCount(int TripCountAdjust) override {} void disposed() override {} + bool isMVEExpanderSupported() override { return true; } }; } // namespace -static bool isCompareAndBranch(unsigned Opcode) { - switch (Opcode) { - case AArch64::CBZW: - case AArch64::CBZX: - case AArch64::CBNZW: - case AArch64::CBNZX: - case AArch64::TBZW: - case AArch64::TBZX: - case AArch64::TBNZW: - case AArch64::TBNZX: - return true; +/// Clone an instruction from MI. The register of ReplaceOprNum-th operand +/// is replaced by ReplaceReg. The output register is newly created. +/// The other operands are unchanged from MI. 
+static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, + Register ReplaceReg, MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertTo) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI); + Register Result = 0; + for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) { + if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) { + Result = MRI.createVirtualRegister( + MRI.getRegClass(NewMI->getOperand(0).getReg())); + NewMI->getOperand(I).setReg(Result); + } else if (I == ReplaceOprNum) { + MRI.constrainRegClass( + ReplaceReg, + TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent())); + NewMI->getOperand(I).setReg(ReplaceReg); + } } - return false; + MBB.insert(InsertTo, NewMI); + return Result; +} + +void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition( + int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond, + DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) { + // Create and accumulate conditions for next TC iterations. + // Example: + // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last + // # iteration of the kernel + // + // # insert the following instructions + // cond = CSINCXr 0, 0, C, implicit $nzcv + // counter = ADDXri counter, 1 # clone from this->Update + // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp + // cond = CSINCXr cond, cond, C, implicit $nzcv + // ... 
(repeat TC times) + // SUBSXri cond, 0, implicit-def $nzcv + + assert(CondBranch->getOpcode() == AArch64::Bcc); + // CondCode to exit the loop + AArch64CC::CondCode CC = + (AArch64CC::CondCode)CondBranch->getOperand(0).getImm(); + if (CondBranch->getOperand(1).getMBB() == LoopBB) + CC = AArch64CC::getInvertedCondCode(CC); + + // Accumulate conditions to exit the loop + Register AccCond = AArch64::XZR; + + // If CC holds, CurCond+1 is returned; otherwise CurCond is returned. + auto AccumulateCond = [&](Register CurCond, + AArch64CC::CondCode CC) -> Register { + Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr)) + .addReg(NewCond, RegState::Define) + .addReg(CurCond) + .addReg(CurCond) + .addImm(AArch64CC::getInvertedCondCode(CC)); + return NewCond; + }; + + if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) { + // Update and Comp for I==0 are already exists in MBB + // (MBB is an unrolled kernel) + Register Counter; + for (int I = 0; I <= TC; ++I) { + Register NextCounter; + if (I != 0) + NextCounter = + cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end()); + + AccCond = AccumulateCond(AccCond, CC); + + if (I != TC) { + if (I == 0) { + if (Update != Comp && IsUpdatePriorComp) { + Counter = + LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg(); + NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, + MBB.end()); + } else { + // can use already calculated value + NextCounter = LastStage0Insts[Update]->getOperand(0).getReg(); + } + } else if (Update != Comp) { + NextCounter = + cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end()); + } + } + Counter = NextCounter; + } + } else { + Register Counter; + if (LastStage0Insts.empty()) { + // use initial counter value (testing if the trip count is sufficient to + // be executed by pipelined code) + Counter = Init; + if (IsUpdatePriorComp) + Counter = + 
cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end()); + } else { + // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block. + Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg(); + } + + for (int I = 0; I <= TC; ++I) { + Register NextCounter; + NextCounter = + cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end()); + AccCond = AccumulateCond(AccCond, CC); + if (I != TC && Update != Comp) + NextCounter = + cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end()); + Counter = NextCounter; + } + } + + // If AccCond == 0, the remainder is greater than TC. + BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri)) + .addReg(AArch64::XZR, RegState::Define | RegState::Dead) + .addReg(AccCond) + .addImm(0) + .addImm(0); + Cond.clear(); + Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ)); +} + +static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, + Register &RegMBB, Register &RegOther) { + assert(Phi.getNumOperands() == 5); + if (Phi.getOperand(2).getMBB() == MBB) { + RegMBB = Phi.getOperand(1).getReg(); + RegOther = Phi.getOperand(3).getReg(); + } else { + assert(Phi.getOperand(4).getMBB() == MBB); + RegMBB = Phi.getOperand(3).getReg(); + RegOther = Phi.getOperand(1).getReg(); + } +} + +static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) { + if (!Reg.isVirtual()) + return false; + const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + return MRI.getVRegDef(Reg)->getParent() != BB; +} + +/// If Reg is an induction variable, return true and set some parameters +static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, + MachineInstr *&UpdateInst, + unsigned &UpdateCounterOprNum, Register &InitReg, + bool &IsUpdatePriorComp) { + // Example: + // + // Preheader: + // InitReg = ... + // LoopBB: + // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB) + // Reg = COPY Reg0 ; COPY is ignored. 
+ // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value. + // ; Reg is the value calculated in the previous + // ; iteration, so IsUpdatePriorComp == false. + + if (LoopBB->pred_size() != 2) + return false; + if (!Reg.isVirtual()) + return false; + const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo(); + UpdateInst = nullptr; + UpdateCounterOprNum = 0; + InitReg = 0; + IsUpdatePriorComp = true; + Register CurReg = Reg; + while (true) { + MachineInstr *Def = MRI.getVRegDef(CurReg); + if (Def->getParent() != LoopBB) + return false; + if (Def->isCopy()) { + // Ignore copy instructions unless they contain subregisters + if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg()) + return false; + CurReg = Def->getOperand(1).getReg(); + } else if (Def->isPHI()) { + if (InitReg != 0) + return false; + if (!UpdateInst) + IsUpdatePriorComp = false; + extractPhiReg(*Def, LoopBB, CurReg, InitReg); + } else { + if (UpdateInst) + return false; + switch (Def->getOpcode()) { + case AArch64::ADDSXri: + case AArch64::ADDSWri: + case AArch64::SUBSXri: + case AArch64::SUBSWri: + case AArch64::ADDXri: + case AArch64::ADDWri: + case AArch64::SUBXri: + case AArch64::SUBWri: + UpdateInst = Def; + UpdateCounterOprNum = 1; + break; + case AArch64::ADDSXrr: + case AArch64::ADDSWrr: + case AArch64::SUBSXrr: + case AArch64::SUBSWrr: + case AArch64::ADDXrr: + case AArch64::ADDWrr: + case AArch64::SUBXrr: + case AArch64::SUBWrr: + UpdateInst = Def; + if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB)) + UpdateCounterOprNum = 1; + else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB)) + UpdateCounterOprNum = 2; + else + return false; + break; + default: + return false; + } + CurReg = Def->getOperand(UpdateCounterOprNum).getReg(); + } + + if (!CurReg.isVirtual()) + return false; + if (Reg == CurReg) + break; + } + + if (!UpdateInst) + return false; + + return true; } std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> 
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + // Accept loops that meet the following conditions + // * The conditional branch is BCC + // * The compare instruction is ADDS/SUBS/WHILEXX + // * One operand of the compare is an induction variable and the other is a + // loop invariant value + // * The induction variable is incremented/decremented by a single instruction + // * Does not contain CALL or instructions which have unmodeled side effects + + for (MachineInstr &MI : *LoopBB) + if (MI.isCall() || MI.hasUnmodeledSideEffects()) + // This instruction may use NZCV, which interferes with the instruction to + // be inserted for loop control. + return nullptr; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; if (analyzeBranch(*LoopBB, TBB, FBB, Cond)) @@ -9641,48 +9943,76 @@ AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { return nullptr; // Must be conditional branch - if (FBB == nullptr) + if (TBB != LoopBB && FBB == nullptr) return nullptr; assert((TBB == LoopBB || FBB == LoopBB) && "The Loop must be a single-basic-block loop"); + MachineInstr *CondBranch = &*LoopBB->getFirstTerminator(); + const TargetRegisterInfo &TRI = getRegisterInfo(); + + if (CondBranch->getOpcode() != AArch64::Bcc) + return nullptr; + // Normalization for createTripCountGreaterCondition() if (TBB == LoopBB) reverseBranchCondition(Cond); - MachineInstr *CondBranch = &*LoopBB->getFirstTerminator(); - const TargetRegisterInfo &TRI = getRegisterInfo(); - - // Find the immediate predecessor of the conditional branch - MachineInstr *PredBranch = nullptr; - if (CondBranch->getOpcode() == AArch64::Bcc) { - for (MachineInstr &MI : reverse(*LoopBB)) { - if (MI.modifiesRegister(AArch64::NZCV, &TRI)) { - PredBranch = &MI; + MachineInstr *Comp = nullptr; + unsigned CompCounterOprNum = 0; + for (MachineInstr &MI : reverse(*LoopBB)) { + if (MI.modifiesRegister(AArch64::NZCV, &TRI)) { + // Guarantee that 
the compare is SUBS/ADDS/WHILEXX and that one of the + // operands is a loop invariant value + + switch (MI.getOpcode()) { + case AArch64::SUBSXri: + case AArch64::SUBSWri: + case AArch64::ADDSXri: + case AArch64::ADDSWri: + Comp = &MI; + CompCounterOprNum = 1; break; + case AArch64::ADDSWrr: + case AArch64::ADDSXrr: + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + Comp = &MI; + break; + default: + if (isWhileOpcode(MI.getOpcode())) { + Comp = &MI; + break; + } + return nullptr; } - } - if (!PredBranch) - return nullptr; - } else if (isCompareAndBranch(CondBranch->getOpcode())) { - const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo(); - Register Reg = CondBranch->getOperand(0).getReg(); - if (!Reg.isVirtual()) - return nullptr; - PredBranch = MRI.getVRegDef(Reg); - // MachinePipeliner does not expect that the immediate predecessor is a Phi - if (PredBranch->isPHI()) - return nullptr; + if (CompCounterOprNum == 0) { + if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB)) + CompCounterOprNum = 2; + else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB)) + CompCounterOprNum = 1; + else + return nullptr; + } + break; + } + } + if (!Comp) + return nullptr; - if (PredBranch->getParent() != LoopBB) - return nullptr; - } else { + MachineInstr *Update = nullptr; + Register Init; + bool IsUpdatePriorComp; + unsigned UpdateCounterOprNum; + if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB, + Update, UpdateCounterOprNum, Init, IsUpdatePriorComp)) return nullptr; - } - return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond); + return std::make_unique<AArch64PipelinerLoopInfo>( + LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum, + Init, IsUpdatePriorComp, Cond); } #define GET_INSTRINFO_HELPERS diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index f434799c3982..792e0c3063b1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -572,6 +572,9 @@ private: bool optimizePTestInstr(MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, const MachineRegisterInfo *MRI) const; + std::optional<unsigned> + canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask, + MachineInstr *Pred, const MachineRegisterInfo *MRI) const; }; struct UsedNZCV { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 91e5bc3caa10..f3aac3b46d17 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -107,7 +107,7 @@ def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPC_IMMO()">, def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">; -def HasNEON : Predicate<"Subtarget->hasNEON()">, +def HasNEON : Predicate<"Subtarget->isNeonAvailable()">, AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">; def HasSM4 : Predicate<"Subtarget->hasSM4()">, AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">; @@ -141,35 +141,41 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">, def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, AssemblerPredicateWithAll<(all_of FeatureFuseAES), "fuse-aes">; -def HasSVE : Predicate<"Subtarget->hasSVE()">, +def HasSVE : Predicate<"Subtarget->isSVEAvailable()">, AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">; -def HasSVE2 : Predicate<"Subtarget->hasSVE2()">, +def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2()">, AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">; -def HasSVE2p1 : Predicate<"Subtarget->hasSVE2p1()">, +def HasSVE2p1 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">, AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">; -def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">, +def HasSVE2AES : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2AES()">, AssemblerPredicateWithAll<(all_of 
FeatureSVE2AES), "sve2-aes">; -def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, +def HasSVE2SM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SM4()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">; -def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, +def HasSVE2SHA3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">; -def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, +def HasSVE2BitPerm : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2BitPerm()">, AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; def HasB16B16 : Predicate<"Subtarget->hasB16B16()">, AssemblerPredicateWithAll<(all_of FeatureB16B16), "b16b16">; -def HasSME : Predicate<"Subtarget->hasSME()">, +def HasSMEandIsNonStreamingSafe + : Predicate<"Subtarget->hasSME()">, AssemblerPredicateWithAll<(all_of FeatureSME), "sme">; -def HasSMEF64F64 : Predicate<"Subtarget->hasSMEF64F64()">, +def HasSME : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME()">, + AssemblerPredicateWithAll<(all_of FeatureSME), "sme">; +def HasSMEF64F64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF64F64()">, AssemblerPredicateWithAll<(all_of FeatureSMEF64F64), "sme-f64f64">; -def HasSMEF16F16 : Predicate<"Subtarget->hasSMEF16F16()">, +def HasSMEF16F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF16F16()">, AssemblerPredicateWithAll<(all_of FeatureSMEF16F16), "sme-f16f16">; -def HasSMEFA64 : Predicate<"Subtarget->hasSMEFA64()">, +def HasSMEFA64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEFA64()">, AssemblerPredicateWithAll<(all_of FeatureSMEFA64), "sme-fa64">; -def HasSMEI16I64 : Predicate<"Subtarget->hasSMEI16I64()">, +def HasSMEI16I64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEI16I64()">, AssemblerPredicateWithAll<(all_of FeatureSMEI16I64), "sme-i16i64">; -def HasSME2 : 
Predicate<"Subtarget->hasSME2()">, +def HasSME2andIsNonStreamingSafe + : Predicate<"Subtarget->hasSME2()">, + AssemblerPredicateWithAll<(all_of FeatureSME2), "sme2">; +def HasSME2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2()">, AssemblerPredicateWithAll<(all_of FeatureSME2), "sme2">; -def HasSME2p1 : Predicate<"Subtarget->hasSME2p1()">, +def HasSME2p1 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p1()">, AssemblerPredicateWithAll<(all_of FeatureSME2p1), "sme2p1">; def HasFP8 : Predicate<"Subtarget->hasFP8()">, AssemblerPredicateWithAll<(all_of FeatureFP8), "fp8">; @@ -198,48 +204,47 @@ def HasSSVE_FP8DOT4 : Predicate<"Subtarget->hasSSVE_FP8DOT4() || " "ssve-fp8dot4 or (sve2 and fp8dot4)">; def HasLUT : Predicate<"Subtarget->hasLUT()">, AssemblerPredicateWithAll<(all_of FeatureLUT), "lut">; -def HasSME_LUTv2 : Predicate<"Subtarget->hasSME_LUTv2()">, +def HasSME_LUTv2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">, AssemblerPredicateWithAll<(all_of FeatureSME_LUTv2), "sme-lutv2">; -def HasSMEF8F16 : Predicate<"Subtarget->hasSMEF8F16()">, +def HasSMEF8F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">, AssemblerPredicateWithAll<(all_of FeatureSMEF8F16), "sme-f8f16">; -def HasSMEF8F32 : Predicate<"Subtarget->hasSMEF8F32()">, +def HasSMEF8F32 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">, AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">; // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. 
def HasSVEorSME - : Predicate<"Subtarget->hasSVEorSME()">, + : Predicate<"Subtarget->hasSVE() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME), "sve or sme">; def HasSVE2orSME - : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME()">, + : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME), "sve2 or sme">; def HasSVE2orSME2 - : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME2()">, + : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME2), "sve2 or sme2">; def HasSVE2p1_or_HasSME - : Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME()">, + : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">; def HasSVE2p1_or_HasSME2 - : Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME2()">, + : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2())">, AssemblerPredicateWithAll<(any_of FeatureSME2, FeatureSVE2p1), "sme2 or sve2p1">; def HasSVE2p1_or_HasSME2p1 - : Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()">, + : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2p1())">, AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">; def HasSMEF16F16orSMEF8F16 - : Predicate<"Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16()">, + : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">, AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16), "sme-f16f16 or sme-f8f16">; // A subset of NEON instructions are legal in Streaming SVE execution mode, -// they should be enabled if either has been specified. 
-def HasNEONorSME - : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">, - AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME), - "neon or sme">; +// so don't need the additional check for 'isNeonAvailable'. +def HasNEONandIsStreamingSafe + : Predicate<"Subtarget->hasNEON()">, + AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, @@ -323,8 +328,6 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; -def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">; - def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; @@ -1350,7 +1353,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot VectorIndexS:$idx)>; } -let Predicates = [HasNEONorSME, HasBF16] in { +let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in { def BFCVT : BF16ToSinglePrecision<"bfcvt">; // Round FP32 to BF16. 
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>; @@ -5789,9 +5792,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>; -defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>; -defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONandIsStreamingSafe>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -5820,7 +5823,7 @@ let Predicates = [HasRDM] in { defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64", int_aarch64_neon_fmulx, - [HasNEONorSME]>; + [HasNEONandIsStreamingSafe]>; let Predicates = [HasNEON] in { def : InstAlias<"cmls $dst, $src1, $src2", @@ -5894,9 +5897,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorSME>; -defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorSME>; -defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorSME>; +defm FRECPE : SIMDFPTwoScalar< 0, 
1, 0b11101, "frecpe">; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; @@ -5915,7 +5918,7 @@ def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))), (CMLTv1i64rz V64:$Rn)>; // Round FP64 to BF16. -let Predicates = [HasNEONorSME, HasBF16] in +let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))), (BFCVT (FCVTXNv1i64 $Rn))>; @@ -6016,7 +6019,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), // Some float -> int -> float conversion patterns for which we want to keep the // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. -let Predicates = [HasNEON] in { +let Predicates = [HasNEONandIsStreamingSafe] in { def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))), (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))), @@ -6026,7 +6029,7 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))), def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))), (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; -let Predicates = [HasFullFP16] in { +let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))), (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), @@ -6118,7 +6121,7 @@ def : Pat <(f64 (uint_to_fp (i32 (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; // 64-bits -> double are handled in target specific dag combine: // performIntToFpCombine. 
-} // let Predicates = [HasNEON] +} // let Predicates = [HasNEONandIsStreamingSafe] //===----------------------------------------------------------------------===// // Advanced SIMD three different-sized vector instructions. @@ -8379,7 +8382,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH // Same as above, but the first element is populated using // scalar_to_vector + insert_subvector instead of insert_vector_elt. -let Predicates = [IsNeonAvailable] in { +let Predicates = [HasNEON] in { class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy, SDPatternOperator ExtLoad, Instruction LD1> : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))), diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index c3d64f5a0a96..957d7bc79b18 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -196,12 +196,14 @@ bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo( const MachineFunction &MF) const { if (!NeedsAsyncDwarfUnwindInfo) { const Function &F = MF.getFunction(); + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // The check got "minsize" is because epilogue unwind info is not emitted // (yet) for homogeneous epilogues, outlined functions, and functions // outlined from. 
- NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) && - F.getUWTableKind() == UWTableKind::Async && - !F.hasMinSize(); + NeedsAsyncDwarfUnwindInfo = + needsDwarfUnwindInfo(MF) && + ((F.getUWTableKind() == UWTableKind::Async && !F.hasMinSize()) || + AFI->hasStreamingModeChanges()); } return *NeedsAsyncDwarfUnwindInfo; } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index df09fc5592ed..001521d1101e 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H +#include "AArch64Subtarget.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -35,6 +36,11 @@ struct AArch64FunctionInfo; class AArch64Subtarget; class MachineInstr; +struct TPIDR2Object { + int FrameIndex = std::numeric_limits<int>::max(); + unsigned Uses = 0; +}; + /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { @@ -195,7 +201,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool IsSVECC = false; /// The frame-index for the TPIDR2 object used for lazy saves. - Register LazySaveTPIDR2Obj = 0; + TPIDR2Object TPIDR2; /// Whether this function changes streaming mode within the function. bool HasStreamingModeChanges = false; @@ -216,6 +222,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // The PTRUE is used for the LD/ST of ZReg pairs in save and restore. unsigned PredicateRegForFillSpill = 0; + // The stack slots where VG values are stored to. 
+ int64_t VGIdx = std::numeric_limits<int>::max(); + int64_t StreamingVGIdx = std::numeric_limits<int>::max(); + public: AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); @@ -234,11 +244,16 @@ public: Register getPStateSMReg() const { return PStateSMReg; }; void setPStateSMReg(Register Reg) { PStateSMReg = Reg; }; + int64_t getVGIdx() const { return VGIdx; }; + void setVGIdx(unsigned Idx) { VGIdx = Idx; }; + + int64_t getStreamingVGIdx() const { return StreamingVGIdx; }; + void setStreamingVGIdx(unsigned FrameIdx) { StreamingVGIdx = FrameIdx; }; + bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; - unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; } - void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; } + TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index a759efcd9441..53b46ff42b72 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -189,6 +189,17 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720", FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; +def TuneA725 : SubtargetFeature<"cortex-a725", "ARMProcFamily", + "CortexA725", + "Cortex-A725 ARM processors", [ + FeatureFuseAES, + FeaturePostRAScheduler, + FeatureCmpBccFusion, + FeatureALULSLFast, + FeatureFuseAdrpAdd, + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; + def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily", "CortexR82", "Cortex-R82 ARM processors", [ @@ -238,6 +249,15 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; +def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", + "CortexX925", "Cortex-X925 ARM 
processors",[ + FeatureALULSLFast, + FeatureFuseAdrpAdd, + FeatureFuseAES, + FeaturePostRAScheduler, + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; + def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", "Fujitsu A64FX processors", [ FeaturePostRAScheduler, @@ -378,6 +398,22 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureZCRegMove, FeatureZCZeroing]>; +def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", + "Apple M4", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureDisableLatencySchedHeuristic, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseArithmeticLogic, + FeatureFuseCCSelect, + FeatureFuseCryptoEOR, + FeatureFuseLiterals, + FeatureZCRegMove, + FeatureZCZeroing + ]>; + def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M3 processors", [FeatureExynosCheapAsMoveHandling, @@ -620,7 +656,8 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B", def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily", "Oryon", "Nuvia Inc Oryon processors", [ - FeatureCrypto, + FeatureSHA2, + FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureFuseAES, @@ -640,187 +677,240 @@ def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily", HasV8_6aOps]>; def ProcessorFeatures { - list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, + list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon]; - list<SubtargetFeature> A55 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> A55 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureRCPC, FeaturePerfMon]; list<SubtargetFeature> A510 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, FeatureMTE, FeatureETE, 
FeatureSVE2BitPerm, - FeatureFP16FML]; + FeatureFP16FML, + FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2]; list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVE2BitPerm, - FeatureFP16FML]; + FeatureFP16FML, + FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, + FeatureSVE, FeatureSVE2]; list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVE2BitPerm, - FeatureFP16FML]; - list<SubtargetFeature> A65 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureFP16FML, + FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, + FeatureSVE, FeatureSVE2]; + list<SubtargetFeature> A65 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureRCPC, FeatureSSBS, FeatureRAS, FeaturePerfMon]; - list<SubtargetFeature> A76 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> A76 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureRCPC, FeatureSSBS, FeaturePerfMon]; - list<SubtargetFeature> A77 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> A77 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureRCPC, FeaturePerfMon, FeatureSSBS]; - list<SubtargetFeature> A78 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> A78 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureRCPC, FeaturePerfMon, FeatureSPE, FeatureSSBS]; - list<SubtargetFeature> A78AE = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> A78AE = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureRCPC, FeaturePerfMon, FeatureSPE, FeatureSSBS]; - list<SubtargetFeature> A78C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + 
list<SubtargetFeature> A78C = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureFlagM, FeaturePAuth, FeaturePerfMon, FeatureRCPC, FeatureSPE, FeatureSSBS]; list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureETE, FeatureMTE, FeatureFP16FML, - FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8]; + FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8, + FeaturePAuth, FeatureFlagM, FeatureSB, FeatureSVE, FeatureSVE2]; list<SubtargetFeature> A715 = [HasV9_0aOps, FeatureNEON, FeatureMTE, FeatureFP16FML, FeatureSVE, FeatureTRBE, FeatureSVE2BitPerm, FeatureBF16, FeatureETE, - FeaturePerfMon, FeatureMatMulInt8, FeatureSPE]; + FeaturePerfMon, FeatureMatMulInt8, FeatureSPE, + FeatureSB, FeatureSSBS, FeatureFullFP16, FeaturePAuth, FeaturePredRes, FeatureFlagM, + FeatureSVE2]; list<SubtargetFeature> A720 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureTRBE, FeatureSVE2BitPerm, FeatureETE, - FeaturePerfMon, FeatureSPE, FeatureSPE_EEF]; + FeaturePerfMon, FeatureSPE, FeatureSPE_EEF, + FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, + FeatureSVE, FeatureSVE2]; list<SubtargetFeature> A720AE = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureTRBE, FeatureSVE2BitPerm, FeatureETE, - FeaturePerfMon, FeatureSPE, FeatureSPE_EEF]; + FeaturePerfMon, FeatureSPE, FeatureSPE_EEF, + FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, + FeatureSVE, FeatureSVE2]; + list<SubtargetFeature> A725 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, + FeatureETE, FeaturePerfMon, FeatureSPE, + FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE, + FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS, + FeatureSVE, FeatureSVE2]; list<SubtargetFeature> R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16, FeatureFP16FML, FeatureSSBS, FeaturePredRes, FeatureSB, FeatureRDM, FeatureDotProd, FeatureComplxNum, FeatureJS, - FeatureCacheDeepPersist]; + FeatureCacheDeepPersist, + FeatureLSE, 
FeatureFlagM]; list<SubtargetFeature> R82AE = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16, FeatureFP16FML, FeatureSSBS, FeaturePredRes, FeatureSB, FeatureRDM, FeatureDotProd, FeatureComplxNum, FeatureJS, - FeatureCacheDeepPersist]; - list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureCacheDeepPersist, + FeatureLSE, FeatureFlagM]; + list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureRCPC, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureDotProd, FeatureSSBS]; - list<SubtargetFeature> X1C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> X1C = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureRCPC_IMMO, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureDotProd, FeaturePAuth, FeatureSSBS, FeatureFlagM, - FeatureLSE2]; + FeatureLSE2, + FeatureRCPC]; list<SubtargetFeature> X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, FeatureMTE, FeatureETE, FeatureSVE2BitPerm, - FeatureFP16FML]; + FeatureFP16FML, + FeaturePAuth, FeatureSSBS, FeatureSB, FeatureSVE, FeatureSVE2, FeatureFlagM]; list<SubtargetFeature> X3 = [HasV9_0aOps, FeatureSVE, FeatureNEON, FeaturePerfMon, FeatureETE, FeatureTRBE, FeatureSPE, FeatureBF16, FeatureMatMulInt8, FeatureMTE, FeatureSVE2BitPerm, FeatureFullFP16, - FeatureFP16FML]; + FeatureFP16FML, + FeatureSB, FeaturePAuth, FeaturePredRes, FeatureFlagM, FeatureSSBS, + FeatureSVE2]; list<SubtargetFeature> X4 = [HasV9_2aOps, FeaturePerfMon, FeatureETE, FeatureTRBE, FeatureSPE, FeatureMTE, FeatureSVE2BitPerm, - FeatureFP16FML, FeatureSPE_EEF]; + FeatureFP16FML, FeatureSPE_EEF, + FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, + FeatureSVE, FeatureSVE2]; + list<SubtargetFeature> X925 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, + FeatureETE, FeaturePerfMon, FeatureSPE, + FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE, + FeatureFlagM, FeaturePredRes, FeatureSB, 
FeatureSSBS, + FeatureSVE, FeatureSVE2]; list<SubtargetFeature> A64FX = [HasV8_2aOps, FeatureFPARMv8, FeatureNEON, FeatureSHA2, FeaturePerfMon, FeatureFullFP16, - FeatureSVE, FeatureComplxNum]; - list<SubtargetFeature> Carmel = [HasV8_2aOps, FeatureNEON, FeatureCrypto, + FeatureSVE, FeatureComplxNum, + FeatureAES]; + list<SubtargetFeature> Carmel = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES, FeatureFullFP16]; - list<SubtargetFeature> AppleA7 = [HasV8_0aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> AppleA7 = [HasV8_0aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON,FeaturePerfMon, FeatureAppleA7SysReg]; - list<SubtargetFeature> AppleA10 = [HasV8_0aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> AppleA10 = [HasV8_0aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureCRC, FeatureRDM, FeaturePAN, FeatureLOR, FeatureVH]; - list<SubtargetFeature> AppleA11 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> AppleA11 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureFullFP16]; - list<SubtargetFeature> AppleA12 = [HasV8_3aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> AppleA12 = [HasV8_3aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureFullFP16]; - list<SubtargetFeature> AppleA13 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> AppleA13 = [HasV8_4aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureFullFP16, FeatureFP16FML, FeatureSHA3]; - list<SubtargetFeature> AppleA14 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8, - FeatureNEON, FeaturePerfMon, FeatureFRInt3264, - FeatureSpecRestrict, FeatureSSBS, FeatureSB, - FeaturePredRes, FeatureCacheDeepPersist, + list<SubtargetFeature> AppleA14 = [HasV8_4aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureFullFP16, FeatureFP16FML, FeatureSHA3, - FeatureAltFPCmp]; - 
list<SubtargetFeature> AppleA15 = [HasV8_6aOps, FeatureCrypto, FeatureFPARMv8, + // ArmV8.5-a extensions, excluding BTI: + FeatureAltFPCmp, FeatureFRInt3264, + FeatureSpecRestrict, FeatureSSBS, FeatureSB, + FeaturePredRes, FeatureCacheDeepPersist]; + list<SubtargetFeature> AppleA15 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, FeatureFullFP16, FeatureFP16FML]; - list<SubtargetFeature> AppleA16 = [HasV8_6aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> AppleA16 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, FeatureFullFP16, FeatureFP16FML, FeatureHCX]; - list<SubtargetFeature> AppleA17 = [HasV8_6aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> AppleA17 = [HasV8_6aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSHA3, FeatureFullFP16, FeatureFP16FML, FeatureHCX]; - list<SubtargetFeature> ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, + // Technically apple-m4 is ARMv9.2a, but a quirk of LLVM defines v9.0 as + // requiring SVE, which is optional according to the Arm ARM and not + // supported by the core. ARMv8.7a is the next closest choice. 
+ list<SubtargetFeature> AppleM4 = [HasV8_7aOps, FeatureSHA2, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureSHA3, + FeatureFullFP16, FeatureFP16FML, + FeatureAES, FeatureBF16, + FeatureSME, FeatureSME2, + FeatureSMEF64F64, FeatureSMEI16I64]; + list<SubtargetFeature> ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeaturePerfMon]; - list<SubtargetFeature> ExynosM4 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, + list<SubtargetFeature> ExynosM4 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureDotProd, FeatureFullFP16, FeaturePerfMon]; - list<SubtargetFeature> Falkor = [HasV8_0aOps, FeatureCRC, FeatureCrypto, + list<SubtargetFeature> Falkor = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureRDM]; - list<SubtargetFeature> NeoverseE1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, + list<SubtargetFeature> NeoverseE1 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureNEON, FeatureRCPC, FeatureSSBS, FeaturePerfMon]; - list<SubtargetFeature> NeoverseN1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, + list<SubtargetFeature> NeoverseN1 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureNEON, FeatureRCPC, FeatureSPE, FeatureSSBS, FeaturePerfMon]; list<SubtargetFeature> NeoverseN2 = [HasV9_0aOps, FeatureBF16, FeatureETE, FeatureFP16FML, FeatureMatMulInt8, FeatureMTE, FeatureSVE2, FeatureSVE2BitPerm, FeatureTRBE, - FeaturePerfMon]; + FeaturePerfMon, + FeatureDotProd, FeatureFullFP16, FeatureSB, FeatureSSBS, FeatureSVE]; list<SubtargetFeature> NeoverseN3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, FeatureSPE_EEF, - FeatureSVE2BitPerm]; + FeatureSVE2BitPerm, + FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, + FeatureSVE, FeatureSVE2]; list<SubtargetFeature> Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist, 
- FeatureCrypto, FeatureFPARMv8, FeatureFP16FML, + FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureFP16FML, FeatureFullFP16, FeatureMatMulInt8, FeatureNEON, FeaturePerfMon, FeatureRandGen, FeatureSPE, - FeatureSSBS, FeatureSVE]; + FeatureSSBS, FeatureSVE, + FeatureSHA3, FeatureSM4, FeatureDotProd]; list<SubtargetFeature> NeoverseV1 = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist, - FeatureCrypto, FeatureFPARMv8, FeatureFP16FML, + FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureFP16FML, FeatureFullFP16, FeatureMatMulInt8, FeatureNEON, FeaturePerfMon, FeatureRandGen, FeatureSPE, - FeatureSSBS, FeatureSVE]; + FeatureSSBS, FeatureSVE, + FeatureSHA3, FeatureSM4, FeatureDotProd]; list<SubtargetFeature> NeoverseV2 = [HasV9_0aOps, FeatureBF16, FeatureSPE, FeaturePerfMon, FeatureETE, FeatureMatMulInt8, FeatureNEON, FeatureSVE2BitPerm, FeatureFP16FML, - FeatureMTE, FeatureRandGen]; + FeatureMTE, FeatureRandGen, + FeatureSVE, FeatureSVE2, FeatureSSBS, FeatureFullFP16, FeatureDotProd]; list<SubtargetFeature> NeoverseV3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureLS64, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, - FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE]; + FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE, + FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, + FeatureSVE, FeatureSVE2]; list<SubtargetFeature> NeoverseV3AE = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureLS64, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, - FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE]; - list<SubtargetFeature> Saphira = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8, + FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE, + FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, + FeatureSVE, FeatureSVE2]; + list<SubtargetFeature> Saphira = [HasV8_4aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureSPE, FeaturePerfMon]; - list<SubtargetFeature> ThunderX = [HasV8_0aOps, FeatureCRC, 
FeatureCrypto, + list<SubtargetFeature> ThunderX = [HasV8_0aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeaturePerfMon, FeatureNEON]; - list<SubtargetFeature> ThunderX2T99 = [HasV8_1aOps, FeatureCRC, FeatureCrypto, + list<SubtargetFeature> ThunderX2T99 = [HasV8_1aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureLSE]; - list<SubtargetFeature> ThunderX3T110 = [HasV8_3aOps, FeatureCRC, FeatureCrypto, + list<SubtargetFeature> ThunderX3T110 = [HasV8_3aOps, FeatureCRC, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeatureLSE, FeaturePAuth, FeaturePerfMon]; - list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureSHA2, FeatureAES, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureFP16FML, FeatureDotProd, FeatureJS, FeatureComplxNum]; list<SubtargetFeature> Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, FeatureSSBS, FeatureRandGen, FeatureSB, - FeatureSHA2, FeatureSHA3, FeatureAES]; + FeatureSHA2, FeatureSHA3, FeatureAES, + FeatureFullFP16]; list<SubtargetFeature> Ampere1A = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, FeatureMTE, FeatureSSBS, FeatureRandGen, FeatureSB, FeatureSM4, FeatureSHA2, - FeatureSHA3, FeatureAES]; + FeatureSHA3, FeatureAES, + FeatureFullFP16]; list<SubtargetFeature> Ampere1B = [HasV8_7aOps, FeatureNEON, FeaturePerfMon, FeatureMTE, FeatureSSBS, FeatureRandGen, FeatureSB, FeatureSM4, FeatureSHA2, @@ -828,9 +918,10 @@ def ProcessorFeatures { FeatureWFxT, FeatureFullFP16]; list<SubtargetFeature> Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, - FeatureCrypto, FeatureRandGen, + FeatureRandGen, FeaturePAuth, FeatureSM4, FeatureSHA2, - FeatureSHA3, FeatureAES]; + FeatureSHA3, FeatureAES, + FeatureSPE]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not @@ -890,6 +981,8 @@ def : ProcessorModel<"cortex-a720", NeoverseN2Model, ProcessorFeatures.A720, [TuneA720]>; def : ProcessorModel<"cortex-a720ae", NeoverseN2Model, ProcessorFeatures.A720AE, [TuneA720AE]>; +def : ProcessorModel<"cortex-a725", NeoverseN2Model, ProcessorFeatures.A725, + [TuneA725]>; def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82, [TuneR82]>; def : ProcessorModel<"cortex-r82ae", CortexA55Model, ProcessorFeatures.R82AE, @@ -904,6 +997,8 @@ def : ProcessorModel<"cortex-x3", NeoverseN2Model, ProcessorFeatures.X3, [TuneX3]>; def : ProcessorModel<"cortex-x4", NeoverseN2Model, ProcessorFeatures.X4, [TuneX4]>; +def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925, + [TuneX925]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>; def : ProcessorModel<"neoverse-n1", NeoverseN1Model, @@ -952,50 +1047,60 @@ def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, def : ProcessorModel<"tsv110", TSV110Model, ProcessorFeatures.TSV110, [TuneTSV110]>; + +// Apple CPUs + // Support cyclone as an alias for apple-a7 so we can still LTO old bitcode. 
def : ProcessorModel<"cyclone", CycloneModel, ProcessorFeatures.AppleA7, [TuneAppleA7]>; - -// iPhone and iPad CPUs def : ProcessorModel<"apple-a7", CycloneModel, ProcessorFeatures.AppleA7, [TuneAppleA7]>; def : ProcessorModel<"apple-a8", CycloneModel, ProcessorFeatures.AppleA7, [TuneAppleA7]>; def : ProcessorModel<"apple-a9", CycloneModel, ProcessorFeatures.AppleA7, [TuneAppleA7]>; + def : ProcessorModel<"apple-a10", CycloneModel, ProcessorFeatures.AppleA10, [TuneAppleA10]>; + def : ProcessorModel<"apple-a11", CycloneModel, ProcessorFeatures.AppleA11, [TuneAppleA11]>; + def : ProcessorModel<"apple-a12", CycloneModel, ProcessorFeatures.AppleA12, [TuneAppleA12]>; +def : ProcessorModel<"apple-s4", CycloneModel, ProcessorFeatures.AppleA12, + [TuneAppleA12]>; +def : ProcessorModel<"apple-s5", CycloneModel, ProcessorFeatures.AppleA12, + [TuneAppleA12]>; + def : ProcessorModel<"apple-a13", CycloneModel, ProcessorFeatures.AppleA13, [TuneAppleA13]>; + def : ProcessorModel<"apple-a14", CycloneModel, ProcessorFeatures.AppleA14, [TuneAppleA14]>; -def : ProcessorModel<"apple-a15", CycloneModel, ProcessorFeatures.AppleA15, - [TuneAppleA15]>; -def : ProcessorModel<"apple-a16", CycloneModel, ProcessorFeatures.AppleA16, - [TuneAppleA16]>; -def : ProcessorModel<"apple-a17", CycloneModel, ProcessorFeatures.AppleA17, - [TuneAppleA17]>; -// Mac CPUs def : ProcessorModel<"apple-m1", CycloneModel, ProcessorFeatures.AppleA14, [TuneAppleA14]>; + +def : ProcessorModel<"apple-a15", CycloneModel, ProcessorFeatures.AppleA15, + [TuneAppleA15]>; def : ProcessorModel<"apple-m2", CycloneModel, ProcessorFeatures.AppleA15, [TuneAppleA15]>; + +def : ProcessorModel<"apple-a16", CycloneModel, ProcessorFeatures.AppleA16, + [TuneAppleA16]>; def : ProcessorModel<"apple-m3", CycloneModel, ProcessorFeatures.AppleA16, [TuneAppleA16]>; -// watch CPUs. 
-def : ProcessorModel<"apple-s4", CycloneModel, ProcessorFeatures.AppleA12, - [TuneAppleA12]>; -def : ProcessorModel<"apple-s5", CycloneModel, ProcessorFeatures.AppleA12, - [TuneAppleA12]>; +def : ProcessorModel<"apple-a17", CycloneModel, ProcessorFeatures.AppleA17, + [TuneAppleA17]>; + +def : ProcessorModel<"apple-m4", CycloneModel, ProcessorFeatures.AppleM4, + [TuneAppleM4]>; // Alias for the latest Apple processor model supported by LLVM. -def : ProcessorModel<"apple-latest", CycloneModel, ProcessorFeatures.AppleA16, - [TuneAppleA16]>; +def : ProcessorModel<"apple-latest", CycloneModel, ProcessorFeatures.AppleM4, + [TuneAppleM4]>; + // Fujitsu A64FX def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX, diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 2b70c4715bf9..054eca8ad752 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -31,6 +31,27 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, def AArch64CoalescerBarrier : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>; +def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisInt<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { + def AllocateZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {} +} +def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)), + (AllocateZABuffer $size)>; + +def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1, + [SDTCisInt<0>]>, 
[SDNPHasChain, SDNPMayStore]>; +let usesCustomInserter = 1 in { + def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {} +} + //===----------------------------------------------------------------------===// // Instruction naming conventions. //===----------------------------------------------------------------------===// @@ -50,15 +71,17 @@ def AArch64CoalescerBarrier def SDT_AArch64RDSVL : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>; def AArch64rdsvl : SDNode<"AArch64ISD::RDSVL", SDT_AArch64RDSVL>; -let Predicates = [HasSME] in { +let Predicates = [HasSMEandIsNonStreamingSafe] in { def RDSVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdsvl", /*streaming_sve=*/0b1>; def ADDSPL_XXI : sve_int_arith_vl<0b1, "addspl", /*streaming_sve=*/0b1>; def ADDSVL_XXI : sve_int_arith_vl<0b0, "addsvl", /*streaming_sve=*/0b1>; +def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>; +} + +let Predicates = [HasSME] in { defm ADDHA_MPPZ_S : sme_add_vector_to_tile_u32<0b0, "addha", int_aarch64_sme_addha>; defm ADDVA_MPPZ_S : sme_add_vector_to_tile_u32<0b1, "addva", int_aarch64_sme_addva>; - -def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>; } let Predicates = [HasSMEI16I64] in { @@ -117,18 +140,20 @@ defm LD1_MXIPXX : sme_mem_ld_ss<"ld1">; defm ST1_MXIPXX : sme_mem_st_ss<"st1">; //===----------------------------------------------------------------------===// -// Spill + fill +// Move instructions //===----------------------------------------------------------------------===// -defm LDR_ZA : sme_fill<"ldr">; -defm STR_ZA : sme_spill<"str">; +defm INSERT_MXIPZ : sme_vector_to_tile<"mova">; +defm EXTRACT_ZPMXI : sme_tile_to_vector<"mova">; +} // End let Predicates = [HasSME] +let Predicates = [HasSMEandIsNonStreamingSafe] in { //===----------------------------------------------------------------------===// -// Move instructions +// Spill + fill 
//===----------------------------------------------------------------------===// -defm INSERT_MXIPZ : sme_vector_to_tile<"mova">; -defm EXTRACT_ZPMXI : sme_tile_to_vector<"mova">; +defm LDR_ZA : sme_fill<"ldr">; +defm STR_ZA : sme_spill<"str">; //===----------------------------------------------------------------------===// // Zero instruction @@ -164,7 +189,7 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), (MRS 0xde85)>; -} // End let Predicates = [HasSME] +} // End let Predicates = [HasSMEandIsNonStreamingSafe] multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> { def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> { @@ -221,6 +246,15 @@ def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)), (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>; +// Pseudo to insert cfi_offset/cfi_restore instructions. Used to save or restore +// the streaming value of VG around streaming-mode changes in locally-streaming +// functions. 
+def VGSavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; +def : Pat<(AArch64VGSave), (VGSavePseudo)>; + +def VGRestorePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; +def : Pat<(AArch64VGRestore), (VGRestorePseudo)>; + //===----------------------------------------------------------------------===// // SME2 Instructions //===----------------------------------------------------------------------===// @@ -550,11 +584,6 @@ defm SMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"smops", 0b001, int_aarch64_sme_smops defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100, int_aarch64_sme_umopa_za32>; defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops_za32>; -defm ZERO_T : sme2_zero_zt<"zero", 0b0001>; - -defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, AArch64_restore_zt>; -defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, AArch64_save_zt>; - def MOVT_XTI : sme2_movt_zt_to_scalar<"movt", 0b0011111>; def MOVT_TIX : sme2_movt_scalar_to_zt<"movt", 0b0011111>; @@ -680,7 +709,15 @@ def STNT1D_2Z_STRIDED : sme2_st_vector_vg2_multi_scalar_scalar<0b11, 0b1, def STNT1D_4Z_STRIDED : sme2_st_vector_vg4_multi_scalar_scalar<0b11, 0b1, ZZZZ_d_strided, GPR64shifted64, "stnt1d">; defm STNT1D_2Z_STRIDED_IMM : sme2_st_vector_vg2_multi_scalar_immediate<0b11, 0b1, ZZ_d_strided, simm4s2, "stnt1d">; defm STNT1D_4Z_STRIDED_IMM : sme2_st_vector_vg4_multi_scalar_immediate<0b11, 0b1, ZZZZ_d_strided, simm4s4, "stnt1d">; -} +} // End let Predicates = [HasSME2] + + +let Predicates = [HasSME2andIsNonStreamingSafe] in { +defm ZERO_T : sme2_zero_zt<"zero", 0b0001>; + +defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, AArch64_restore_zt>; +defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, AArch64_save_zt>; +} // End let Predicates = [HasSME2andIsNonStreamingSafe] let Predicates = [HasSME2, HasSMEI16I64] in { defm ADD_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"add", 0b1011010, MatrixOp64, ZZ_d, ZPR4b64, nxv2i64, 
int_aarch64_sme_add_write_single_za_vg1x2>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index bd5de628d852..a3c41f2e052c 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3351,7 +3351,7 @@ let Predicates = [HasSVEorSME] in { (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; // Extract element from vector with immediate index that's within the bottom 128-bits. - let Predicates = [IsNeonAvailable], AddedComplexity = 1 in { + let Predicates = [HasNEON], AddedComplexity = 1 in { def : Pat<(i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)), (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>; def : Pat<(i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)), @@ -3360,9 +3360,9 @@ let Predicates = [HasSVEorSME] in { (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>; def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)), (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>; - } // End IsNeonAvailable + } // End HasNEON - let Predicates = [IsNeonAvailable] in { + let Predicates = [HasNEON] in { def : Pat<(sext_inreg (vector_extract nxv16i8:$vec, VectorIndexB:$index), i8), (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>; def : Pat<(sext_inreg (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index))), i8), @@ -3375,7 +3375,7 @@ let Predicates = [HasSVEorSME] in { def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))), (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>; - } // End IsNeonAvailable + } // End HasNEON // Extract first element from vector. 
let AddedComplexity = 2 in { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 93ea729e2550..1fad1d5ca6d7 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -153,9 +153,11 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { case CortexA710: case CortexA715: case CortexA720: + case CortexA725: case CortexX2: case CortexX3: case CortexX4: + case CortexX925: PrefFunctionAlignment = Align(16); VScaleForTuning = 1; PrefLoopAlignment = Align(32); @@ -180,6 +182,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { case AppleA15: case AppleA16: case AppleA17: + case AppleM4: CacheLineSize = 64; PrefetchDistance = 280; MinPrefetchStride = 2048; @@ -189,6 +192,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { case AppleA15: case AppleA16: case AppleA17: + case AppleM4: MaxInterleaveFactor = 4; break; default: diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 7ef7a89b5749..5faba09aa67b 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -185,6 +185,12 @@ public: (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible())); } + /// Returns true if the target has access to either the full range of SVE instructions, + /// or the streaming-compatible subset of SVE instructions. 
+ bool isSVEorStreamingSVEAvailable() const { + return hasSVE() || (hasSME() && isStreaming()); + } + unsigned getMinVectorRegisterBitWidth() const { // Don't assume any minimum vector size when PSTATE.SM may not be 0, because // we don't yet support streaming-compatible codegen support that we trust @@ -355,30 +361,27 @@ public: void mirFileLoaded(MachineFunction &MF) const override; - bool hasSVEorSME() const { return hasSVE() || hasSME(); } - bool hasSVE2orSME() const { return hasSVE2() || hasSME(); } - // Return the known range for the bit length of SVE data registers. A value // of 0 means nothing is known about that particular limit beyong what's // implied by the architecture. unsigned getMaxSVEVectorSizeInBits() const { - assert(hasSVEorSME() && + assert(isSVEorStreamingSVEAvailable() && "Tried to get SVE vector length without SVE support!"); return MaxSVEVectorSizeInBits; } unsigned getMinSVEVectorSizeInBits() const { - assert(hasSVEorSME() && + assert(isSVEorStreamingSVEAvailable() && "Tried to get SVE vector length without SVE support!"); return MinSVEVectorSizeInBits; } bool useSVEForFixedLengthVectors() const { - if (!isNeonAvailable()) - return hasSVEorSME(); + if (!isSVEorStreamingSVEAvailable()) + return false; // Prefer NEON unless larger SVE registers are available. 
- return hasSVEorSME() && getMinSVEVectorSizeInBits() >= 256; + return !isNeonAvailable() || getMinSVEVectorSizeInBits() >= 256; } bool useSVEForFixedLengthVectors(EVT VT) const { diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 7de9071476e7..37ce07d4a09d 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -187,6 +187,11 @@ static cl::opt<unsigned> SVEVectorBitsMinOpt( "with zero meaning no minimum size is assumed."), cl::init(0), cl::Hidden); +static cl::opt<bool> ForceStreaming( + "force-streaming", + cl::desc("Force the use of streaming code for all functions"), + cl::init(false), cl::Hidden); + static cl::opt<bool> ForceStreamingCompatible( "force-streaming-compatible", cl::desc("Force the use of streaming-compatible code for all functions"), @@ -412,11 +417,11 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { StringRef FS = FSAttr.isValid() ? 
FSAttr.getValueAsString() : TargetFS; bool HasMinSize = F.hasMinSize(); - bool IsStreaming = F.hasFnAttribute("aarch64_pstate_sm_enabled") || + bool IsStreaming = ForceStreaming || + F.hasFnAttribute("aarch64_pstate_sm_enabled") || F.hasFnAttribute("aarch64_pstate_sm_body"); - bool IsStreamingCompatible = - F.hasFnAttribute("aarch64_pstate_sm_compatible") || - ForceStreamingCompatible; + bool IsStreamingCompatible = ForceStreamingCompatible || + F.hasFnAttribute("aarch64_pstate_sm_compatible"); unsigned MinSVEVectorSize = 0; unsigned MaxSVEVectorSize = 0; @@ -549,8 +554,7 @@ public: } // end anonymous namespace -void AArch64TargetMachine::registerPassBuilderCallbacks( - PassBuilder &PB, bool PopulateClassToPassNames) { +void AArch64TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerLateLoopOptimizationsEPCallback( [=](LoopPassManager &LPM, OptimizationLevel Level) { diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index e396d9204716..1a470ca87127 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -43,8 +43,7 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - void registerPassBuilderCallbacks(PassBuilder &PB, - bool PopulateClassToPassNames) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; TargetTransformInfo getTargetTransformInfo(const Function &F) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9f5756fc7e40..9a0eb45b875d 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -985,6 +985,33 @@ static bool isAllActivePredicate(Value *Pred) { m_ConstantInt<AArch64SVEPredPattern::all>())); } +// Simplify unary operation where predicate has all inactive lanes by 
replacing +// instruction with zeroed object +static std::optional<Instruction *> +instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) { + if (match(II.getOperand(0), m_ZeroInt())) { + Constant *Node; + Type *RetTy = II.getType(); + if (RetTy->isStructTy()) { + auto StructT = cast<StructType>(RetTy); + auto VecT = StructT->getElementType(0); + SmallVector<llvm::Constant *, 4> ZerVec; + for (unsigned i = 0; i < StructT->getNumElements(); i++) { + ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0) + : ConstantInt::get(VecT, 0)); + } + Node = ConstantStruct::get(StructT, ZerVec); + } else if (RetTy->isFPOrFPVectorTy()) + Node = ConstantFP::get(RetTy, 0.0); + else + Node = ConstantInt::get(II.getType(), 0); + + IC.replaceInstUsesWith(II, Node); + return IC.eraseInstFromFunction(II); + } + return std::nullopt; +} + static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, IntrinsicInst &II) { // svsel(ptrue, x, y) => x @@ -1398,6 +1425,10 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { Value *PtrOp = II.getOperand(1); Type *VecTy = II.getType(); + // Replace by zero constant when all lanes are inactive + if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II)) + return II_NA; + if (isAllActivePredicate(Pred)) { LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp); Load->copyMetadata(II); @@ -1745,6 +1776,10 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { Type *Ty = II.getType(); Value *PassThru = ConstantAggregateZero::get(Ty); + // Replace by zero constant when all lanes are inactive + if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II)) + return II_NA; + // Contiguous gather => masked load. 
// (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) @@ -1971,6 +2006,41 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, switch (IID) { default: break; + + case Intrinsic::aarch64_sve_ld1_gather: + case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: + case Intrinsic::aarch64_sve_ld1_gather_sxtw: + case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ld1_gather_uxtw: + case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ld1q_gather_index: + case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: + case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: + case Intrinsic::aarch64_sve_ld1ro: + case Intrinsic::aarch64_sve_ld1rq: + case Intrinsic::aarch64_sve_ld1udq: + case Intrinsic::aarch64_sve_ld1uwq: + case Intrinsic::aarch64_sve_ld2_sret: + case Intrinsic::aarch64_sve_ld2q_sret: + case Intrinsic::aarch64_sve_ld3_sret: + case Intrinsic::aarch64_sve_ld3q_sret: + case Intrinsic::aarch64_sve_ld4_sret: + case Intrinsic::aarch64_sve_ld4q_sret: + case Intrinsic::aarch64_sve_ldff1: + case Intrinsic::aarch64_sve_ldff1_gather: + case Intrinsic::aarch64_sve_ldff1_gather_index: + case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: + case Intrinsic::aarch64_sve_ldff1_gather_sxtw: + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_uxtw: + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ldnf1: + case Intrinsic::aarch64_sve_ldnt1: + case Intrinsic::aarch64_sve_ldnt1_gather: + case Intrinsic::aarch64_sve_ldnt1_gather_index: + case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: + case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: + return instCombineSVENoActiveUnaryZero(IC, II); case Intrinsic::aarch64_neon_fmaxnm: case Intrinsic::aarch64_neon_fminnm: return instCombineMaxMinNM(IC, II); @@ -2162,19 +2232,20 @@ 
AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { case TargetTransformInfo::RGK_Scalar: return TypeSize::getFixed(64); case TargetTransformInfo::RGK_FixedWidthVector: - if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode) - return TypeSize::getFixed(0); - - if (ST->hasSVE()) + if (ST->useSVEForFixedLengthVectors() && + (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode)) return TypeSize::getFixed( std::max(ST->getMinSVEVectorSizeInBits(), 128u)); - - return TypeSize::getFixed(ST->hasNEON() ? 128 : 0); + else if (ST->isNeonAvailable()) + return TypeSize::getFixed(128); + else + return TypeSize::getFixed(0); case TargetTransformInfo::RGK_ScalableVector: - if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode) + if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && + EnableScalableAutovecInStreamingMode)) + return TypeSize::getScalable(128); + else return TypeSize::getScalable(0); - - return TypeSize::getScalable(ST->hasSVE() ? 128 : 0); } llvm_unreachable("Unsupported register kind"); } @@ -2690,7 +2761,8 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, return AdjustCost(Entry->Cost); if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && - CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() && + CCH == TTI::CastContextHint::Masked && + ST->isSVEorStreamingSVEAvailable() && TLI->getTypeAction(Src->getContext(), SrcTy) == TargetLowering::TypePromoteInteger && TLI->getTypeAction(Dst->getContext(), DstTy) == @@ -2711,8 +2783,8 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, // but we also want to include the TTI::CastContextHint::Masked case too. 
if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && - CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() && - TLI->isTypeLegal(DstTy)) + CCH == TTI::CastContextHint::Masked && + ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy)) CCH = TTI::CastContextHint::Normal; return AdjustCost( @@ -3187,11 +3259,16 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, if (!LT.first.isValid()) return InstructionCost::getInvalid(); + // Return an invalid cost for element types that we are unable to lower. + auto *VT = cast<VectorType>(Src); + if (VT->getElementType()->isIntegerTy(1)) + return InstructionCost::getInvalid(); + // The code-generator is currently not able to handle scalable vectors // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting // it. This change will be removed when code-generation for these types is // sufficiently reliable. - if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) + if (VT->getElementCount() == ElementCount::getScalable(1)) return InstructionCost::getInvalid(); return LT.first; @@ -3212,16 +3289,17 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost( if (!LT.first.isValid()) return InstructionCost::getInvalid(); + // Return an invalid cost for element types that we are unable to lower. if (!LT.second.isVector() || - !isElementTypeLegalForScalableVector(VT->getElementType())) + !isElementTypeLegalForScalableVector(VT->getElementType()) || + VT->getElementType()->isIntegerTy(1)) return InstructionCost::getInvalid(); // The code-generator is currently not able to handle scalable vectors // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting // it. This change will be removed when code-generation for these types is // sufficiently reliable. 
- if (cast<VectorType>(DataTy)->getElementCount() == - ElementCount::getScalable(1)) + if (VT->getElementCount() == ElementCount::getScalable(1)) return InstructionCost::getInvalid(); ElementCount LegalVF = LT.second.getVectorElementCount(); @@ -3259,8 +3337,12 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting // it. This change will be removed when code-generation for these types is // sufficiently reliable. + // We also only support full register predicate loads and stores. if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) - if (VTy->getElementCount() == ElementCount::getScalable(1)) + if (VTy->getElementCount() == ElementCount::getScalable(1) || + (VTy->getElementType()->isIntegerTy(1) && + !VTy->getElementCount().isKnownMultipleOf( + ElementCount::getScalable(16)))) return InstructionCost::getInvalid(); // TODO: consider latency as well for TCK_SizeAndLatency. @@ -4234,4 +4316,4 @@ bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost); return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index feec1a4289c3..1180225ce009 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -248,7 +248,7 @@ public: if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true; - if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) || + if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8) || Ty->isIntegerTy(16) || Ty->isIntegerTy(32) || Ty->isIntegerTy(64)) return true; @@ -276,7 +276,7 @@ public: } bool isLegalMaskedGatherScatter(Type *DataType) const { - if (!ST->hasSVE() || !ST->isNeonAvailable()) + if (!ST->isSVEAvailable()) return false; // For fixed vectors, scalarize if not using SVE for them. @@ -373,9 +373,11 @@ public: bool preferPredicateOverEpilogue(TailFoldingInfo *TFI); - bool supportsScalableVectors() const { return ST->hasSVE(); } + bool supportsScalableVectors() const { + return ST->isSVEorStreamingSVEAvailable(); + } - bool enableScalableVectorization() const { return ST->hasSVE(); } + bool enableScalableVectorization() const { return ST->isSVEAvailable(); } bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 57e4f6d298d8..5e17ed40df8a 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -8094,8 +8094,8 @@ ParseStatus AArch64AsmParser::tryParseImmRange(OperandVector &Operands) { if (getParser().parseExpression(ImmL)) return ParseStatus::NoMatch; - unsigned ImmFVal = dyn_cast<MCConstantExpr>(ImmF)->getValue(); - unsigned ImmLVal = dyn_cast<MCConstantExpr>(ImmL)->getValue(); + unsigned ImmFVal = cast<MCConstantExpr>(ImmF)->getValue(); + unsigned ImmLVal = 
cast<MCConstantExpr>(ImmL)->getValue(); Operands.push_back( AArch64Operand::CreateImmRange(ImmFVal, ImmLVal, S, E, getContext())); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 270474f80767..1fb50a089ea3 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -53,6 +53,8 @@ using namespace llvm; using namespace AArch64GISelUtils; +extern cl::opt<bool> EnableSVEGISel; + AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) : CallLowering(&TLI) {} @@ -404,14 +406,13 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, ExtendOp = TargetOpcode::G_ZEXT; LLT NewLLT(NewVT); - LLT OldLLT(MVT::getVT(CurArgInfo.Ty)); + LLT OldLLT = getLLTForType(*CurArgInfo.Ty, DL); CurArgInfo.Ty = EVT(NewVT).getTypeForEVT(Ctx); // Instead of an extend, we might have a vector type which needs // padding with more elements, e.g. <2 x half> -> <4 x half>. if (NewVT.isVector()) { if (OldLLT.isVector()) { if (NewLLT.getNumElements() > OldLLT.getNumElements()) { - CurVReg = MIRBuilder.buildPadVectorWithUndefElements(NewLLT, CurVReg) .getReg(0); @@ -525,10 +526,10 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder, bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const { auto &F = MF.getFunction(); - if (F.getReturnType()->isScalableTy() || - llvm::any_of(F.args(), [](const Argument &A) { - return A.getType()->isScalableTy(); - })) + if (!EnableSVEGISel && (F.getReturnType()->isScalableTy() || + llvm::any_of(F.args(), [](const Argument &A) { + return A.getType()->isScalableTy(); + }))) return true; const auto &ST = MF.getSubtarget<AArch64Subtarget>(); if (!ST.hasNEON() || !ST.hasFPARMv8()) { @@ -1022,7 +1023,7 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, if (!IsTailCall) { if (!PAI) - return IsIndirect ? 
getBLRCallOpcode(CallerF) : AArch64::BL; + return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL; assert(IsIndirect && "Direct call should not be authenticated"); assert((PAI->Key == AArch64PACKey::IA || PAI->Key == AArch64PACKey::IB) && diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 4a7c82b393c1..0357a7206c47 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -597,8 +597,14 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, /// Given a register bank, and size in bits, return the smallest register class /// that can represent that combination. static const TargetRegisterClass * -getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits, +getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits, bool GetAllRegSet = false) { + if (SizeInBits.isScalable()) { + assert(RB.getID() == AArch64::FPRRegBankID && + "Expected FPR regbank for scalable type size"); + return &AArch64::ZPRRegClass; + } + unsigned RegBankID = RB.getID(); if (RegBankID == AArch64::GPRRegBankID) { @@ -939,8 +945,9 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); - unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + + TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); + TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); // Special casing for cross-bank copies of s1s. We can technically represent // a 1-bit value with any size of register. The minimum size for a GPR is 32 @@ -951,7 +958,7 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, // register bank. 
Or make a new helper that carries along some constraint // information. if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1)) - SrcSize = DstSize = 32; + SrcSize = DstSize = TypeSize::getFixed(32); return {getMinClassForRegBank(SrcRegBank, SrcSize, true), getMinClassForRegBank(DstRegBank, DstSize, true)}; @@ -1016,8 +1023,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return false; } - unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); - unsigned DstSize = TRI.getRegSizeInBits(*DstRC); + const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC); + const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC); unsigned SubReg; // If the source bank doesn't support a subregister copy small enough, @@ -4660,7 +4667,6 @@ MachineInstr *AArch64InstructionSelector::emitConditionalComparison( Register LHS, Register RHS, CmpInst::Predicate CC, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, MachineIRBuilder &MIB) const { - // TODO: emit CMN as an optimization. auto &MRI = *MIB.getMRI(); LLT OpTy = MRI.getType(LHS); unsigned CCmpOpc; @@ -4668,10 +4674,12 @@ MachineInstr *AArch64InstructionSelector::emitConditionalComparison( if (CmpInst::isIntPredicate(CC)) { assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); C = getIConstantVRegValWithLookThrough(RHS, MRI); - if (C && C->Value.ult(32)) + if (!C || C->Value.sgt(31) || C->Value.slt(-31)) + CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr; + else if (C->Value.ule(31)) CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi; else - CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr; + CCmpOpc = OpTy.getSizeInBits() == 32 ? 
AArch64::CCMNWi : AArch64::CCMNXi; } else { assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); @@ -4696,6 +4704,8 @@ MachineInstr *AArch64InstructionSelector::emitConditionalComparison( MIB.buildInstr(CCmpOpc, {}, {LHS}); if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi) CCmp.addImm(C->Value.getZExtValue()); + else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi) + CCmp.addImm(C->Value.abs().getZExtValue()); else CCmp.addReg(RHS); CCmp.addImm(NZCV).addImm(Predicate); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 42cd43c3afa3..fef0b722efe4 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -661,7 +661,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // Conversions getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) - .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) + .legalFor({{s32, s32}, + {s64, s32}, + {s32, s64}, + {s64, s64}, + {v2s64, v2s64}, + {v4s32, v4s32}, + {v2s32, v2s32}}) .legalIf([=](const LegalityQuery &Query) { return HasFP16 && (Query.Types[1] == s16 || Query.Types[1] == v4s16 || @@ -669,26 +675,38 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) (Query.Types[0] == s32 || Query.Types[0] == s64 || Query.Types[0] == v4s16 || Query.Types[0] == v8s16); }) - .widenScalarToNextPow2(0) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(1) - .clampScalarOrElt(1, MinFPScalar, s64) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) + .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) + // The range of a fp16 value fits into an i17, so we can lower the width + // to i64. 
+ .narrowScalarIf( + [=](const LegalityQuery &Query) { + return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64; + }, + changeTo(0, s64)) .moreElementsToNextPow2(0) + .widenScalarOrEltToNextPow2OrMinSize(0) + .minScalar(0, s32) + .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32) .widenScalarIf( [=](const LegalityQuery &Query) { - return Query.Types[0].getScalarSizeInBits() > - Query.Types[1].getScalarSizeInBits(); + return Query.Types[0].getScalarSizeInBits() <= 64 && + Query.Types[0].getScalarSizeInBits() > + Query.Types[1].getScalarSizeInBits(); }, LegalizeMutations::changeElementSizeTo(1, 0)) .widenScalarIf( [=](const LegalityQuery &Query) { - return Query.Types[0].getScalarSizeInBits() < - Query.Types[1].getScalarSizeInBits(); + return Query.Types[1].getScalarSizeInBits() <= 64 && + Query.Types[0].getScalarSizeInBits() < + Query.Types[1].getScalarSizeInBits(); }, LegalizeMutations::changeElementSizeTo(0, 1)) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) - .clampMaxNumElements(0, s64, 2); + .clampMaxNumElements(0, s64, 2) + .libcallFor( + {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}}); getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp index 17dd8f2314a2..0ba3a543d114 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp @@ -165,6 +165,10 @@ bool AArch64O0PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, /*EnableOpt*/ false, F.hasOptSize(), F.hasMinSize()); + // Disable fixed-point iteration in the Combiner. 
This improves compile-time + // at the cost of possibly missing optimizations. See PR#94291 for details. + CInfo.MaxIterations = 1; + AArch64O0PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, /*CSEInfo*/ nullptr, RuleConfig, ST); return Impl.combineMachineInstrs(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index 0c7be9f42c57..f71fe323a6d3 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -524,8 +524,8 @@ void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); if (!IsOptNone) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<GISelCSEAnalysisWrapperPass>(); AU.addPreserved<GISelCSEAnalysisWrapperPass>(); } @@ -557,7 +557,8 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + IsOptNone ? 
nullptr + : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); GISelCSEAnalysisWrapper &Wrapper = getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 77b8cbe5793c..4a1977ba1a00 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -19,6 +19,7 @@ /// //===----------------------------------------------------------------------===// +#include "AArch64ExpandImm.h" #include "AArch64GlobalISelUtils.h" #include "AArch64PerfectShuffle.h" #include "AArch64Subtarget.h" @@ -563,7 +564,8 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, MRI); if (!ValAndVReg) return std::nullopt; - uint64_t C = ValAndVReg->Value.getZExtValue(); + uint64_t OriginalC = ValAndVReg->Value.getZExtValue(); + uint64_t C = OriginalC; if (isLegalArithImmed(C)) return std::nullopt; @@ -633,9 +635,20 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, // predicate if it is. 
if (Size == 32) C = static_cast<uint32_t>(C); - if (!isLegalArithImmed(C)) - return std::nullopt; - return {{C, P}}; + if (isLegalArithImmed(C)) + return {{C, P}}; + + auto IsMaterializableInSingleInstruction = [=](uint64_t Imm) { + SmallVector<AArch64_IMM::ImmInsnModel> Insn; + AArch64_IMM::expandMOVImm(Imm, 32, Insn); + return Insn.size() == 1; + }; + + if (!IsMaterializableInSingleInstruction(OriginalC) && + IsMaterializableInSingleInstruction(C)) + return {{C, P}}; + + return std::nullopt; } /// Determine whether or not it is possible to update the RHS and predicate of diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 31f77be20f34..e9b25924b35f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -823,8 +823,8 @@ void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { getSelectionDAGFallbackAnalysisUsage(AU); AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<GISelCSEAnalysisWrapperPass>(); AU.addPreserved<GISelCSEAnalysisWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -856,7 +856,8 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); - MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *MDT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, EnableOpt, 
F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 4aa6999d1d3c..5616d063f70b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -163,17 +163,18 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo( unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \ (void)PartialMapDstIdx; \ (void)PartialMapSrcIdx; \ - const ValueMapping *Map = getCopyMapping( \ - AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \ + const ValueMapping *Map = getCopyMapping(AArch64::RBNameDst##RegBankID, \ + AArch64::RBNameSrc##RegBankID, \ + TypeSize::getFixed(Size)); \ (void)Map; \ assert(Map[0].BreakDown == \ &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ - Map[0].NumBreakDowns == 1 && #RBNameDst #Size \ - " Dst is incorrectly initialized"); \ + Map[0].NumBreakDowns == 1 && \ + #RBNameDst #Size " Dst is incorrectly initialized"); \ assert(Map[1].BreakDown == \ &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ - Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \ - " Src is incorrectly initialized"); \ + Map[1].NumBreakDowns == 1 && \ + #RBNameSrc #Size " Src is incorrectly initialized"); \ \ } while (false) @@ -218,7 +219,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo( unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A, const RegisterBank &B, - TypeSize Size) const { + const TypeSize Size) const { // What do we do with different size? // copy are same size. 
// Will introduce other hooks for different size: @@ -258,6 +259,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, case AArch64::QQQRegClassID: case AArch64::QQQQRegClassID: case AArch64::ZPRRegClassID: + case AArch64::ZPR_3bRegClassID: return getRegBank(AArch64::FPRRegBankID); case AArch64::GPR32commonRegClassID: case AArch64::GPR32RegClassID: @@ -304,7 +306,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( case TargetOpcode::G_OR: { // 32 and 64-bit or can be mapped on either FPR or // GPR for the same cost. - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); if (Size != 32 && Size != 64) break; @@ -325,7 +327,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( return AltMappings; } case TargetOpcode::G_BITCAST: { - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); if (Size != 32 && Size != 64) break; @@ -365,7 +367,7 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( return AltMappings; } case TargetOpcode::G_LOAD: { - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + TypeSize Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); if (Size != 64) break; @@ -377,15 +379,17 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( InstructionMappings AltMappings; const InstructionMapping &GPRMapping = getInstructionMapping( /*ID*/ 1, /*Cost*/ 1, - getOperandsMapping({getValueMapping(PMI_FirstGPR, Size), - // Addresses are GPR 64-bit. - getValueMapping(PMI_FirstGPR, 64)}), + getOperandsMapping( + {getValueMapping(PMI_FirstGPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, TypeSize::getFixed(64))}), /*NumOperands*/ 2); const InstructionMapping &FPRMapping = getInstructionMapping( /*ID*/ 2, /*Cost*/ 1, - getOperandsMapping({getValueMapping(PMI_FirstFPR, Size), - // Addresses are GPR 64-bit. 
- getValueMapping(PMI_FirstGPR, 64)}), + getOperandsMapping( + {getValueMapping(PMI_FirstFPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, TypeSize::getFixed(64))}), /*NumOperands*/ 2); AltMappings.push_back(&GPRMapping); @@ -437,7 +441,7 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping( "This code is for instructions with 3 or less operands"); LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - unsigned Size = Ty.getSizeInBits(); + TypeSize Size = Ty.getSizeInBits(); bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc); PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR; @@ -496,6 +500,20 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI, } } +bool AArch64RegisterBankInfo::isPHIWithFPContraints( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, const unsigned Depth) const { + if (!MI.isPHI() || Depth > MaxFPRSearchDepth) + return false; + + return any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), + [&](const MachineInstr &UseMI) { + if (onlyUsesFP(UseMI, MRI, TRI, Depth + 1)) + return true; + return isPHIWithFPContraints(UseMI, MRI, TRI, Depth + 1); + }); +} + bool AArch64RegisterBankInfo::hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, @@ -700,9 +718,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // If both RB are null that means both registers are generic. // We shouldn't be here. assert(DstRB && SrcRB && "Both RegBank were nullptr"); - unsigned Size = getSizeInBits(DstReg, MRI, TRI); + TypeSize Size = getSizeInBits(DstReg, MRI, TRI); return getInstructionMapping( - DefaultMappingID, copyCost(*DstRB, *SrcRB, TypeSize::getFixed(Size)), + DefaultMappingID, copyCost(*DstRB, *SrcRB, Size), getCopyMapping(DstRB->getID(), SrcRB->getID(), Size), // We only care about the mapping of the destination. 
/*NumOperands*/ 1); @@ -713,7 +731,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_BITCAST: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); - unsigned Size = DstTy.getSizeInBits(); + TypeSize Size = DstTy.getSizeInBits(); bool DstIsGPR = !DstTy.isVector() && DstTy.getSizeInBits() <= 64; bool SrcIsGPR = !SrcTy.isVector() && SrcTy.getSizeInBits() <= 64; const RegisterBank &DstRB = @@ -721,7 +739,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const RegisterBank &SrcRB = SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; return getInstructionMapping( - DefaultMappingID, copyCost(DstRB, SrcRB, TypeSize::getFixed(Size)), + DefaultMappingID, copyCost(DstRB, SrcRB, Size), getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), // We only care about the mapping of the destination for COPY. /*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 2 : 1); @@ -851,13 +869,18 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // instead of blind map every scalar to GPR. if (any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), [&](const MachineInstr &UseMI) { - // If we have at least one direct use in a FP instruction, + // If we have at least one direct or indirect use + // in a FP instruction, // assume this was a floating point load in the IR. If it was // not, we would have had a bitcast before reaching that // instruction. // // Int->FP conversion operations are also captured in // onlyDefinesFP(). 
+ + if (isPHIWithFPContraints(UseMI, MRI, TRI)) + return true; + return onlyUsesFP(UseMI, MRI, TRI) || onlyDefinesFP(UseMI, MRI, TRI); })) @@ -1107,7 +1130,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLT Ty = MRI.getType(MI.getOperand(Idx).getReg()); if (!Ty.isValid()) continue; - auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); + auto Mapping = + getValueMapping(OpRegBankIdx[Idx], TypeSize::getFixed(OpSize[Idx])); if (!Mapping->isValid()) return getInvalidInstructionMapping(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h index b6364c6a6409..0d89f540650a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h @@ -70,7 +70,7 @@ protected: PartialMappingIdx LastAlias, ArrayRef<PartialMappingIdx> Order); - static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size); + static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, TypeSize Size); /// Get the pointer to the ValueMapping representing the RegisterBank /// at \p RBIdx with a size of \p Size. @@ -80,13 +80,13 @@ protected: /// /// \pre \p RBIdx != PartialMappingIdx::None static const RegisterBankInfo::ValueMapping * - getValueMapping(PartialMappingIdx RBIdx, unsigned Size); + getValueMapping(PartialMappingIdx RBIdx, TypeSize Size); /// Get the pointer to the ValueMapping of the operands of a copy /// instruction from the \p SrcBankID register bank to the \p DstBankID /// register bank with a size of \p Size. static const RegisterBankInfo::ValueMapping * - getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); + getCopyMapping(unsigned DstBankID, unsigned SrcBankID, TypeSize Size); /// Get the instruction mapping for G_FPEXT. /// @@ -120,6 +120,13 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo { /// Maximum recursion depth for hasFPConstraints. 
const unsigned MaxFPRSearchDepth = 2; + /// \returns true if \p MI is a PHI that its def is used by + /// any instruction that onlyUsesFP. + bool isPHIWithFPContraints(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth = 0) const; + /// \returns true if \p MI only uses and defines FPRs. bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, unsigned Depth = 0) const; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index f5bea3336cbf..7dba22c066dc 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -28,7 +28,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" @@ -183,7 +183,7 @@ public: std::move(Emitter)), MappingSymbolCounter(0), LastEMS(EMS_None) {} - void changeSection(MCSection *Section, const MCExpr *Subsection) override { + void changeSection(MCSection *Section, uint32_t Subsection = 0) override { // We have to keep track of the mapping symbol state of any sections we // use. Each one should start off as EMS_None, which is provided as the // default constructor by DenseMap::lookup. 
@@ -248,6 +248,7 @@ public: emitDataMappingSymbol(); MCObjectStreamer::emitFill(NumBytes, FillValue, Loc); } + private: enum ElfMappingSymbol { EMS_None, @@ -283,7 +284,6 @@ private: DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols; ElfMappingSymbol LastEMS; }; - } // end anonymous namespace AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() { @@ -299,6 +299,37 @@ void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) { cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS); } +void AArch64TargetELFStreamer::finish() { + AArch64TargetStreamer::finish(); + AArch64ELFStreamer &S = getStreamer(); + MCContext &Ctx = S.getContext(); + auto &Asm = S.getAssembler(); + MCSectionELF *MemtagSec = nullptr; + for (const MCSymbol &Symbol : Asm.symbols()) { + const auto &Sym = cast<MCSymbolELF>(Symbol); + if (Sym.isMemtag()) { + MemtagSec = Ctx.getELFSection(".memtag.globals.static", + ELF::SHT_AARCH64_MEMTAG_GLOBALS_STATIC, 0); + break; + } + } + if (!MemtagSec) + return; + + // switchSection registers the section symbol and invalidates symbols(). We + // need a separate symbols() loop. 
+ S.switchSection(MemtagSec); + const auto *Zero = MCConstantExpr::create(0, Ctx); + for (const MCSymbol &Symbol : Asm.symbols()) { + const auto &Sym = cast<MCSymbolELF>(Symbol); + if (!Sym.isMemtag()) + continue; + auto *SRE = MCSymbolRefExpr::create(&Sym, MCSymbolRefExpr::VK_None, Ctx); + (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(), + *Ctx.getSubtargetInfo()); + } +} + MCTargetStreamer * llvm::createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index e8a9dc445b96..ac441ae3b603 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -93,6 +93,7 @@ private: void emitInst(uint32_t Inst) override; void emitDirectiveVariantPCS(MCSymbol *Symbol) override; + void finish() override; public: AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {} diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index b21b1faf5c96..3087f6090379 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -111,6 +111,12 @@ class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum let usesCustomInserter = 1; } +class sme2_movez_to_tile_multi_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand vector_ty, SMEMatrixTypeEnum za_flag> + : SMEPseudo2Instr<name, 0>, + Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> { + let SMEMatrixType = za_flag; + let usesCustomInserter = 1; +} //===----------------------------------------------------------------------===// // SME pattern match helpers. 
//===----------------------------------------------------------------------===// @@ -4000,7 +4006,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo def _B : sme2_mova_tile_to_vec_vg2_multi_base<0b00, v, opc, ZZ_b_mul_r, !if(v, TileVectorOpV8, TileVectorOpH8), - uimm3s2range, mnemonic> { + uimm3s2range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> { bits<3> imm; let Inst{7-5} = imm; } @@ -4008,7 +4014,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo def _H : sme2_mova_tile_to_vec_vg2_multi_base<0b01, v, opc, ZZ_h_mul_r, !if(v, TileVectorOpV16, TileVectorOpH16), - uimm2s2range, mnemonic> { + uimm2s2range, mnemonic>, SMEPseudo2Instr<NAME # _H, 1> { bits<1> ZAn; bits<2> imm; let Inst{7} = ZAn; @@ -4018,7 +4024,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo def _S : sme2_mova_tile_to_vec_vg2_multi_base<0b10, v, opc, ZZ_s_mul_r, !if(v, TileVectorOpV32, TileVectorOpH32), - uimm1s2range, mnemonic> { + uimm1s2range, mnemonic>, SMEPseudo2Instr<NAME # _S, 1> { bits<2> ZAn; bits<1> imm; let Inst{7-6} = ZAn; @@ -4028,7 +4034,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo def _D : sme2_mova_tile_to_vec_vg2_multi_base<0b11, v, opc, ZZ_d_mul_r, !if(v, TileVectorOpV64, TileVectorOpH64), - uimm0s2range, mnemonic> { + uimm0s2range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> { bits<3> ZAn; let Inst{7-5} = ZAn; } @@ -4097,6 +4103,17 @@ multiclass sme2_mova_tile_to_vec_vg2_multi<string mnemonic>{ multiclass sme2p1_movaz_tile_to_vec_vg2<string mnemonic>{ defm _H : sme2_mova_tile_to_vec_vg2_multi_inst<0b0, 0b010, mnemonic>; defm _V : sme2_mova_tile_to_vec_vg2_multi_inst<0b1, 0b010, mnemonic>; + + + def NAME # _H_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_B, sme_elm_idx0_0, uimm3s2range, ZZ_b_mul_r, SMEMatrixTileB>; + def NAME # _H_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_H, sme_elm_idx0_1, uimm2s2range, ZZ_h_mul_r, 
SMEMatrixTileH>; + def NAME # _H_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_S, sme_elm_idx0_3, uimm1s2range, ZZ_s_mul_r, SMEMatrixTileS>; + def NAME # _H_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_D, sme_elm_idx0_7, uimm0s2range, ZZ_d_mul_r, SMEMatrixTileD>; + + def NAME # _V_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_B, sme_elm_idx0_0, uimm3s2range, ZZ_b_mul_r, SMEMatrixTileB>; + def NAME # _V_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_H, sme_elm_idx0_1, uimm2s2range, ZZ_h_mul_r, SMEMatrixTileH>; + def NAME # _V_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_S, sme_elm_idx0_3, uimm1s2range, ZZ_s_mul_r, SMEMatrixTileS>; + def NAME # _V_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_D, sme_elm_idx0_7, uimm0s2range, ZZ_d_mul_r, SMEMatrixTileD>; } class sme2_mova_tile_to_vec_vg4_multi_base<bits<2> sz, bit v, bits<6> op, @@ -4130,7 +4147,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo ZZZZ_b_mul_r, !if(v, TileVectorOpV8, TileVectorOpH8), - uimm2s4range, mnemonic> { + uimm2s4range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> { bits<2> imm; let Inst{6-5} = imm; } @@ -4139,7 +4156,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo ZZZZ_h_mul_r, !if(v, TileVectorOpV16, TileVectorOpH16), - uimm1s4range, mnemonic> { + uimm1s4range, mnemonic>, SMEPseudo2Instr<NAME # _H, 1> { bits<1> ZAn; bits<1> imm; let Inst{6} = ZAn; @@ -4150,7 +4167,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo ZZZZ_s_mul_r, !if(v, TileVectorOpV32, TileVectorOpH32), - uimm0s4range, mnemonic> { + uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _S, 1> { bits<2> ZAn; let Inst{6-5} = ZAn; } @@ -4159,7 +4176,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo ZZZZ_d_mul_r, !if(v, TileVectorOpV64, TileVectorOpH64), - uimm0s4range, mnemonic> { + uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> { bits<3> ZAn; let 
Inst{7-5} = ZAn; } @@ -4228,6 +4245,16 @@ multiclass sme2_mova_tile_to_vec_vg4_multi<string mnemonic>{ multiclass sme2p1_movaz_tile_to_vec_vg4<string mnemonic>{ defm _H : sme2_mova_tile_to_vec_vg4_multi_base<0b0, 0b110, mnemonic>; defm _V : sme2_mova_tile_to_vec_vg4_multi_base<0b1, 0b110, mnemonic>; + + def NAME # _H_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_B, sme_elm_idx0_0, uimm2s4range, ZZZZ_b_mul_r, SMEMatrixTileB>; + def NAME # _H_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_H, sme_elm_idx0_1, uimm1s4range, ZZZZ_h_mul_r, SMEMatrixTileH>; + def NAME # _H_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_S, sme_elm_idx0_3, uimm0s4range, ZZZZ_s_mul_r, SMEMatrixTileS>; + def NAME # _H_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _H_D, sme_elm_idx0_7, uimm0s4range, ZZZZ_d_mul_r, SMEMatrixTileD>; + + def NAME # _V_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_B, sme_elm_idx0_0, uimm2s4range, ZZZZ_b_mul_r, SMEMatrixTileB>; + def NAME # _V_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_H, sme_elm_idx0_1, uimm1s4range, ZZZZ_h_mul_r, SMEMatrixTileH>; + def NAME # _V_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_S, sme_elm_idx0_3, uimm0s4range, ZZZZ_s_mul_r, SMEMatrixTileS>; + def NAME # _V_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME # _V_D, sme_elm_idx0_7, uimm0s4range, ZZZZ_d_mul_r, SMEMatrixTileD>; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d0d7a9dc1724..63d83346528a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -351,6 +351,7 @@ def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts", "GFX90AInsts", "true", "Additional instructions for GFX90A+" + // [HasAtomicFMinFMaxF64GlobalInsts, HasAtomicFMinFMaxF64FlatInsts] // TODO >; def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts", @@ -711,6 +712,30 @@ def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", [FeatureFlatGlobalInsts] >; +def 
FeatureAtomicFMinFMaxF32GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f32", + "HasAtomicFMinFMaxF32GlobalInsts", + "true", + "Has global/buffer instructions for atomicrmw fmin/fmax for float" +>; + +def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f64", + "HasAtomicFMinFMaxF64GlobalInsts", + "true", + "Has global/buffer instructions for atomicrmw fmin/fmax for float" +>; + +def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32", + "HasAtomicFMinFMaxF32FlatInsts", + "true", + "Has flat memory instructions for atomicrmw fmin/fmax for float" +>; + +def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64", + "HasAtomicFMinFMaxF64FlatInsts", + "true", + "Has flat memory instructions for atomicrmw fmin/fmax for double" +>; + def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", "HasAtomicFaddNoRtnInsts", "true", @@ -743,6 +768,12 @@ def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf [FeatureFlatGlobalInsts] >; +def FeatureAtomicBufferPkAddBF16Inst : SubtargetFeature<"atomic-buffer-pk-add-bf16-inst", + "HasAtomicBufferPkAddBF16Inst", + "true", + "Has buffer_atomic_pk_add_bf16 instruction" +>; + def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts", "HasAtomicCSubNoRtnInsts", "true", @@ -1061,7 +1092,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, - FeatureGDS, FeatureGWS, FeatureDefaultComponentZero + FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts ] >; @@ -1072,7 +1104,9 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureCIInsts, FeatureMovrel, 
FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, - FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero + FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts ] >; @@ -1127,7 +1161,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength63 + FeatureMaxHardClauseLength63, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts ] >; @@ -1148,7 +1184,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength32 + FeatureMaxHardClauseLength32, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts ] >; @@ -1169,7 +1206,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, - FeatureMaxHardClauseLength32 + FeatureMaxHardClauseLength32, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts ] >; @@ -1332,7 +1370,10 @@ def FeatureISAVersion9_0_A : FeatureSet< FeaturePackedTID, FullRate64Ops, FeatureBackOffBarrier, - FeatureKernargPreload])>; + FeatureKernargPreload, + FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF64FlatInsts + ])>; def 
FeatureISAVersion9_0_C : FeatureSet< !listconcat(FeatureISAVersion9_0_Consumer_Common.Features, @@ -1372,7 +1413,10 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureArchitectedFlatScratch, FullRate64Ops, FeatureBackOffBarrier, - FeatureKernargPreload]>; + FeatureKernargPreload, + FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF64FlatInsts + ]>; def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, @@ -1561,6 +1605,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureAtomicFlatPkAdd16Insts, FeatureAtomicBufferGlobalPkAddF16Insts, FeatureAtomicGlobalPkAddBF16Inst, + FeatureAtomicBufferPkAddBF16Inst, FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, @@ -1572,7 +1617,9 @@ def FeatureISAVersion12 : FeatureSet< FeatureHasRestrictedSOffset, FeatureVGPRSingleUseHintInsts, FeatureScalarDwordx3Loads, - FeatureDPPSrc1SGPR]>; + FeatureDPPSrc1SGPR, + FeatureMaxHardClauseLength32, + Feature1_5xVGPRs]>; def FeatureISAVersion12_Generic: FeatureSet< !listconcat(FeatureISAVersion12.Features, @@ -1862,9 +1909,28 @@ def isGFX12Plus : def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; -def HasBufferFlatGlobalAtomicsF64 : + +def HasBufferFlatGlobalAtomicsF64 : // FIXME: Rename to show it's only for fadd Predicate<"Subtarget->hasBufferFlatGlobalAtomicsF64()">, - AssemblerPredicate<(any_of FeatureGFX90AInsts)>; + // FIXME: This is too coarse, and working around using pseudo's predicates on real instruction. 
+ AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX10Insts, FeatureSouthernIslands, FeatureSeaIslands)>; + +def HasAtomicFMinFMaxF32GlobalInsts : + Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">, + AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32GlobalInsts)>; + +def HasAtomicFMinFMaxF64GlobalInsts : + Predicate<"Subtarget->hasAtomicFMinFMaxF64GlobalInsts()">, + AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64GlobalInsts)>; + +def HasAtomicFMinFMaxF32FlatInsts : + Predicate<"Subtarget->hasAtomicFMinFMaxF32FlatInsts()">, + AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32FlatInsts)>; + +def HasAtomicFMinFMaxF64FlatInsts : + Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">, + AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>; + def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, AssemblerPredicate<(any_of FeatureGFX90AInsts)>; @@ -2118,7 +2184,10 @@ def HasAtomicBufferGlobalPkAddF16Insts AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>; def HasAtomicGlobalPkAddBF16Inst : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">, - AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>; + AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>; +def HasAtomicBufferPkAddBF16Inst + : Predicate<"Subtarget->hasAtomicBufferPkAddBF16Inst()">, + AssemblerPredicate<(all_of FeatureAtomicBufferPkAddBF16Inst)>; def HasFlatAtomicFaddF32Inst : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">, AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index de25f9241a50..f57fc168c1df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -115,6 +115,9 @@ AMDGPUFunctionArgInfo::getPreloadedValue( return std::tuple( PrivateSegmentWaveByteOffset ? 
&PrivateSegmentWaveByteOffset : nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); + case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_SIZE: + return {PrivateSegmentSize ? &PrivateSegmentSize : nullptr, + &AMDGPU::SGPR_32RegClass, LLT::scalar(32)}; case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR: return std::tuple(KernargSegmentPtr ? &KernargSegmentPtr : nullptr, &AMDGPU::SGPR_64RegClass, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 42b33c50d9f8..2e02bb4271ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -114,11 +114,12 @@ struct AMDGPUFunctionArgInfo { PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, IMPLICIT_BUFFER_PTR = 15, IMPLICIT_ARG_PTR = 16, + PRIVATE_SEGMENT_SIZE = 17, // VGPRS: - WORKITEM_ID_X = 17, - WORKITEM_ID_Y = 18, - WORKITEM_ID_Z = 19, + WORKITEM_ID_X = 18, + WORKITEM_ID_Y = 19, + WORKITEM_ID_Z = 20, FIRST_VGPR_VALUE = WORKITEM_ID_X }; // clang-format on diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index cad4a3430327..e49925f86bd9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -29,6 +29,7 @@ #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" +#include "Utils/SIDefinesUtils.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -135,15 +136,6 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { getTargetStreamer()->getPALMetadata()->readFromIR(M); } -uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) { - int64_t Val; - if (!Value->evaluateAsAbsolute(Val)) { - Ctx.reportError(SMLoc(), "could not resolve expression when required."); - return 0; - } - return static_cast<uint64_t>(Val); -} - void 
AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { // Init target streamer if it has not yet happened if (!IsTargetStreamerInitialized) @@ -248,14 +240,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), - getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context), - getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) - - IsaInfo::getNumExtraSGPRs( - &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context), - getMCExprValue(CurrentProgramInfo.FlatUsed, Context), - getTargetStreamer()->getTargetID()->isXnackOnOrAny()), - getMCExprValue(CurrentProgramInfo.VCCUsed, Context), - getMCExprValue(CurrentProgramInfo.FlatUsed, Context)); + CurrentProgramInfo.NumVGPRsForWavesPerEU, + MCBinaryExpr::createSub( + CurrentProgramInfo.NumSGPRsForWavesPerEU, + AMDGPUMCExpr::createExtraSGPRs( + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context), + Context), + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); Streamer.popSection(); } @@ -400,9 +392,40 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments( false); } -uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( +SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { + SmallString<128> Str; + raw_svector_ostream OSS(Str); + int64_t IVal; + if (Value->evaluateAsAbsolute(IVal)) { + OSS << static_cast<uint64_t>(IVal); + } else { + Value->print(OSS, MAI); + } + return Str; +} + +void AMDGPUAsmPrinter::emitCommonFunctionComments( + const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR, + const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize, + const AMDGPUMachineFunction *MFI) { + OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); + OutStreamer->emitRawComment(" NumSgprs: " + 
getMCExprStr(NumSGPR), false); + OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false); + if (NumAGPR && TotalNumVGPR) { + OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false); + OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR), + false); + } + OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize), + false); + OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), + false); +} + +const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); @@ -430,16 +453,28 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; } + if (UserSGPRInfo.hasPrivateSegmentSize()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + } if (MF.getSubtarget<GCNSubtarget>().isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } - if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) && - CodeObjectVersion >= AMDGPU::AMDHSA_COV5) - KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK; - - return KernelCodeProperties; + // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be + // un-evaluatable at this point so it cannot be conditionally checked here. + // Instead, we'll directly shift the possibly unknown MCExpr into its place + // and bitwise-or it into KernelCodeProperties. 
+ const MCExpr *KernelCodePropExpr = + MCConstantExpr::create(KernelCodeProperties, Ctx); + const MCExpr *OrValue = MCConstantExpr::create( + amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx); + OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack, + OrValue, Ctx); + KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx); + + return KernelCodePropExpr; } MCKernelDescriptor @@ -462,11 +497,15 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx); KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx); - KernelDescriptor.kernel_code_properties = - MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx); - - assert(STM.hasGFX90AInsts() || - getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0); + KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); + + int64_t PGRM_Rsrc3 = 1; + bool EvaluatableRsrc3 = + CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3); + (void)PGRM_Rsrc3; + (void)EvaluatableRsrc3; + assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 || + static_cast<uint64_t>(PGRM_Rsrc3) == 0); KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; KernelDescriptor.kernarg_preload = MCConstantExpr::create( @@ -554,13 +593,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment(" Kernel info:", false); emitCommonFunctionComments( - getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx), - STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx) - : std::optional<uint32_t>(), - getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx), - getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx), - getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), - getFunctionCodeSize(MF), MFI); + CurrentProgramInfo.NumArchVGPR, + STM.hasMAIInsts() ? 
CurrentProgramInfo.NumAccVGPR : nullptr, + CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, + CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); @@ -571,43 +607,38 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " bytes/workgroup (compile time only)", false); OutStreamer->emitRawComment( - " SGPRBlocks: " + - Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)), - false); + " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false); + OutStreamer->emitRawComment( - " VGPRBlocks: " + - Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)), - false); + " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false); OutStreamer->emitRawComment( " NumSGPRsForWavesPerEU: " + - Twine( - getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)), + getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); OutStreamer->emitRawComment( " NumVGPRsForWavesPerEU: " + - Twine( - getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)), + getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); - if (STM.hasGFX90AInsts()) + if (STM.hasGFX90AInsts()) { + const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd( + CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx); + AdjustedAccum = MCBinaryExpr::createMul( + AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx); OutStreamer->emitRawComment( - " AccumOffset: " + - Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) * - 4), - false); + " AccumOffset: " + getMCExprStr(AdjustedAccum), false); + } OutStreamer->emitRawComment( - " Occupancy: " + - Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)), - false); + " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false); OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); OutStreamer->emitRawComment( " 
COMPUTE_PGM_RSRC2:SCRATCH_EN: " + - Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)), + getMCExprStr(CurrentProgramInfo.ScratchEnable), false); OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(CurrentProgramInfo.UserSGPR), @@ -628,20 +659,25 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Twine(CurrentProgramInfo.TIdIGCompCount), false); + [[maybe_unused]] int64_t PGMRSrc3; assert(STM.hasGFX90AInsts() || - getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0); + (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute( + PGMRSrc3) && + static_cast<uint64_t>(PGMRSrc3) == 0)); if (STM.hasGFX90AInsts()) { OutStreamer->emitRawComment( " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + - Twine((AMDHSA_BITS_GET( - getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx), - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))), + getMCExprStr(MCKernelDescriptor::bits_get( + CurrentProgramInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)), false); OutStreamer->emitRawComment( " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + - Twine((AMDHSA_BITS_GET( - getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx), - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))), + getMCExprStr(MCKernelDescriptor::bits_get( + CurrentProgramInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)), false); } } @@ -765,7 +801,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // The calculations related to SGPR/VGPR blocks are // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. 
- const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs( + const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs( ProgInfo.VCCUsed, ProgInfo.FlatUsed, getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx); @@ -858,27 +894,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } } - ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax( + ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); - ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax( + ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); - ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR( + ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. unsigned MaxWaves = MFI->getMaxWavesPerEU(); - ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax( - {ProgInfo.NumSGPR, CreateExpr(1ul), - CreateExpr(STM.getMinNumSGPRs(MaxWaves))}, - Ctx); - ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax( - {ProgInfo.NumVGPR, CreateExpr(1ul), - CreateExpr(STM.getMinNumVGPRs(MaxWaves))}, - Ctx); + ProgInfo.NumSGPRsForWavesPerEU = + AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul), + CreateExpr(STM.getMinNumSGPRs(MaxWaves))}, + Ctx); + ProgInfo.NumVGPRsForWavesPerEU = + AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul), + CreateExpr(STM.getMinNumVGPRs(MaxWaves))}, + Ctx); if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { @@ -927,10 +963,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned Granule) { const MCExpr *OneConst = CreateExpr(1ul); const MCExpr *GranuleConst = CreateExpr(Granule); - const MCExpr *MaxNumGPR = - AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx); + const MCExpr *MaxNumGPR = 
AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx); const MCExpr *AlignToGPR = - AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx); + AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx); const MCExpr *DivGPR = MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx); const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx); @@ -972,7 +1007,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // The MCExpr equivalent of divideCeil. auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) { const MCExpr *Ceil = - AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx); + AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx); return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx); }; @@ -1045,7 +1080,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } - ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy( + ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); @@ -1207,41 +1242,49 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, auto &Ctx = MF.getContext(); MD->setEntryPoint(CC, MF.getFunction().getName()); - MD->setNumUsedVgprs( - CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)); + MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx); // Only set AGPRs for supported devices const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); if (STM.hasMAIInsts()) { - MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)); + MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); } - MD->setNumUsedSgprs( - CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)); + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx); if (MD->getPALMajorVersion() < 3) { - MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, 
STM)); + MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx); if (AMDGPU::isCompute(CC)) { - MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2()); + MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); } else { - if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0) - MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); + const MCExpr *HasScratchBlocks = + MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks, + MCConstantExpr::create(0, Ctx), Ctx); + auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN); + MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx); } } else { MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode); - MD->setHwStage(CC, ".scratch_en", - (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)); + MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean, + CurrentProgramInfo.ScratchEnable); EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM); } // ScratchSize is in bytes, 16 aligned. MD->setScratchSize( - CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16)); + CC, + AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize, + MCConstantExpr::create(16, Ctx), Ctx), + Ctx); + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 
divideCeil(CurrentProgramInfo.LDSBlocks, 2) : CurrentProgramInfo.LDSBlocks; if (MD->getPALMajorVersion() < 3) { - MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); + MD->setRsrc2( + CC, + MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx), + Ctx); MD->setSpiPsInputEna(MFI->getPSInputEnable()); MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } else { @@ -1288,20 +1331,19 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { if (MD->getPALMajorVersion() < 3) { // Set compute registers - MD->setRsrc1(CallingConv::AMDGPU_CS, - CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST)); + MD->setRsrc1( + CallingConv::AMDGPU_CS, + CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx); MD->setRsrc2(CallingConv::AMDGPU_CS, - CurrentProgramInfo.getComputePGMRSrc2()); + CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); } else { EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST); } // Set optional info MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize); - MD->setFunctionNumUsedVgprs( - FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)); - MD->setFunctionNumUsedSgprs( - FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)); + MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU); + MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU); } // This is supposed to be log2(Size) @@ -1362,6 +1404,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out, if (UserSGPRInfo.hasFlatScratchInit()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + if (UserSGPRInfo.hasPrivateSegmentSize()) + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + if (UserSGPRInfo.hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; @@ -1463,28 +1508,26 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks( // remarks to simulate newlines. 
If and when clang does accept newlines, this // formatting should be aggregated into one remark with newlines to avoid // printing multiple diagnostic location and diag opts. - MCContext &MCCtx = MF.getContext(); EmitResourceUsageRemark("FunctionName", "Function Name", MF.getFunction().getName()); EmitResourceUsageRemark("NumSGPR", "SGPRs", - getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx)); - EmitResourceUsageRemark( - "NumVGPR", "VGPRs", - getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx)); + getMCExprStr(CurrentProgramInfo.NumSGPR)); + EmitResourceUsageRemark("NumVGPR", "VGPRs", + getMCExprStr(CurrentProgramInfo.NumArchVGPR)); if (hasMAIInsts) { - EmitResourceUsageRemark( - "NumAGPR", "AGPRs", - getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx)); + EmitResourceUsageRemark("NumAGPR", "AGPRs", + getMCExprStr(CurrentProgramInfo.NumAccVGPR)); } - EmitResourceUsageRemark( - "ScratchSize", "ScratchSize [bytes/lane]", - getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx)); + EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]", + getMCExprStr(CurrentProgramInfo.ScratchSize)); + int64_t DynStack; + bool DynStackEvaluatable = + CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack); StringRef DynamicStackStr = - getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True" - : "False"; + DynStackEvaluatable && DynStack ? 
"True" : "False"; EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr); EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", - getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx)); + getMCExprStr(CurrentProgramInfo.Occupancy)); EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", CurrentProgramInfo.SGPRSpill); EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 87156f27fc6c..f70a60aef007 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -65,12 +65,16 @@ private: uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, const AMDGPUMachineFunction *MFI); + void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR, + const MCExpr *TotalNumVGPR, + const MCExpr *NumSGPR, + const MCExpr *ScratchSize, uint64_t CodeSize, + const AMDGPUMachineFunction *MFI); void emitResourceUsageRemarks(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, bool isModuleEntryFunction, bool hasMAIInsts); - uint16_t getAmdhsaKernelCodeProperties( - const MachineFunction &MF) const; + const MCExpr *getAmdhsaKernelCodeProperties(const MachineFunction &MF) const; AMDGPU::MCKernelDescriptor getAmdhsaKernelDescriptor(const MachineFunction &MF, @@ -78,7 +82,7 @@ private: void initTargetStreamer(Module &M); - static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx); + SmallString<128> getMCExprStr(const MCExpr *Value); public: explicit AMDGPUAsmPrinter(TargetMachine &TM, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 1d645002b1fe..d7ef6f3c5dc4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -249,63 +249,54 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { switch 
(I.getIntrinsicID()) { default: return; - case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: case Intrinsic::amdgcn_raw_buffer_atomic_add: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: Op = AtomicRMWInst::Add; break; - case Intrinsic::amdgcn_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: case Intrinsic::amdgcn_raw_buffer_atomic_sub: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: Op = AtomicRMWInst::Sub; break; - case Intrinsic::amdgcn_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_and: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_and: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: Op = AtomicRMWInst::And; break; - case Intrinsic::amdgcn_buffer_atomic_or: case Intrinsic::amdgcn_struct_buffer_atomic_or: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: case Intrinsic::amdgcn_raw_buffer_atomic_or: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: Op = AtomicRMWInst::Or; break; - case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_struct_buffer_atomic_xor: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: case Intrinsic::amdgcn_raw_buffer_atomic_xor: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: Op = AtomicRMWInst::Xor; break; - case Intrinsic::amdgcn_buffer_atomic_smin: case Intrinsic::amdgcn_struct_buffer_atomic_smin: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: case Intrinsic::amdgcn_raw_buffer_atomic_smin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: Op = AtomicRMWInst::Min; break; - case Intrinsic::amdgcn_buffer_atomic_umin: case Intrinsic::amdgcn_struct_buffer_atomic_umin: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: case Intrinsic::amdgcn_raw_buffer_atomic_umin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: Op = AtomicRMWInst::UMin; break; - case 
Intrinsic::amdgcn_buffer_atomic_smax: case Intrinsic::amdgcn_struct_buffer_atomic_smax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: case Intrinsic::amdgcn_raw_buffer_atomic_smax: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: Op = AtomicRMWInst::Max; break; - case Intrinsic::amdgcn_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_umax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_umax: @@ -413,7 +404,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, assert(ST->hasPermLaneX16()); V = B.CreateBitCast(V, IntNTy); Value *Permlanex16Call = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + V->getType(), Intrinsic::amdgcn_permlanex16, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), B.CreateBitCast(Permlanex16Call, AtomicTy)); @@ -425,7 +416,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Reduce across the upper and lower 32 lanes. V = B.CreateBitCast(V, IntNTy); Value *Permlane64Call = - B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V); + B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V); return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), B.CreateBitCast(Permlane64Call, AtomicTy)); } @@ -433,7 +424,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. 
Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty()); V = B.CreateBitCast(V, IntNTy); Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); @@ -481,7 +472,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, assert(ST->hasPermLaneX16()); V = B.CreateBitCast(V, IntNTy); Value *PermX = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + V->getType(), Intrinsic::amdgcn_permlanex16, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); Value *UpdateDPPCall = @@ -523,10 +514,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); } else { - Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); - Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + Function *ReadLane = Intrinsic::getDeclaration( + M, Intrinsic::amdgcn_readlane, B.getInt32Ty()); + Function *WriteLane = Intrinsic::getDeclaration( + M, Intrinsic::amdgcn_writelane, B.getInt32Ty()); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 231db188e65d..537d3a43aa9f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -271,11 +271,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>; // FIXME: Check MMO is atomic def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>; def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, atomic_load_fmax_glue>; - +def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>; +def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>; @@ -290,7 +287,6 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>; -def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp index a0c6bf7cc31c..fb258547e8fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp @@ -46,8 +46,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { 
AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); AU.addRequired<MachineUniformityAnalysisPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -192,8 +192,8 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) @@ -209,8 +209,10 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() { bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction( MachineFunction &MF) { - MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>(); - MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>(); + MachineDominatorTree &DT = + getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); + MachinePostDominatorTree &PDT = + getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); MachineUniformityInfo &MUI = getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 7ab9ba285133..efe47b2c3eed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -464,16 +464,6 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); const 
Function &F = MF.getFunction(); - auto GetMCExprValue = [&MF](const MCExpr *Value) { - int64_t Val; - if (!Value->evaluateAsAbsolute(Val)) { - MCContext &Ctx = MF.getContext(); - Ctx.reportError(SMLoc(), "could not resolve expression when required."); - Val = 0; - } - return static_cast<uint64_t>(Val); - }; - auto Kern = HSAMetadataDoc->getMapNode(); Align MaxKernArgAlign; @@ -481,11 +471,12 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, STM.getKernArgSegmentSize(F, MaxKernArgAlign)); Kern[".group_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.LDSSize); - Kern[".private_segment_fixed_size"] = - Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.ScratchSize)); + DelayedExprs->assignDocNode(Kern[".private_segment_fixed_size"], + msgpack::Type::UInt, ProgramInfo.ScratchSize); if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) { - Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode( - static_cast<bool>(GetMCExprValue(ProgramInfo.DynamicCallStack))); + DelayedExprs->assignDocNode(Kern[".uses_dynamic_stack"], + msgpack::Type::Boolean, + ProgramInfo.DynamicCallStack); } if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP()) @@ -497,15 +488,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value()); Kern[".wavefront_size"] = Kern.getDocument()->getNode(STM.getWavefrontSize()); - Kern[".sgpr_count"] = - Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumSGPR)); - Kern[".vgpr_count"] = - Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumVGPR)); + DelayedExprs->assignDocNode(Kern[".sgpr_count"], msgpack::Type::UInt, + ProgramInfo.NumSGPR); + DelayedExprs->assignDocNode(Kern[".vgpr_count"], msgpack::Type::UInt, + ProgramInfo.NumVGPR); // Only add AGPR count to metadata for supported devices if (STM.hasMAIInsts()) { - Kern[".agpr_count"] = - Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumAccVGPR)); + 
DelayedExprs->assignDocNode(Kern[".agpr_count"], msgpack::Type::UInt, + ProgramInfo.NumAccVGPR); } Kern[".max_flat_workgroup_size"] = @@ -527,6 +518,7 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, } bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) { + DelayedExprs->resolveDelayedExpressions(); return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true); } @@ -536,9 +528,11 @@ void MetadataStreamerMsgPackV4::begin(const Module &Mod, emitTargetID(TargetID); emitPrintf(Mod); getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); + DelayedExprs->clear(); } void MetadataStreamerMsgPackV4::end() { + DelayedExprs->resolveDelayedExpressions(); std::string HSAMetadataString; raw_string_ostream StrOS(HSAMetadataString); HSAMetadataDoc->toYAML(StrOS); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 0e3bc63919f0..fd76666dc360 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H +#include "Utils/AMDGPUDelayedMCExpr.h" #include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/Alignment.h" @@ -65,6 +66,9 @@ protected: class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4 : public MetadataStreamer { protected: + std::unique_ptr<DelayedMCExprs> DelayedExprs = + std::make_unique<DelayedMCExprs>(); + std::unique_ptr<msgpack::Document> HSAMetadataDoc = std::make_unique<msgpack::Document>(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 57769fe998d1..86f28a505769 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -1482,9 +1482,7 @@ bool 
MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { MFMAChains = 0; for (auto &MFMAPipeSU : MFMAPipeSUs) { - if (MFMAChainSeeds.size() && - std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) != - MFMAChainSeeds.end()) + if (is_contained(MFMAChainSeeds, MFMAPipeSU)) continue; if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(), [&TII](SDep &Succ) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b50c0cc12626..6d5ffc66d98b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -502,9 +502,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { // isa<MemSDNode> almost works but is slightly too permissive for some DS // intrinsics. - if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) || - Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) { + if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) { N = glueCopyToM0LDSInit(N); SelectCode(N); return; @@ -2006,12 +2004,31 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return true; } +// For unbuffered smem loads, it is illegal for the Immediate Offset to be +// negative if the resulting (Offset + (M0 or SOffset or zero) is negative. +// Handle the case where the Immediate Offset + SOffset is negative. +bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset, + bool Imm32Only, + bool IsBuffer, + int64_t ImmOffset) const { + if (!IsBuffer && !Imm32Only && ImmOffset < 0 && + AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) { + KnownBits SKnown = CurDAG->computeKnownBits(*SOffset); + if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0) + return false; + } + + return true; +} + // Match an immediate (if Offset is not null) or an SGPR (if SOffset is // not null) offset. If Imm32Only is true, match only 32-bit immediate // offsets available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, - bool Imm32Only, bool IsBuffer) const { + bool Imm32Only, bool IsBuffer, + bool HasSOffset, + int64_t ImmOffset) const { assert((!SOffset || !Offset) && "Cannot match both soffset and offset at the same time!"); @@ -2019,15 +2036,18 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, if (!C) { if (!SOffset) return false; + if (ByteOffsetNode.getValueType().isScalarInteger() && ByteOffsetNode.getValueType().getSizeInBits() == 32) { *SOffset = ByteOffsetNode; - return true; + return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer, + ImmOffset); } if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { *SOffset = ByteOffsetNode.getOperand(0); - return true; + return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer, + ImmOffset); } } return false; @@ -2038,8 +2058,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, // GFX9 and GFX10 have signed byte immediate offsets. The immediate // offset for S_BUFFER instructions is unsigned. int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue(); - std::optional<int64_t> EncodedOffset = - AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer); + std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset( + *Subtarget, ByteOffset, IsBuffer, HasSOffset); if (EncodedOffset && Offset && !Imm32Only) { *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; @@ -2098,13 +2118,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { // true, match only 32-bit immediate offsets available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, - bool Imm32Only, - bool IsBuffer) const { + bool Imm32Only, bool IsBuffer, + bool HasSOffset, + int64_t ImmOffset) const { if (SOffset && Offset) { assert(!Imm32Only && !IsBuffer); SDValue B; - return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) && - SelectSMRDBaseOffset(B, SBase, SOffset, nullptr); + + if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true)) + return false; + + int64_t ImmOff = 0; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) + ImmOff = C->getSExtValue(); + + return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true, + ImmOff); } // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -2123,11 +2152,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, } if (!N0 || !N1) return false; - if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) { + + if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset)) { SBase = N0; return true; } - if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) { + if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, + ImmOffset)) { SBase = N1; return true; } @@ -2551,14 +2583,6 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) { CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); } -void AMDGPUDAGToDAGISel::SelectPOPSExitingWaveID(SDNode *N) { - // TODO: Select this with a tablegen pattern. This is tricky because the - // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked - // mayLoad/mayStore and tablegen complains about the mismatch. 
- SDValue Reg = CurDAG->getRegister(AMDGPU::SRC_POPS_EXITING_WAVE_ID, MVT::i32); - CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, N->getVTList(), Reg); -} - static unsigned gwsIntrinToOpcode(unsigned IntrID) { switch (IntrID) { case Intrinsic::amdgcn_ds_gws_init: @@ -2715,9 +2739,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { case Intrinsic::amdgcn_ds_bvh_stack_rtn: SelectDSBvhStackIntrinsic(N); return; - case Intrinsic::amdgcn_pops_exiting_wave_id: - SelectPOPSExitingWaveID(N); - return; } SelectCode(N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 8e5662a3cd81..e7911bc1793d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -24,10 +24,6 @@ using namespace llvm; namespace { -static inline bool isNullConstantOrUndef(SDValue V) { - return V.isUndef() || isNullConstant(V); -} - static inline bool getConstantValue(SDValue N, uint32_t &Out) { // This is only used for packed vectors, where using 0 for undef should // always be good. 
@@ -136,6 +132,8 @@ private: bool isFlatScratchBaseLegal(SDValue Addr) const; bool isFlatScratchBaseLegalSV(SDValue Addr) const; bool isFlatScratchBaseLegalSVImm(SDValue Addr) const; + bool isSOffsetLegalWithImmOffset(SDValue *SOffset, bool Imm32Only, + bool IsBuffer, int64_t ImmOffset = 0) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, @@ -178,11 +176,13 @@ private: bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false) const; + bool IsBuffer = false, bool HasSOffset = false, + int64_t ImmOffset = 0) const; SDValue Expand32BitAddress(SDValue Addr) const; bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false, - bool IsBuffer = false) const; + bool IsBuffer = false, bool HasSOffset = false, + int64_t ImmOffset = 0) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; @@ -194,6 +194,8 @@ private: bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const; bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, SDValue &Offset) const; + bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase, + SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods, @@ -267,7 +269,6 @@ private: void SelectFP_EXTEND(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDSBvhStackIntrinsic(SDNode *N); - void SelectPOPSExitingWaveID(SDNode *N); void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectInterpP1F16(SDNode *N); void SelectINTRINSIC_W_CHAIN(SDNode *N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 375643b7f519..522b3a34161c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -42,8 +42,10 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { if (StoreSize <= 32) return EVT::getIntegerVT(Ctx, StoreSize); - assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); + if (StoreSize % 32 == 0) + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); + + return VT; } unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { @@ -5522,8 +5524,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(DS_ORDERED_COUNT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) - NODE_NAME_CASE(ATOMIC_LOAD_FMIN) - NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_UBYTE) NODE_NAME_CASE(BUFFER_LOAD_USHORT) @@ -5562,7 +5562,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) - NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16) NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 71c4334029b4..37572af3897f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -575,8 +575,6 @@ enum NodeType : unsigned { TBUFFER_LOAD_FORMAT_D16, DS_ORDERED_COUNT, ATOMIC_CMP_SWAP, - ATOMIC_LOAD_FMIN, - ATOMIC_LOAD_FMAX, BUFFER_LOAD, BUFFER_LOAD_UBYTE, BUFFER_LOAD_USHORT, @@ -615,7 +613,6 @@ enum NodeType : unsigned { BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, - BUFFER_ATOMIC_FADD_BF16, BUFFER_ATOMIC_FMIN, 
BUFFER_ATOMIC_FMAX, BUFFER_ATOMIC_COND_SUB_U32, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp index b78952ca3a62..43b3bf43fe56 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUGenSearchableTables.inc" #include "GCNSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" @@ -214,12 +215,14 @@ public: RegisterUseCount[Unit]++; // Do not attempt to optimise across exec mask changes. - if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { + if (MI.modifiesRegister(AMDGPU::EXEC, TRI) || + AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) { for (auto &UsedReg : RegisterUseCount) UsedReg.second = 2; } - if (!SIInstrInfo::isVALU(MI)) + if (!SIInstrInfo::isVALU(MI) || + AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode())) continue; if (AllProducerOperandsAreSingleUse) { SingleUseProducerPositions.push_back({VALUInstrCount, &MI}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 160a17584ca3..93bca4402ed2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1158,12 +1158,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType())); break; } - case Intrinsic::amdgcn_buffer_store_format: case Intrinsic::amdgcn_raw_buffer_store_format: case Intrinsic::amdgcn_struct_buffer_store_format: case Intrinsic::amdgcn_raw_tbuffer_store: case Intrinsic::amdgcn_struct_tbuffer_store: - case Intrinsic::amdgcn_tbuffer_store: case Intrinsic::amdgcn_image_store_1d: case Intrinsic::amdgcn_image_store_1darray: case Intrinsic::amdgcn_image_store_2d: @@ 
-1376,8 +1374,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( std::function<void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const { switch (II.getIntrinsicID()) { - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: @@ -1391,7 +1387,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( case Intrinsic::amdgcn_struct_ptr_buffer_load_format: case Intrinsic::amdgcn_struct_tbuffer_load: case Intrinsic::amdgcn_struct_ptr_tbuffer_load: - case Intrinsic::amdgcn_tbuffer_load: return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts); default: { if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ae3f2b87f353..a3cb3b3f47e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2079,21 +2079,6 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } -bool AMDGPUInstructionSelector::selectPOPSExitingWaveID( - MachineInstr &MI) const { - Register Dst = MI.getOperand(0).getReg(); - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock *MBB = MI.getParent(); - - // TODO: Select this with a tablegen pattern. This is tricky because the - // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked - // mayLoad/mayStore and tablegen complains about the mismatch. 
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst) - .addReg(AMDGPU::SRC_POPS_EXITING_WAVE_ID); - MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); -} - bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I) const { Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); @@ -2144,8 +2129,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectSBarrierSignalIsfirst(I, IntrinsicID); case Intrinsic::amdgcn_s_barrier_leave: return selectSBarrierLeave(I); - case Intrinsic::amdgcn_pops_exiting_wave_id: - return selectPOPSExitingWaveID(I); } return selectImpl(I, *CoverageInfo); } @@ -3620,8 +3603,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_UINC_WRAP: case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: case TargetOpcode::G_ATOMICRMW_FADD: - case AMDGPU::G_AMDGPU_ATOMIC_FMIN: - case AMDGPU::G_AMDGPU_ATOMIC_FMAX: + case TargetOpcode::G_ATOMICRMW_FMIN: + case TargetOpcode::G_ATOMICRMW_FMAX: return selectG_LOAD_STORE_ATOMICRMW(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); @@ -4216,10 +4199,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, return false; const GEPInfo &GEPI = AddrInfo[0]; - std::optional<int64_t> EncodedImm = - AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false); + std::optional<int64_t> EncodedImm; if (SOffset && Offset) { + EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, + /*HasSOffset=*/true); if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm && AddrInfo.size() > 1) { const GEPInfo &GEPI2 = AddrInfo[1]; @@ -4229,6 +4213,17 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, Base = GEPI2.SgprParts[0]; *SOffset = OffsetReg; *Offset = *EncodedImm; + if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI)) + return true; + + // For unbuffered smem loads, it is illegal for the Immediate Offset + // to be 
negative if the resulting (Offset + (M0 or SOffset or zero) + // is negative. Handle the case where the Immediate Offset + SOffset + // is negative. + auto SKnown = KB->getKnownBits(*SOffset); + if (*Offset + SKnown.getMinValue().getSExtValue() < 0) + return false; + return true; } } @@ -4236,6 +4231,8 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, return false; } + EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, + /*HasSOffset=*/false); if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) { Base = GEPI.SgprParts[0]; *Offset = *EncodedImm; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 48f3b1811801..f561d5d29efc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -125,7 +125,6 @@ private: bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; bool selectSBarrier(MachineInstr &MI) const; bool selectDSBvhStackIntrinsic(MachineInstr &MI) const; - bool selectPOPSExitingWaveID(MachineInstr &MI) const; bool selectImageIntrinsic(MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index fa7492ac6cbe..c6dbc58395e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -140,7 +140,9 @@ class ImmOperand<ValueType type, string name = NAME, bit optional = 0, let PrintMethod = printer; } -def s16imm : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">; +class S16ImmOperand : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">; + +def s16imm : S16ImmOperand; def u16imm : ImmOperand<i16, "U16Imm", 0, "printU16ImmOperand">; class ValuePredicatedOperand<CustomOperand op, string valuePredicate, @@ -616,6 +618,7 @@ multiclass local_addr_space_atomic_op { } } +defm int_amdgcn_flat_atomic_fadd : 
noret_op; defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op; defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_flat_atomic_fmin : noret_op; @@ -627,7 +630,6 @@ defm int_amdgcn_global_atomic_fmin : noret_op; defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op; -defm int_amdgcn_ds_fadd_v2bf16 : noret_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; @@ -637,9 +639,14 @@ defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op; defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op; defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op; -multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { +multiclass noret_binary_atomic_op<SDNode atomic_op> { let HasNoUse = true in - defm "_noret" : binary_atomic_op<atomic_op, IsInt>; + defm "_noret" : binary_atomic_op<atomic_op>; +} + +multiclass noret_binary_atomic_op_fp<SDNode atomic_op> { + let HasNoUse = true in + defm "_noret" : binary_atomic_op_fp<atomic_op>; } multiclass noret_ternary_atomic_op<SDNode atomic_op> { @@ -647,11 +654,21 @@ multiclass noret_ternary_atomic_op<SDNode atomic_op> { defm "_noret" : ternary_atomic_op<atomic_op>; } -multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> { - foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +defvar atomic_addrspace_names = [ "global", "flat", "constant", "local", "private", "region" ]; + +multiclass binary_atomic_op_all_as<SDNode atomic_op> { + foreach as = atomic_addrspace_names in { + let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { + defm "_"#as : binary_atomic_op<atomic_op>; + defm "_"#as : noret_binary_atomic_op<atomic_op>; + } + } +} +multiclass binary_atomic_op_fp_all_as<SDNode atomic_op> { + foreach as = 
atomic_addrspace_names in { let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { - defm "_"#as : binary_atomic_op<atomic_op, IsInt>; - defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>; + defm "_"#as : binary_atomic_op_fp<atomic_op>; + defm "_"#as : noret_binary_atomic_op_fp<atomic_op>; } } } @@ -666,11 +683,11 @@ defm atomic_load_sub : binary_atomic_op_all_as<atomic_load_sub>; defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>; defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>; defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>; -defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>; +defm atomic_load_fadd : binary_atomic_op_fp_all_as<atomic_load_fadd>; +defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>; +defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>; defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>; defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>; -let MemoryVT = v2f16 in -defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>; defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>; def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index ee7fb20c23aa..f1254b2e9e1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -283,7 +283,9 @@ static const LLT S1 = LLT::scalar(1); static const LLT S8 = LLT::scalar(8); static const LLT S16 = LLT::scalar(16); static const LLT S32 = LLT::scalar(32); +static const LLT F32 = LLT::float32(); static const LLT S64 = LLT::scalar(64); +static const LLT F64 = LLT::float64(); static const LLT S96 = LLT::scalar(96); static const LLT S128 = LLT::scalar(128); static const LLT S160 = LLT::scalar(160); @@ -301,6 +303,9 @@ 
static const LLT V10S16 = LLT::fixed_vector(10, 16); static const LLT V12S16 = LLT::fixed_vector(12, 16); static const LLT V16S16 = LLT::fixed_vector(16, 16); +static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16()); +static const LLT V2BF16 = V2F16; // FIXME + static const LLT V2S32 = LLT::fixed_vector(2, 32); static const LLT V3S32 = LLT::fixed_vector(3, 32); static const LLT V4S32 = LLT::fixed_vector(4, 32); @@ -1638,13 +1643,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasLdsAtomicAddF64()) Atomic.legalFor({{S64, LocalPtr}}); if (ST.hasAtomicDsPkAdd16Insts()) - Atomic.legalFor({{V2S16, LocalPtr}}); + Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}}); } if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); if (ST.hasFlatAtomicFaddF32Inst()) Atomic.legalFor({{S32, FlatPtr}}); + getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX}) + .legalFor({{F32, LocalPtr}, {F64, LocalPtr}}); + if (ST.hasGFX90AInsts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations @@ -1656,6 +1664,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }); } + if (ST.hasAtomicBufferGlobalPkAddF16Insts()) + Atomic.legalFor({{V2F16, GlobalPtr}}); + if (ST.hasAtomicGlobalPkAddBF16Inst()) + Atomic.legalFor({{V2BF16, GlobalPtr}}); + if (ST.hasAtomicFlatPkAdd16Insts()) + Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}}); + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) @@ -5388,12 +5403,10 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { switch (IID) { - case Intrinsic::amdgcn_ds_fadd: - return AMDGPU::G_ATOMICRMW_FADD; case Intrinsic::amdgcn_ds_fmin: - return AMDGPU::G_AMDGPU_ATOMIC_FMIN; + return AMDGPU::G_ATOMICRMW_FMIN; case Intrinsic::amdgcn_ds_fmax: - return 
AMDGPU::G_AMDGPU_ATOMIC_FMAX; + return AMDGPU::G_ATOMICRMW_FMAX; default: llvm_unreachable("not a DS FP intrinsic"); } @@ -5417,6 +5430,126 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, return true; } +// TODO: Fix pointer type handling +bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, + MachineInstr &MI, + Intrinsic::ID IID) const { + + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + + bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16; + + auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, + Register Src2, LLT VT) -> Register { + auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0); + switch (IID) { + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane64: + return LaneOp.getReg(0); + case Intrinsic::amdgcn_readlane: + return LaneOp.addUse(Src1).getReg(0); + case Intrinsic::amdgcn_writelane: + return LaneOp.addUse(Src1).addUse(Src2).getReg(0); + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + Register Src3 = MI.getOperand(5).getReg(); + Register Src4 = MI.getOperand(6).getImm(); + Register Src5 = MI.getOperand(7).getImm(); + return LaneOp.addUse(Src1) + .addUse(Src2) + .addUse(Src3) + .addImm(Src4) + .addImm(Src5) + .getReg(0); + } + default: + llvm_unreachable("unhandled lane op"); + } + }; + + Register DstReg = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + Register Src1, Src2; + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || + IsPermLane16) { + Src1 = MI.getOperand(3).getReg(); + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { + Src2 = MI.getOperand(4).getReg(); + } + } + + LLT Ty = MRI.getType(DstReg); + unsigned Size = Ty.getSizeInBits(); + + if (Size == 32) { + // Already legal + return true; + } + + if (Size < 32) { + Src0 = B.buildAnyExt(S32, Src0).getReg(0); + + if (IsPermLane16) + Src1 = 
B.buildAnyExt(LLT::scalar(32), Src1).getReg(0); + + if (IID == Intrinsic::amdgcn_writelane) + Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0); + + Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32); + B.buildTrunc(DstReg, LaneOpDst); + MI.eraseFromParent(); + return true; + } + + if (Size % 32 != 0) + return false; + + LLT PartialResTy = S32; + if (Ty.isVector()) { + LLT EltTy = Ty.getElementType(); + switch (EltTy.getSizeInBits()) { + case 16: + PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2)); + break; + case 32: + PartialResTy = EltTy; + break; + default: + // Handle all other cases via S32 pieces; + break; + } + } + + SmallVector<Register, 2> PartialRes; + unsigned NumParts = Size / 32; + MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0); + MachineInstrBuilder Src1Parts, Src2Parts; + + if (IsPermLane16) + Src1Parts = B.buildUnmerge(PartialResTy, Src1); + + if (IID == Intrinsic::amdgcn_writelane) + Src2Parts = B.buildUnmerge(PartialResTy, Src2); + + for (unsigned i = 0; i < NumParts; ++i) { + Src0 = Src0Parts.getReg(i); + + if (IsPermLane16) + Src1 = Src1Parts.getReg(i); + + if (IID == Intrinsic::amdgcn_writelane) + Src2 = Src2Parts.getReg(i); + + PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy)); + } + + B.buildMergeLikeInstr(DstReg, PartialRes); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -6008,9 +6141,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; - case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: - return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16; case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: case 
Intrinsic::amdgcn_struct_buffer_atomic_fmin: @@ -6630,9 +6760,9 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper, MI.removeOperand(1); // Remove intrinsic ID // FIXME: When intrinsic definition is fixed, this should have an MMO already. - // TODO: Should this use datalayout alignment? const unsigned MemSize = (Size + 7) / 8; - const Align MemAlign(std::min(MemSize, 4u)); + const Align MemAlign = B.getDataLayout().getABITypeAlign( + getTypeForLLT(Ty, MF.getFunction().getContext())); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo(), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | @@ -7318,14 +7448,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: - case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16: return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::amdgcn_rsq_clamp: return legalizeRsqClampIntrinsic(MI, MRI, B); - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); @@ -7365,6 +7490,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, Observer.changedInstr(MI); return true; } + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlane64: + return legalizeLaneOp(Helper, MI, IntrID); default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 4b1d821dadc2..ae01bb29c110 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -210,6 +210,9 @@ public: bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const; + bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, + Intrinsic::ID IID) const; + bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index c515138d95a2..456f3cb332cf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1129,15 +1129,11 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, nval = CreateCallEx(B,ExpExpr, nval, "__exp2"); if (needcopysign) { - Value *opr_n; - Type* rTy = opr0->getType(); Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits()); - Type *nTy = nTyS; - if (const auto *vTy = dyn_cast<FixedVectorType>(rTy)) - nTy = FixedVectorType::get(nTyS, vTy); + Type *nTy = FPOp->getType()->getWithNewType(nTyS); unsigned size = nTy->getScalarSizeInBits(); - opr_n = FPOp->getOperand(1); - if (opr_n->getType()->isIntegerTy()) + Value *opr_n = FPOp->getOperand(1); + if (opr_n->getType()->getScalarType()->isIntegerTy()) opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou"); else opr_n = B.CreateFPToSI(opr1, nTy, "__ytou"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index f878bd9465d3..a8f6ad09fe28 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -200,6 +200,7 @@ #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/Utils/Local.h" 
#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/AttributeMask.h" #include "llvm/IR/Constants.h" @@ -214,6 +215,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ReplaceConstant.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" @@ -578,18 +580,14 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) { /// buffer fat pointer constant. static std::pair<Constant *, Constant *> splitLoweredFatBufferConst(Constant *C) { - if (auto *AZ = dyn_cast<ConstantAggregateZero>(C)) - return std::make_pair(AZ->getStructElement(0), AZ->getStructElement(1)); - if (auto *SC = dyn_cast<ConstantStruct>(C)) - return std::make_pair(SC->getOperand(0), SC->getOperand(1)); - llvm_unreachable("Conversion should've created a {p8, i32} struct"); + assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer"); + return std::make_pair(C->getAggregateElement(0u), C->getAggregateElement(1u)); } namespace { /// Handle the remapping of ptr addrspace(7) constants. class FatPtrConstMaterializer final : public ValueMaterializer { BufferFatPtrToStructTypeMap *TypeMap; - BufferFatPtrToIntTypeMap *IntTypeMap; // An internal mapper that is used to recurse into the arguments of constants. // While the documentation for `ValueMapper` specifies not to use it // recursively, examination of the logic in mapValue() shows that it can @@ -599,16 +597,12 @@ class FatPtrConstMaterializer final : public ValueMaterializer { Constant *materializeBufferFatPtrConst(Constant *C); - const DataLayout &DL; - public: // UnderlyingMap is the value map this materializer will be filling. 
FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap, - ValueToValueMapTy &UnderlyingMap, - BufferFatPtrToIntTypeMap *IntTypeMap, - const DataLayout &DL) - : TypeMap(TypeMap), IntTypeMap(IntTypeMap), - InternalMapper(UnderlyingMap, RF_None, TypeMap, this), DL(DL) {} + ValueToValueMapTy &UnderlyingMap) + : TypeMap(TypeMap), + InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {} virtual ~FatPtrConstMaterializer() = default; Value *materialize(Value *V) override; @@ -631,10 +625,6 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { UndefValue::get(NewTy->getElementType(1))}); } - if (isa<GlobalValue>(C)) - report_fatal_error("Global values containing ptr addrspace(7) (buffer " - "fat pointer) values are not supported"); - if (auto *VC = dyn_cast<ConstantVector>(C)) { if (Constant *S = VC->getSplatValue()) { Constant *NewS = InternalMapper.mapConstant(*S); @@ -660,127 +650,14 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { return ConstantStruct::get(NewTy, {RsrcVec, OffVec}); } - // Constant expressions. This code mirrors how we fix up the equivalent - // instructions later. - auto *CE = dyn_cast<ConstantExpr>(C); - if (!CE) - return nullptr; - if (auto *GEPO = dyn_cast<GEPOperator>(C)) { - Constant *RemappedPtr = - InternalMapper.mapConstant(*cast<Constant>(GEPO->getPointerOperand())); - auto [Rsrc, Off] = splitLoweredFatBufferConst(RemappedPtr); - Type *OffTy = Off->getType(); - bool InBounds = GEPO->isInBounds(); - - MapVector<Value *, APInt> VariableOffs; - APInt NewConstOffVal = APInt::getZero(BufferOffsetWidth); - if (!GEPO->collectOffset(DL, BufferOffsetWidth, VariableOffs, - NewConstOffVal)) - report_fatal_error( - "Scalable vector or unsized struct in fat pointer GEP"); - Constant *OffAccum = nullptr; - // Accumulate offsets together before adding to the base in order to - // preserve as many of the inbounds properties as possible. 
- for (auto [Arg, Multiple] : VariableOffs) { - Constant *NewArg = InternalMapper.mapConstant(*cast<Constant>(Arg)); - NewArg = ConstantFoldIntegerCast(NewArg, OffTy, /*IsSigned=*/true, DL); - if (!Multiple.isOne()) { - if (Multiple.isPowerOf2()) { - NewArg = ConstantExpr::getShl( - NewArg, - CE->getIntegerValue( - OffTy, APInt(BufferOffsetWidth, Multiple.logBase2())), - /*hasNUW=*/InBounds, /*HasNSW=*/InBounds); - } else { - NewArg = - ConstantExpr::getMul(NewArg, CE->getIntegerValue(OffTy, Multiple), - /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); - } - } - if (OffAccum) { - OffAccum = ConstantExpr::getAdd(OffAccum, NewArg, /*hasNUW=*/InBounds, - /*hasNSW=*/InBounds); - } else { - OffAccum = NewArg; - } - } - Constant *NewConstOff = CE->getIntegerValue(OffTy, NewConstOffVal); - if (OffAccum) - OffAccum = ConstantExpr::getAdd(OffAccum, NewConstOff, - /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); - else - OffAccum = NewConstOff; - bool HasNonNegativeOff = false; - if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) { - HasNonNegativeOff = !CI->isNegative(); - } - Constant *NewOff = ConstantExpr::getAdd( - Off, OffAccum, /*hasNUW=*/InBounds && HasNonNegativeOff, - /*hasNSW=*/false); - return ConstantStruct::get(NewTy, {Rsrc, NewOff}); - } - - if (auto *PI = dyn_cast<PtrToIntOperator>(CE)) { - Constant *Parts = - InternalMapper.mapConstant(*cast<Constant>(PI->getPointerOperand())); - auto [Rsrc, Off] = splitLoweredFatBufferConst(Parts); - // Here, we take advantage of the fact that ptrtoint has a built-in - // zero-extension behavior. 
- unsigned FatPtrWidth = - DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); - Constant *RsrcInt = CE->getPtrToInt(Rsrc, SrcTy); - unsigned Width = SrcTy->getScalarSizeInBits(); - Constant *Shift = - CE->getIntegerValue(SrcTy, APInt(Width, BufferOffsetWidth)); - Constant *OffCast = - ConstantFoldIntegerCast(Off, SrcTy, /*IsSigned=*/false, DL); - Constant *RsrcHi = ConstantExpr::getShl( - RsrcInt, Shift, Width >= FatPtrWidth, Width > FatPtrWidth); - // This should be an or, but those got recently removed. - Constant *Result = ConstantExpr::getAdd(RsrcHi, OffCast, true, true); - return Result; - } + if (isa<GlobalValue>(C)) + report_fatal_error("Global values containing ptr addrspace(7) (buffer " + "fat pointer) values are not supported"); - if (CE->getOpcode() == Instruction::IntToPtr) { - auto *Arg = cast<Constant>(CE->getOperand(0)); - unsigned FatPtrWidth = - DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); - unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE); - auto *WantedTy = Arg->getType()->getWithNewBitWidth(FatPtrWidth); - Arg = ConstantFoldIntegerCast(Arg, WantedTy, /*IsSigned=*/false, DL); - - Constant *Shift = - CE->getIntegerValue(WantedTy, APInt(FatPtrWidth, BufferOffsetWidth)); - Type *RsrcIntType = WantedTy->getWithNewBitWidth(RsrcPtrWidth); - Type *RsrcTy = NewTy->getElementType(0); - Type *OffTy = WantedTy->getWithNewBitWidth(BufferOffsetWidth); - Constant *RsrcInt = CE->getTrunc( - ConstantFoldBinaryOpOperands(Instruction::LShr, Arg, Shift, DL), - RsrcIntType); - Constant *Rsrc = CE->getIntToPtr(RsrcInt, RsrcTy); - Constant *Off = ConstantFoldIntegerCast(Arg, OffTy, /*isSigned=*/false, DL); - - return ConstantStruct::get(NewTy, {Rsrc, Off}); - } + if (isa<ConstantExpr>(C)) + report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer " + "fat pointer) values should have been expanded earlier"); - if (auto *AC = dyn_cast<AddrSpaceCastOperator>(CE)) { - unsigned SrcAS = AC->getSrcAddressSpace(); - 
unsigned DstAS = AC->getDestAddressSpace(); - auto *Arg = cast<Constant>(AC->getPointerOperand()); - auto *NewArg = InternalMapper.mapConstant(*Arg); - if (!NewArg) - return nullptr; - if (SrcAS == AMDGPUAS::BUFFER_FAT_POINTER && - DstAS == AMDGPUAS::BUFFER_FAT_POINTER) - return NewArg; - if (SrcAS == AMDGPUAS::BUFFER_RESOURCE && - DstAS == AMDGPUAS::BUFFER_FAT_POINTER) { - auto *NullOff = CE->getNullValue(NewTy->getElementType(1)); - return ConstantStruct::get(NewTy, {NewArg, NullOff}); - } - report_fatal_error( - "Unsupported address space cast for a buffer fat pointer"); - } return nullptr; } @@ -788,26 +665,6 @@ Value *FatPtrConstMaterializer::materialize(Value *V) { Constant *C = dyn_cast<Constant>(V); if (!C) return nullptr; - if (auto *GEPO = dyn_cast<GEPOperator>(C)) { - // As a special case, adjust GEP constants that have a ptr addrspace(7) in - // their source types here, since the earlier local changes didn't handle - // htis. - Type *SrcTy = GEPO->getSourceElementType(); - Type *NewSrcTy = IntTypeMap->remapType(SrcTy); - if (SrcTy != NewSrcTy) { - SmallVector<Constant *> Ops; - Ops.reserve(GEPO->getNumOperands()); - for (const Use &U : GEPO->operands()) - Ops.push_back(cast<Constant>(U.get())); - auto *NewGEP = ConstantExpr::getGetElementPtr( - NewSrcTy, Ops[0], ArrayRef<Constant *>(Ops).slice(1), - GEPO->getNoWrapFlags(), GEPO->getInRange()); - LLVM_DEBUG(dbgs() << "p7-getting GEP: " << *GEPO << " becomes " << *NewGEP - << "\n"); - Value *FurtherMap = materialize(NewGEP); - return FurtherMap ? FurtherMap : NewGEP; - } - } // Structs and other types that happen to contain fat pointers get remapped // by the mapValue() logic. 
if (!isBufferFatPtrConst(C)) @@ -1387,57 +1244,25 @@ PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) { } PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { + using namespace llvm::PatternMatch; Value *Ptr = GEP.getPointerOperand(); if (!isSplitFatPtr(Ptr->getType())) return {nullptr, nullptr}; IRB.SetInsertPoint(&GEP); auto [Rsrc, Off] = getPtrParts(Ptr); - Type *OffTy = Off->getType(); const DataLayout &DL = GEP.getModule()->getDataLayout(); bool InBounds = GEP.isInBounds(); - // In order to call collectOffset() and thus not have to reimplement it, - // we need the GEP's pointer operand to have ptr addrspace(7) type - GEP.setOperand(GEP.getPointerOperandIndex(), - PoisonValue::get(IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER))); - MapVector<Value *, APInt> VariableOffs; - APInt ConstOffVal = APInt::getZero(BufferOffsetWidth); - if (!GEP.collectOffset(DL, BufferOffsetWidth, VariableOffs, ConstOffVal)) - report_fatal_error("Scalable vector or unsized struct in fat pointer GEP"); - GEP.setOperand(GEP.getPointerOperandIndex(), Ptr); - Value *OffAccum = nullptr; - // Accumulate offsets together before adding to the base in order to preserve - // as many of the inbounds properties as possible. 
- for (auto [Arg, Multiple] : VariableOffs) { - if (auto *OffVecTy = dyn_cast<VectorType>(OffTy)) - if (!Arg->getType()->isVectorTy()) - Arg = IRB.CreateVectorSplat(OffVecTy->getElementCount(), Arg); - Arg = IRB.CreateIntCast(Arg, OffTy, /*isSigned=*/true); - if (!Multiple.isOne()) { - if (Multiple.isPowerOf2()) - Arg = IRB.CreateShl(Arg, Multiple.logBase2(), "", /*hasNUW=*/InBounds, - /*HasNSW=*/InBounds); - else - Arg = IRB.CreateMul(Arg, ConstantExpr::getIntegerValue(OffTy, Multiple), - "", /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); - } - if (OffAccum) - OffAccum = IRB.CreateAdd(OffAccum, Arg, "", /*hasNUW=*/InBounds, - /*hasNSW=*/InBounds); - else - OffAccum = Arg; - } - if (!ConstOffVal.isZero()) { - Constant *ConstOff = ConstantExpr::getIntegerValue(OffTy, ConstOffVal); - if (OffAccum) - OffAccum = IRB.CreateAdd(OffAccum, ConstOff, "", /*hasNUW=*/InBounds, - /*hasNSW=*/InBounds); - else - OffAccum = ConstOff; - } - - if (!OffAccum) { // Constant-zero offset + // In order to call emitGEPOffset() and thus not have to reimplement it, + // we need the GEP result to have ptr addrspace(7) type. 
+ Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER); + if (auto *VT = dyn_cast<VectorType>(Off->getType())) + FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount()); + GEP.mutateType(FatPtrTy); + Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP); + GEP.mutateType(Ptr->getType()); + if (match(OffAccum, m_Zero())) { // Constant-zero offset SplitUsers.insert(&GEP); return {Rsrc, Off}; } @@ -1447,7 +1272,7 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { HasNonNegativeOff = !CI->isNegative(); } Value *NewOff; - if (PatternMatch::match(Off, PatternMatch::is_zero())) { + if (match(Off, m_Zero())) { NewOff = OffAccum; } else { NewOff = IRB.CreateAdd(Off, OffAccum, "", @@ -1473,20 +1298,22 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { const DataLayout &DL = PI.getModule()->getDataLayout(); unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); - Value *RsrcInt; - if (Width <= BufferOffsetWidth) - RsrcInt = ConstantExpr::getIntegerValue(ResTy, APInt::getZero(Width)); - else - RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc"); - copyMetadata(RsrcInt, &PI); - - Value *Shl = IRB.CreateShl( - RsrcInt, - ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), "", - Width >= FatPtrWidth, Width > FatPtrWidth); - Value *OffCast = - IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, PI.getName() + ".off"); - Value *Res = IRB.CreateOr(Shl, OffCast); + Value *Res; + if (Width <= BufferOffsetWidth) { + Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, + PI.getName() + ".off"); + } else { + Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc"); + Value *Shl = IRB.CreateShl( + RsrcInt, + ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), + "", Width >= FatPtrWidth, Width > FatPtrWidth); + Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, + PI.getName() + ".off"); + Res = IRB.CreateOr(Shl, OffCast); + } + + 
copyMetadata(Res, &PI); Res->takeName(&PI); SplitUsers.insert(&PI); PI.replaceAllUsesWith(Res); @@ -1818,14 +1645,9 @@ public: static bool containsBufferFatPointers(const Function &F, BufferFatPtrToStructTypeMap *TypeMap) { bool HasFatPointers = false; - for (const BasicBlock &BB : F) { - for (const Instruction &I : BB) { + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType())); - for (const Use &U : I.operands()) - if (auto *C = dyn_cast<Constant>(U.get())) - HasFatPointers |= isBufferFatPtrConst(C); - } - } return HasFatPointers; } @@ -1924,6 +1746,36 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { "buffer resource pointers (address space 8) instead."); } + { + // Collect all constant exprs and aggregates referenced by any function. + SmallVector<Constant *, 8> Worklist; + for (Function &F : M.functions()) + for (Instruction &I : instructions(F)) + for (Value *Op : I.operands()) + if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op)) + Worklist.push_back(cast<Constant>(Op)); + + // Recursively look for any referenced buffer pointer constants. + SmallPtrSet<Constant *, 8> Visited; + SetVector<Constant *> BufferFatPtrConsts; + while (!Worklist.empty()) { + Constant *C = Worklist.pop_back_val(); + if (!Visited.insert(C).second) + continue; + if (isBufferFatPtrOrVector(C->getType())) + BufferFatPtrConsts.insert(C); + for (Value *Op : C->operands()) + if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op)) + Worklist.push_back(cast<Constant>(Op)); + } + + // Expand all constant expressions using fat buffer pointers to + // instructions. 
+ Changed |= convertUsersOfConstantsToInstructions( + BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr, + /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true); + } + StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext()); for (Function &F : M.functions()) { bool InterfaceChange = hasFatPointerInterface(F, &StructTM); @@ -1939,7 +1791,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { SmallVector<Function *> Intrinsics; // Keep one big map so as to memoize constants across functions. ValueToValueMapTy CloneMap; - FatPtrConstMaterializer Materializer(&StructTM, CloneMap, &IntTM, DL); + FatPtrConstMaterializer Materializer(&StructTM, CloneMap); ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer); for (auto [F, InterfaceChange] : NeedsRemap) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp index 6ec4178053b2..11f0cba47afd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp @@ -17,6 +17,157 @@ using namespace llvm; +void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI, + std::optional<unsigned int> OpIdx, int64_t Imm) const { + + switch (MI.getOpcode()) { + case AMDGPU::S_DELAY_ALU: + assert(OpIdx == 0); + printSDelayAluImm(Imm, OS); + break; + default: + MIRFormatter::printImm(OS, MI, OpIdx, Imm); + break; + } +} + +/// Implement target specific parsing of immediate mnemonics. The mnemonic is +/// a string with a leading dot. 
+bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned OpCode, + const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const +{ + + switch (OpCode) { + case AMDGPU::S_DELAY_ALU: + return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback); + default: + break; + } + return true; // Don't know what this is +} + +void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm, + llvm::raw_ostream &OS) const { + // Construct an immediate string to represent the information encoded in the + // s_delay_alu immediate. + // .id0_<dep>[_skip_<count>_id1_<dep>] + constexpr int64_t None = 0; + constexpr int64_t Same = 0; + + uint64_t Id0 = (Imm & 0xF); + uint64_t Skip = ((Imm >> 4) & 0x7); + uint64_t Id1 = ((Imm >> 7) & 0xF); + auto Outdep = [&](uint64_t Id) { + if (Id == None) + OS << "NONE"; + else if (Id < 5) + OS << "VALU_DEP_" << Id; + else if (Id < 8) + OS << "TRANS32_DEP_" << Id - 4; + else + OS << "SALU_CYCLE_" << Id - 8; + }; + + OS << ".id0_"; + Outdep(Id0); + + // If the second inst is "same" and "none", no need to print the rest of the + // string. + if (Skip == Same && Id1 == None) + return; + + // Encode the second delay specification.
+ OS << "_skip_"; + if (Skip == 0) + OS << "SAME"; + else if (Skip == 1) + OS << "NEXT"; + else + OS << "SKIP_" << Skip - 1; + + OS << "_id1_"; + Outdep(Id1); +} + +bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic( + const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src, + llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const +{ + assert(OpIdx == 0); + + Imm = 0; + bool Expected = Src.consume_front(".id0_"); + if (!Expected) + return ErrorCallback(Src.begin(), "Expected .id0_"); + + auto ExpectInt = [&](StringRef &Src, int64_t Offset) -> int64_t { + int64_t Dep; + if (!Src.consumeInteger(10, Dep)) + return Dep + Offset; + + return -1; + }; + + auto DecodeDelay = [&](StringRef &Src) -> int64_t { + if (Src.consume_front("NONE")) + return 0; + if (Src.consume_front("VALU_DEP_")) + return ExpectInt(Src, 0); + if (Src.consume_front("TRANS32_DEP_")) + return ExpectInt(Src, 4); + if (Src.consume_front("SALU_CYCLE_")) + return ExpectInt(Src, 8); + + return -1; + }; + + int64_t Delay0 = DecodeDelay(Src); + int64_t Skip = 0; + int64_t Delay1 = 0; + if (Delay0 == -1) + return ErrorCallback(Src.begin(), "Could not decode delay0"); + + + // Set the Imm so far, so that early return has the correct value.
+ Imm = Delay0; + + // If that was the end of the string, the second instruction is "same" and + // "none" + if (Src.begin() == Src.end()) + return false; + + Expected = Src.consume_front("_skip_"); + if (!Expected) + return ErrorCallback(Src.begin(), "Expected _skip_"); + + + if (Src.consume_front("SAME")) { + Skip = 0; + } else if (Src.consume_front("NEXT")) { + Skip = 1; + } else if (Src.consume_front("SKIP_")) { + if (Src.consumeInteger(10, Skip)) { + return ErrorCallback(Src.begin(), "Expected integer Skip value"); + } + Skip += 1; + } else { + return ErrorCallback(Src.begin(), "Unexpected Skip Value"); + } + + Expected = Src.consume_front("_id1_"); + if (!Expected) + return ErrorCallback(Src.begin(), "Expected _id1_"); + + Delay1 = DecodeDelay(Src); + if (Delay1 == -1) + return ErrorCallback(Src.begin(), "Could not decode delay1"); + + Imm = Imm | (Skip << 4) | (Delay1 << 7); + return false; +} + bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue( StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index 98b5031071cf..c5c947375252 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -28,12 +28,35 @@ public: AMDGPUMIRFormatter() = default; virtual ~AMDGPUMIRFormatter() = default; + /// Implement target specific printing for machine operand immediate value, so + /// that we can have more meaningful mnemonic than a 64-bit integer. Passing + /// None to OpIdx means the index is unknown. + virtual void printImm(raw_ostream &OS, const MachineInstr &MI, + std::optional<unsigned> OpIdx, + int64_t Imm) const override; + + /// Implement target specific parsing of immediate mnemonics. The mnemonic is + /// a string with a leading dot.
+ virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const override; + /// Implement target specific parsing of target custom pseudo source value. bool parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const override; + +private: + /// Print the string to represent s_delay_alu immediate value + void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const; + + /// Parse the immediate pseudo literal for s_delay_alu + bool parseSDelayAluImmMnemonic( + const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src, + llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const; + }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index f36374b08b34..cfe9f33efc91 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -100,7 +100,7 @@ public: bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const; // Combine unsigned buffer load and signed extension instructions to generate - // signed buffer laod instructions. + // signed buffer load instructions. 
bool matchCombineSignExtendInReg( MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const; void applyCombineSignExtendInReg( @@ -465,8 +465,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); if (!IsOptNone) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -494,7 +494,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + IsOptNone ? nullptr + : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 3f01a328afaf..4d0cb467ba37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -238,8 +238,8 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); if (!IsOptNone) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } AU.addRequired<GISelCSEAnalysisWrapperPass>(); @@ -272,7 +272,8 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>(); MachineDominatorTree *MDT = - IsOptNone ? 
nullptr : &getAnalysis<MachineDominatorTree>(); + IsOptNone ? nullptr + : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 35abd6eddde8..74f0540239c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -421,8 +421,8 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); if (!IsOptNone) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -449,7 +449,8 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { const auto *LI = ST.getLegalizerInfo(); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + IsOptNone ? 
nullptr + : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp index 2ea03ddb1fcc..d1985f46b1c4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp @@ -33,7 +33,7 @@ StringRef AMDGPURegBankSelect::getPassName() const { void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<MachineCycleInfoWrapperPass>(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); // TODO: Preserve DomTree RegBankSelect::getAnalysisUsage(AU); } @@ -41,7 +41,7 @@ void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE, "AMDGPU Register Bank Select", false, false) INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE, "AMDGPU Register Bank Select", false, false) @@ -63,7 +63,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); MachineCycleInfo &CycleInfo = getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo(); - MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>(); + MachineDominatorTree &DomTree = + getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MachineUniformityInfo Uniformity = computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 7ebd674757fb..9e7694f41d6b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3079,7 +3079,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { applyDefaultMapping(OpdMapper); @@ -4376,7 +4375,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { // vdata_out @@ -4907,8 +4905,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_global_load_tr_b128: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: - case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_ds_fadd_v2bf16: { + case Intrinsic::amdgcn_ds_ordered_swap: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, @@ -5221,11 +5218,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_UMAX: case AMDGPU::G_ATOMICRMW_UMIN: case AMDGPU::G_ATOMICRMW_FADD: + case AMDGPU::G_ATOMICRMW_FMIN: + case AMDGPU::G_ATOMICRMW_FMAX: case AMDGPU::G_ATOMICRMW_UINC_WRAP: case AMDGPU::G_ATOMICRMW_UDEC_WRAP: - case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: - case AMDGPU::G_AMDGPU_ATOMIC_FMIN: - case AMDGPU::G_AMDGPU_ATOMIC_FMAX: { + case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 410dc83d45c5..ed5bae3e4ff6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -252,21 +252,8 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>; -def : SourceOfDivergence<int_amdgcn_ds_fadd>; def : SourceOfDivergence<int_amdgcn_ds_fmin>; def : SourceOfDivergence<int_amdgcn_ds_fmax>; -def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>; @@ -280,7 +267,6 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>; -def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>; def : 
SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; @@ -298,7 +284,6 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>; -def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>; @@ -316,7 +301,6 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>; -def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; @@ -334,12 +318,10 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>; -def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>; -def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_live_mask>; def : SourceOfDivergence<int_amdgcn_ds_swizzle>; 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 2449fa581842..3e5d83b8e3fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -15,10 +15,9 @@ /// SplitModule: load-balance the module's functions across a set of N /// partitions to allow parallel codegen. However, it does it very /// differently than the target-agnostic variant: -/// - Kernels are used as the module's "roots". -/// They're known entry points on AMDGPU, and everything else is often -/// internal only. -/// - Each kernel has a set of dependencies, and when a kernel and its +/// - The module has "split roots", which are kernels in the vast +/// majority of cases. +/// - Each root has a set of dependencies, and when a root and its /// dependencies is considered "big", we try to put it in a partition where /// most dependencies are already imported, to avoid duplicating large /// amounts of code. @@ -67,20 +66,22 @@ using namespace llvm; namespace { -static cl::opt<float> LargeKernelFactor( - "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f), +static cl::opt<float> LargeFnFactor( + "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f), cl::Hidden, cl::desc( - "consider a kernel as large and needing special treatment when it " + "consider a function as large and needing special treatment when the " + "cost of importing it into a partition " "exceeds the average cost of a partition by this factor; e;g.
2.0 " - "means if the kernel and its dependencies is 2 times bigger than " - "an average partition; 0 disables large kernels handling entirely")); + "means if the function and its dependencies is 2 times bigger than " + "an average partition; 0 disables large functions handling entirely")); -static cl::opt<float> LargeKernelOverlapForMerge( - "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f), +static cl::opt<float> LargeFnOverlapForMerge( + "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f), cl::Hidden, - cl::desc("defines how much overlap between two large kernel's dependencies " - "is needed to put them in the same partition")); + cl::desc( + "defines how much overlap between two large function's dependencies " + "is needed to put them in the same partition")); static cl::opt<bool> NoExternalizeGlobals( "amdgpu-module-splitting-no-externalize-globals", cl::Hidden, @@ -98,6 +99,7 @@ static cl::opt<bool> using CostType = InstructionCost::CostType; using PartitionID = unsigned; +using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>; static bool isEntryPoint(const Function *F) { return AMDGPU::isEntryFunctionCC(F->getCallingConv()); @@ -214,13 +216,12 @@ static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) { /// Calculate the cost of each function in \p M /// \param SML Log Helper -/// \param TM TargetMachine instance used to retrieve TargetTransformInfo. +/// \param GetTTI Abstract getter for TargetTransformInfo. /// \param M Module to analyze. /// \param CostMap[out] Resulting Function -> Cost map. /// \return The module's total cost.
static CostType -calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM, - Module &M, +calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M, DenseMap<const Function *, CostType> &CostMap) { CostType ModuleCost = 0; CostType KernelCost = 0; @@ -230,8 +231,7 @@ calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM, continue; CostType FnCost = 0; - TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn); - + const auto &TTI = GetTTI(Fn); for (const auto &BB : Fn) { for (const auto &I : BB) { auto Cost = @@ -277,9 +277,9 @@ static bool canBeIndirectlyCalled(const Function &F) { /*IgnoreCastedDirectCall=*/true); } -/// When a kernel or any of its callees performs an indirect call, this function +/// When a function or any of its callees performs an indirect call, this /// takes over \ref addAllDependencies and adds all potentially callable -/// functions to \p Fns so they can be counted as dependencies of the kernel. +/// functions to \p Fns so they can be counted as dependencies of the function. /// /// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the /// presence of an indirect call, the function's resource usage is the same as @@ -301,13 +301,14 @@ static void addAllIndirectCallDependencies(const Module &M, /// \param CG Call graph for \p Fn's module. /// \param Fn Current function to look at. /// \param Fns[out] Resulting list of functions. +/// \param OnlyDirect Whether to only consider direct callees. /// \param HadIndirectCall[out] Set to true if an indirect call was seen at some /// point, either in \p Fn or in one of the function it calls. When that /// happens, we fall back to adding all callable functions inside \p Fn's module /// to \p Fns. 
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, const Function &Fn, - DenseSet<const Function *> &Fns, + DenseSet<const Function *> &Fns, bool OnlyDirect, bool &HadIndirectCall) { assert(!Fn.isDeclaration()); @@ -325,6 +326,9 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, auto *CGNode = CGEntry.second; auto *Callee = CGNode->getFunction(); if (!Callee) { + if (OnlyDirect) + continue; + // Functions have an edge towards CallsExternalNode if they're external // declarations, or if they do an indirect call. As we only process // definitions here, we know this means the function has an indirect @@ -353,13 +357,19 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, } } -/// Contains information about a kernel and its dependencies. -struct KernelWithDependencies { - KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG, - const DenseMap<const Function *, CostType> &FnCosts, - const Function *Fn) +/// Contains information about a function and its dependencies. +/// This is a splitting root. The splitting algorithm works by +/// assigning these to partitions. +struct FunctionWithDependencies { + FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG, + const DenseMap<const Function *, CostType> &FnCosts, + const Function *Fn) : Fn(Fn) { - addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall); + // When Fn is not a kernel, we don't need to collect indirect callees. + // Resource usage analysis is only performed on kernels, and we collect + // indirect callees for resource usage analysis. 
+ addAllDependencies(SML, CG, *Fn, Dependencies, + /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall); TotalCost = FnCosts.at(Fn); for (const auto *Dep : Dependencies) { TotalCost += FnCosts.at(Dep); @@ -380,8 +390,8 @@ struct KernelWithDependencies { CostType TotalCost = 0; - /// \returns true if this kernel and its dependencies can be considered large - /// according to \p Threshold. + /// \returns true if this function and its dependencies can be considered + /// large according to \p Threshold. bool isLarge(CostType Threshold) const { return TotalCost > Threshold && !Dependencies.empty(); } @@ -420,39 +430,39 @@ static float calculateOverlap(const DenseSet<const Function *> &A, /// \param NumParts Number of partitions to create. /// \param ModuleCost Total cost of all functions in \p M. /// \param FnCosts Map of Function -> Cost -/// \param WorkList Kernels and their dependencies to process in order. +/// \param WorkList Functions and their dependencies to process in order. /// \returns The created partitions (a vector of size \p NumParts ) static std::vector<DenseSet<const Function *>> doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, CostType ModuleCost, const DenseMap<const Function *, CostType> &FnCosts, - const SmallVector<KernelWithDependencies> &WorkList) { + const SmallVector<FunctionWithDependencies> &WorkList) { SML << "\n--Partitioning Starts--\n"; - // Calculate a "large kernel threshold". When more than one kernel's total - // import cost exceeds this value, we will try to merge it with other, - // similarly large kernels. + // Calculate a "large function threshold". When more than one function's total + // import cost exceeds this value, we will try to assign it to an existing + // partition to reduce the amount of duplication needed. // - // e.g. let two kernels X and Y have a import cost of ~10% of the module, we + // e.g. 
let two functions X and Y have a import cost of ~10% of the module, we // assign X to a partition as usual, but when we get to Y, we check if it's // worth also putting it in Y's partition. - const CostType LargeKernelThreshold = - LargeKernelFactor ? CostType(((ModuleCost / NumParts) * LargeKernelFactor)) - : std::numeric_limits<CostType>::max(); + const CostType LargeFnThreshold = + LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor)) + : std::numeric_limits<CostType>::max(); std::vector<DenseSet<const Function *>> Partitions; Partitions.resize(NumParts); - // Assign a partition to each kernel, and try to keep the partitions more or + // Assign functions to partitions, and try to keep the partitions more or // less balanced. We do that through a priority queue sorted in reverse, so we // can always look at the partition with the least content. // // There are some cases where we will be deliberately unbalanced though. - // - Large kernels: we try to merge with existing partitions to reduce code + // - Large functions: we try to merge with existing partitions to reduce code // duplication. - // - Kernels with indirect or external calls always go in the first partition - // (P0). + // - Functions with indirect or external calls always go in the first + // partition (P0). auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a, const std::pair<PartitionID, CostType> &b) { // When two partitions have the same cost, assign to the one with the @@ -471,17 +481,17 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, for (unsigned I = 0; I < NumParts; ++I) BalancingQueue.push_back(std::make_pair(I, 0)); - // Helper function to handle assigning a kernel to a partition. This takes + // Helper function to handle assigning a function to a partition. This takes // care of updating the balancing queue. 
const auto AssignToPartition = [&](PartitionID PID, - const KernelWithDependencies &KWD) { + const FunctionWithDependencies &FWD) { auto &FnsInPart = Partitions[PID]; - FnsInPart.insert(KWD.Fn); - FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end()); + FnsInPart.insert(FWD.Fn); + FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); - SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> "; - if (!KWD.Dependencies.empty()) { - SML << KWD.Dependencies.size() << " dependencies added\n"; + SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> "; + if (!FWD.Dependencies.empty()) { + SML << FWD.Dependencies.size() << " dependencies added\n"; }; // Update the balancing queue. we scan backwards because in the common case @@ -506,44 +516,43 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, sort(BalancingQueue, ComparePartitions); }; - for (auto &CurKernel : WorkList) { - // When a kernel has indirect calls, it must stay in the first partition + for (auto &CurFn : WorkList) { + // When a function has indirect calls, it must stay in the first partition // alongside every reachable non-entry function. This is a nightmare case // for splitting as it severely limits what we can do. - if (CurKernel.HasIndirectCall) { - SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn) + if (CurFn.HasIndirectCall) { + SML << "Function with indirect call(s): " << getName(*CurFn.Fn) << " defaulting to P0\n"; - AssignToPartition(0, CurKernel); + AssignToPartition(0, CurFn); continue; } - // When a kernel has non duplicatable dependencies, we have to keep it in + // When a function has non duplicatable dependencies, we have to keep it in // the first partition as well. This is a conservative approach, a // finer-grained approach could keep track of which dependencies are // non-duplicatable exactly and just make sure they're grouped together. 
- if (CurKernel.HasNonDuplicatableDependecy) { - SML << "Kernel with externally visible dependency " - << getName(*CurKernel.Fn) << " defaulting to P0\n"; - AssignToPartition(0, CurKernel); + if (CurFn.HasNonDuplicatableDependecy) { + SML << "Function with externally visible dependency " + << getName(*CurFn.Fn) << " defaulting to P0\n"; + AssignToPartition(0, CurFn); continue; } - // Be smart with large kernels to avoid duplicating their dependencies. - if (CurKernel.isLarge(LargeKernelThreshold)) { - assert(LargeKernelOverlapForMerge >= 0.0f && - LargeKernelOverlapForMerge <= 1.0f); - SML << "Large Kernel: " << getName(*CurKernel.Fn) + // Be smart with large functions to avoid duplicating their dependencies. + if (CurFn.isLarge(LargeFnThreshold)) { + assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f); + SML << "Large Function: " << getName(*CurFn.Fn) << " - looking for partition with at least " - << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n"; + << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n"; bool Assigned = false; for (const auto &[PID, Fns] : enumerate(Partitions)) { - float Overlap = calculateOverlap(CurKernel.Dependencies, Fns); + float Overlap = calculateOverlap(CurFn.Dependencies, Fns); SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P" << PID << '\n'; - if (Overlap > LargeKernelOverlapForMerge) { + if (Overlap > LargeFnOverlapForMerge) { SML << " selecting P" << PID << '\n'; - AssignToPartition(PID, CurKernel); + AssignToPartition(PID, CurFn); Assigned = true; } } @@ -554,41 +563,34 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, // Normal "load-balancing", assign to partition with least pressure. 
auto [PID, CurCost] = BalancingQueue.back(); - AssignToPartition(PID, CurKernel); + AssignToPartition(PID, CurFn); } - // Work is mostly done now, verify the partioning and add all functions we may - // have missed (= unreachable, or we don't understand how they're reached) to - // P0. - DenseSet<const Function *> AllFunctions; - for (const auto &[Idx, Part] : enumerate(Partitions)) { - CostType Cost = 0; - for (auto *Fn : Part) { - // external linkage functions should exclusively be in the first partition - // at this stage. In theory, we should only ever see external linkage - // functions here if they're kernels, or if they've been added due to a - // kernel using indirect calls somewhere in its CallGraph. - assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn))); - Cost += FnCosts.at(Fn); + if (SML) { + for (const auto &[Idx, Part] : enumerate(Partitions)) { + CostType Cost = 0; + for (auto *Fn : Part) + Cost += FnCosts.at(Fn); + SML << "P" << Idx << " has a total cost of " << Cost << " (" + << format("%0.2f", (float(Cost) / ModuleCost) * 100) + << "% of source module)\n"; } - SML << "P" << Idx << " has a total cost of " << Cost << " (" - << format("%0.2f", (float(Cost) / ModuleCost) * 100) - << "% of source module)\n"; - AllFunctions.insert(Part.begin(), Part.end()); + + SML << "--Partitioning Done--\n\n"; } - // Add missed functions to P0. This will take care of adding things like - // external functions with no callers in the module to P0. This should be - // fairly rare as AMDGPU internalizes everything in most cases, so unused - // internal functions would get removed. + // Check no functions were missed. 
+#ifndef NDEBUG + DenseSet<const Function *> AllFunctions; + for (const auto &Part : Partitions) + AllFunctions.insert(Part.begin(), Part.end()); + for (auto &Fn : M) { if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) { - SML << getName(Fn) << " has no partition assigned, defaulting to P0\n"; - Partitions[0].insert(&Fn); + assert(AllFunctions.contains(&Fn) && "Missed a function?!"); } } - - SML << "--Partitioning Done--\n\n"; +#endif return Partitions; } @@ -604,10 +606,17 @@ static void externalize(GlobalValue &GV) { if (!GV.hasName()) GV.setName("__llvmsplit_unnamed"); } -} // end anonymous namespace -void llvm::splitAMDGPUModule( - const AMDGPUTargetMachine &TM, Module &M, unsigned N, +static bool hasDirectCaller(const Function &Fn) { + for (auto &U : Fn.uses()) { + if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U)) + return true; + } + return false; +} + +static void splitAMDGPUModule( + GetTTIFn GetTTI, Module &M, unsigned N, function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) { SplitModuleLogger SML(M); @@ -648,15 +657,36 @@ void llvm::splitAMDGPUModule( // Start by calculating the cost of every function in the module, as well as // the module's overall cost. DenseMap<const Function *, CostType> FnCosts; - const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts); + const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts); - // Gather every kernel into a WorkList, then sort it by descending total cost - // of the kernel so the biggest kernels are seen first. - SmallVector<KernelWithDependencies> WorkList; + // First, gather ever kernel into the worklist. + SmallVector<FunctionWithDependencies> WorkList; for (auto &Fn : M) { if (isEntryPoint(&Fn) && !Fn.isDeclaration()) WorkList.emplace_back(SML, CG, FnCosts, &Fn); } + + // Then, find missing functions that need to be considered as additional + // roots. 
These can't be called in theory, but in practice we still have to + // handle them to avoid linker errors. + { + DenseSet<const Function *> SeenFunctions; + for (const auto &FWD : WorkList) { + SeenFunctions.insert(FWD.Fn); + SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); + } + + for (auto &Fn : M) { + // If this function is not part of any kernel's dependencies and isn't + // directly called, consider it as a root. + if (!Fn.isDeclaration() && !isEntryPoint(&Fn) && + !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) { + WorkList.emplace_back(SML, CG, FnCosts, &Fn); + } + } + } + + // Sort the worklist so the most expensive roots are seen first. sort(WorkList, [&](auto &A, auto &B) { // Sort by total cost, and if the total cost is identical, sort // alphabetically. @@ -667,13 +697,20 @@ void llvm::splitAMDGPUModule( if (SML) { SML << "Worklist\n"; - for (const auto &KWD : WorkList) { - SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost - << " indirect:" << KWD.HasIndirectCall - << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy + for (const auto &FWD : WorkList) { + SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost + << " indirect:" << FWD.HasIndirectCall + << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy << ")\n"; - for (const auto *Dep : KWD.Dependencies) - SML << " [Dep] " << getName(*Dep) << '\n'; + // Sort function names before printing to ensure determinism. + SmallVector<std::string> SortedDepNames; + SortedDepNames.reserve(FWD.Dependencies.size()); + for (const auto *Dep : FWD.Dependencies) + SortedDepNames.push_back(getName(*Dep)); + sort(SortedDepNames); + + for (const auto &Name : SortedDepNames) + SML << " [dependency] " << Name << '\n'; } } @@ -700,16 +737,8 @@ void llvm::splitAMDGPUModule( std::unique_ptr<Module> MPart( CloneModule(M, VMap, [&](const GlobalValue *GV) { // Functions go in their assigned partition. 
- if (const auto *Fn = dyn_cast<Function>(GV)) { -// Check we don't import an external linkage function in any -// partition other than P0. -#ifndef NDEBUG - if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) { - assert((I == 0) == FnsInPart.contains(Fn)); - } -#endif + if (const auto *Fn = dyn_cast<Function>(GV)) return FnsInPart.contains(Fn); - } if (NeedsConservativeImport(GV)) return true; @@ -742,3 +771,16 @@ void llvm::splitAMDGPUModule( << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100) << "% of original module)\n"; } +} // namespace + +PreservedAnalyses AMDGPUSplitModulePass::run(Module &M, + ModuleAnalysisManager &MAM) { + FunctionAnalysisManager &FAM = + MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & { + return FAM.getResult<TargetIRAnalysis>(F); + }; + splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); + // We don't change the original module. + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h index 6171643bd4ad..d814dedd6f0c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h @@ -12,18 +12,27 @@ #define LLVM_TARGET_AMDGPUSPLITMODULE_H #include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/IR/PassManager.h" #include <memory> namespace llvm { -class Module; -class AMDGPUTargetMachine; - /// Splits the module M into N linkable partitions. The function ModuleCallback /// is called N times passing each individual partition as the MPart argument. 
-void splitAMDGPUModule( - const AMDGPUTargetMachine &TM, Module &M, unsigned N, - function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback); +class AMDGPUSplitModulePass : public PassInfoMixin<AMDGPUSplitModulePass> { +public: + using ModuleCreationCallback = + function_ref<void(std::unique_ptr<Module> MPart)>; + + AMDGPUSplitModulePass(unsigned N, ModuleCreationCallback ModuleCallback) + : N(N), ModuleCallback(ModuleCallback) {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + +private: + unsigned N; + ModuleCreationCallback ModuleCallback; +}; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 0751c8dc8b8b..a8e26f104f58 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -1104,6 +1104,9 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, if (hasFlatScratchInit()) NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); + + if (hasPrivateSegmentSize()) + NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID); } void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index ce997c659094..9162e110aa10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -658,8 +658,7 @@ Error AMDGPUTargetMachine::buildCodeGenPipeline( return CGPB.buildPipeline(MPM, Out, DwoOut, FileType); } -void AMDGPUTargetMachine::registerPassBuilderCallbacks( - PassBuilder &PB, bool PopulateClassToPassNames) { +void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def" #include "llvm/Passes/TargetPassRegistry.inc" @@ -829,8 +828,24 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { bool AMDGPUTargetMachine::splitModule( Module 
&M, unsigned NumParts, - function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const { - splitAMDGPUModule(*this, M, NumParts, ModuleCallback); + function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) { + // FIXME(?): Would be better to use an already existing Analysis/PassManager, + // but all current users of this API don't have one ready and would need to + // create one anyway. Let's hide the boilerplate for now to keep it simple. + + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + + PassBuilder PB(this); + PB.registerModuleAnalyses(MAM); + PB.registerFunctionAnalyses(FAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + ModulePassManager MPM; + MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback)); + MPM.run(M, MAM); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 2cfd232483a8..0f74fbc22fa8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -58,8 +58,7 @@ public: const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC) override; - void registerPassBuilderCallbacks(PassBuilder &PB, - bool PopulateClassToPassNames) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; void registerDefaultAliasAnalyses(AAManager &) override; /// Get the integer value of a null pointer in the given address space. 
@@ -76,7 +75,7 @@ public: bool splitModule(Module &M, unsigned NumParts, function_ref<void(std::unique_ptr<Module> MPart)> - ModuleCallback) const override; + ModuleCallback) override; }; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 437e01c37c6b..1192b49fd1f0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -502,7 +502,6 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, switch (Inst->getIntrinsicID()) { case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2)); @@ -1019,7 +1018,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, Intrinsic::ID IID) const { switch (IID) { - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_is_shared: @@ -1041,7 +1039,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *NewV) const { auto IntrID = II->getIntrinsicID(); switch (IntrID) { - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4)); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index bdb5a8d9a0a0..b08957d22ee7 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1314,6 +1314,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { /// } private: + void createConstantSymbol(StringRef Id, int64_t Val); + 
bool ParseAsAbsoluteExpression(uint32_t &Ret); bool OutOfRangeError(SMRange Range); /// Calculate VGPR/SGPR blocks required for given target, reserved @@ -1331,12 +1333,12 @@ private: /// \param SGPRRange [in] Token range, used for SGPR diagnostics. /// \param VGPRBlocks [out] Result VGPR block count. /// \param SGPRBlocks [out] Result SGPR block count. - bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed, - bool FlatScrUsed, bool XNACKUsed, + bool calculateGPRBlocks(const FeatureBitset &Features, const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, bool XNACKUsed, std::optional<bool> EnableWavefrontSize32, - unsigned NextFreeVGPR, SMRange VGPRRange, - unsigned NextFreeSGPR, SMRange SGPRRange, - unsigned &VGPRBlocks, unsigned &SGPRBlocks); + const MCExpr *NextFreeVGPR, SMRange VGPRRange, + const MCExpr *NextFreeSGPR, SMRange SGPRRange, + const MCExpr *&VGPRBlocks, const MCExpr *&SGPRBlocks); bool ParseDirectiveAMDGCNTarget(); bool ParseDirectiveAMDHSACodeObjectVersion(); bool ParseDirectiveAMDHSAKernel(); @@ -1408,36 +1410,28 @@ public: setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); - { - // TODO: make those pre-defined variables read-only. - // Currently there is none suitable machinery in the core llvm-mc for this. - // MCSymbol::isRedefinable is intended for another purpose, and - // AsmParser::parseDirectiveSet() cannot be specialized for specific target. 
- AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); - MCContext &Ctx = getContext(); - if (ISA.Major >= 6 && isHsaAbi(getSTI())) { - MCSymbol *Sym = - Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_minor")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_stepping")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); - } else { - MCSymbol *Sym = - Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); - } - if (ISA.Major >= 6 && isHsaAbi(getSTI())) { - initializeGprCountSymbol(IS_VGPR); - initializeGprCountSymbol(IS_SGPR); - } else - KernelScope.initialize(getContext()); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); + if (ISA.Major >= 6 && isHsaAbi(getSTI())) { + createConstantSymbol(".amdgcn.gfx_generation_number", ISA.Major); + createConstantSymbol(".amdgcn.gfx_generation_minor", ISA.Minor); + createConstantSymbol(".amdgcn.gfx_generation_stepping", ISA.Stepping); + } else { + createConstantSymbol(".option.machine_version_major", ISA.Major); + createConstantSymbol(".option.machine_version_minor", ISA.Minor); + createConstantSymbol(".option.machine_version_stepping", ISA.Stepping); } + if (ISA.Major >= 6 && isHsaAbi(getSTI())) { + initializeGprCountSymbol(IS_VGPR); + initializeGprCountSymbol(IS_SGPR); + } else + KernelScope.initialize(getContext()); + + for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions()) + createConstantSymbol(Symbol, 
Code); + + createConstantSymbol("UC_VERSION_W64_BIT", 0x2000); + createConstantSymbol("UC_VERSION_W32_BIT", 0x4000); + createConstantSymbol("UC_VERSION_MDP_BIT", 0x8000); } bool hasMIMG_R128() const { @@ -2486,6 +2480,16 @@ bool AMDGPUOperand::isInlineValue() const { // AsmParser //===----------------------------------------------------------------------===// +void AMDGPUAsmParser::createConstantSymbol(StringRef Id, int64_t Val) { + // TODO: make those pre-defined variables read-only. + // Currently there is none suitable machinery in the core llvm-mc for this. + // MCSymbol::isRedefinable is intended for another purpose, and + // AsmParser::parseDirectiveSet() cannot be specialized for specific target. + MCContext &Ctx = getContext(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Id); + Sym->setVariableValue(MCConstantExpr::create(Val, Ctx)); +} + static int getRegClass(RegisterKind Is, unsigned RegWidth) { if (Is == IS_VGPR) { switch (RegWidth) { @@ -5352,41 +5356,64 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) { } bool AMDGPUAsmParser::calculateGPRBlocks( - const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, - bool XNACKUsed, std::optional<bool> EnableWavefrontSize32, - unsigned NextFreeVGPR, SMRange VGPRRange, unsigned NextFreeSGPR, - SMRange SGPRRange, unsigned &VGPRBlocks, unsigned &SGPRBlocks) { + const FeatureBitset &Features, const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, bool XNACKUsed, + std::optional<bool> EnableWavefrontSize32, const MCExpr *NextFreeVGPR, + SMRange VGPRRange, const MCExpr *NextFreeSGPR, SMRange SGPRRange, + const MCExpr *&VGPRBlocks, const MCExpr *&SGPRBlocks) { // TODO(scott.linder): These calculations are duplicated from // AMDGPUAsmPrinter::getSIProgramInfo and could be unified. 
IsaVersion Version = getIsaVersion(getSTI().getCPU()); + MCContext &Ctx = getContext(); - unsigned NumVGPRs = NextFreeVGPR; - unsigned NumSGPRs = NextFreeSGPR; + const MCExpr *NumSGPRs = NextFreeSGPR; + int64_t EvaluatedSGPRs; if (Version.Major >= 10) - NumSGPRs = 0; + NumSGPRs = MCConstantExpr::create(0, Ctx); else { unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI()); - if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && - NumSGPRs > MaxAddressableNumSGPRs) + if (NumSGPRs->evaluateAsAbsolute(EvaluatedSGPRs) && Version.Major >= 8 && + !Features.test(FeatureSGPRInitBug) && + static_cast<uint64_t>(EvaluatedSGPRs) > MaxAddressableNumSGPRs) return OutOfRangeError(SGPRRange); - NumSGPRs += - IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed); + const MCExpr *ExtraSGPRs = + AMDGPUMCExpr::createExtraSGPRs(VCCUsed, FlatScrUsed, XNACKUsed, Ctx); + NumSGPRs = MCBinaryExpr::createAdd(NumSGPRs, ExtraSGPRs, Ctx); - if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && - NumSGPRs > MaxAddressableNumSGPRs) + if (NumSGPRs->evaluateAsAbsolute(EvaluatedSGPRs) && + (Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && + static_cast<uint64_t>(EvaluatedSGPRs) > MaxAddressableNumSGPRs) return OutOfRangeError(SGPRRange); if (Features.test(FeatureSGPRInitBug)) - NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; - } + NumSGPRs = + MCConstantExpr::create(IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG, Ctx); + } + + // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks: + // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1 + auto GetNumGPRBlocks = [&Ctx](const MCExpr *NumGPR, + unsigned Granule) -> const MCExpr * { + const MCExpr *OneConst = MCConstantExpr::create(1ul, Ctx); + const MCExpr *GranuleConst = MCConstantExpr::create(Granule, Ctx); + const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx); + const MCExpr *AlignToGPR = + AMDGPUMCExpr::createAlignTo(MaxNumGPR, 
GranuleConst, Ctx); + const MCExpr *DivGPR = + MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx); + const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx); + return SubGPR; + }; - VGPRBlocks = IsaInfo::getEncodedNumVGPRBlocks(&getSTI(), NumVGPRs, - EnableWavefrontSize32); - SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs); + VGPRBlocks = GetNumGPRBlocks( + NextFreeVGPR, + IsaInfo::getVGPREncodingGranule(&getSTI(), EnableWavefrontSize32)); + SGPRBlocks = + GetNumGPRBlocks(NumSGPRs, IsaInfo::getSGPREncodingGranule(&getSTI())); return false; } @@ -5410,14 +5437,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { IsaVersion IVersion = getIsaVersion(getSTI().getCPU()); + const MCExpr *ZeroExpr = MCConstantExpr::create(0, getContext()); + const MCExpr *OneExpr = MCConstantExpr::create(1, getContext()); + SMRange VGPRRange; - uint64_t NextFreeVGPR = 0; - uint64_t AccumOffset = 0; + const MCExpr *NextFreeVGPR = ZeroExpr; + const MCExpr *AccumOffset = MCConstantExpr::create(0, getContext()); uint64_t SharedVGPRCount = 0; uint64_t PreloadLength = 0; uint64_t PreloadOffset = 0; SMRange SGPRRange; - uint64_t NextFreeSGPR = 0; + const MCExpr *NextFreeSGPR = ZeroExpr; // Count the number of user SGPRs implied from the enabled feature bits. unsigned ImpliedUserSGPRCount = 0; @@ -5425,8 +5455,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { // Track if the asm explicitly contains the directive for the user SGPR // count. 
std::optional<unsigned> ExplicitUserSGPRCount; - bool ReserveVCC = true; - bool ReserveFlatScr = true; + const MCExpr *ReserveVCC = OneExpr; + const MCExpr *ReserveFlatScr = OneExpr; std::optional<bool> EnableWavefrontSize32; while (true) { @@ -5620,34 +5650,29 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, ExprVal, ValRange); } else if (ID == ".amdhsa_next_free_vgpr") { - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); VGPRRange = ValRange; - NextFreeVGPR = Val; + NextFreeVGPR = ExprVal; } else if (ID == ".amdhsa_next_free_sgpr") { - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); SGPRRange = ValRange; - NextFreeSGPR = Val; + NextFreeSGPR = ExprVal; } else if (ID == ".amdhsa_accum_offset") { if (!isGFX90A()) return Error(IDRange.Start, "directive requires gfx90a+", IDRange); - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); - AccumOffset = Val; + AccumOffset = ExprVal; } else if (ID == ".amdhsa_reserve_vcc") { - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); - if (!isUInt<1>(Val)) + if (EvaluatableExpr && !isUInt<1>(Val)) return OutOfRangeError(ValRange); - ReserveVCC = Val; + ReserveVCC = ExprVal; } else if (ID == ".amdhsa_reserve_flat_scratch") { - EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 7) return Error(IDRange.Start, "directive requires gfx7+", IDRange); if (hasArchitectedFlatScratch()) return Error(IDRange.Start, "directive is not supported with architected flat scratch", IDRange); - if (!isUInt<1>(Val)) + if (EvaluatableExpr && !isUInt<1>(Val)) return OutOfRangeError(ValRange); - ReserveFlatScr = Val; + ReserveFlatScr = ExprVal; } else if (ID == ".amdhsa_reserve_xnack_mask") { if (IVersion.Major < 8) return Error(IDRange.Start, "directive requires gfx8+", IDRange); @@ -5771,8 +5796,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (!Seen.contains(".amdhsa_next_free_sgpr")) return TokError(".amdhsa_next_free_sgpr directive is required"); - unsigned VGPRBlocks; - unsigned SGPRBlocks; + const MCExpr 
*VGPRBlocks; + const MCExpr *SGPRBlocks; if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr, getTargetStreamer().getTargetID()->isXnackOnOrAny(), EnableWavefrontSize32, NextFreeVGPR, @@ -5780,19 +5805,26 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { SGPRBlocks)) return true; - if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( - VGPRBlocks)) + int64_t EvaluatedVGPRBlocks; + bool VGPRBlocksEvaluatable = + VGPRBlocks->evaluateAsAbsolute(EvaluatedVGPRBlocks); + if (VGPRBlocksEvaluatable && + !isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( + static_cast<uint64_t>(EvaluatedVGPRBlocks))) { return OutOfRangeError(VGPRRange); + } AMDGPU::MCKernelDescriptor::bits_set( - KD.compute_pgm_rsrc1, MCConstantExpr::create(VGPRBlocks, getContext()), + KD.compute_pgm_rsrc1, VGPRBlocks, COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT, COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, getContext()); - if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>( - SGPRBlocks)) + int64_t EvaluatedSGPRBlocks; + if (SGPRBlocks->evaluateAsAbsolute(EvaluatedSGPRBlocks) && + !isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>( + static_cast<uint64_t>(EvaluatedSGPRBlocks))) return OutOfRangeError(SGPRRange); AMDGPU::MCKernelDescriptor::bits_set( - KD.compute_pgm_rsrc1, MCConstantExpr::create(SGPRBlocks, getContext()), + KD.compute_pgm_rsrc1, SGPRBlocks, COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT, COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, getContext()); @@ -5822,16 +5854,28 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (isGFX90A()) { if (!Seen.contains(".amdhsa_accum_offset")) return TokError(".amdhsa_accum_offset directive is required"); - if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3)) + int64_t EvaluatedAccum; + bool AccumEvaluatable = AccumOffset->evaluateAsAbsolute(EvaluatedAccum); + uint64_t UEvaluatedAccum = EvaluatedAccum; + if (AccumEvaluatable && 
+ (UEvaluatedAccum < 4 || UEvaluatedAccum > 256 || (UEvaluatedAccum & 3))) return TokError("accum_offset should be in range [4..256] in " "increments of 4"); - if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4)) + + int64_t EvaluatedNumVGPR; + if (NextFreeVGPR->evaluateAsAbsolute(EvaluatedNumVGPR) && + AccumEvaluatable && + UEvaluatedAccum > + alignTo(std::max((uint64_t)1, (uint64_t)EvaluatedNumVGPR), 4)) return TokError("accum_offset exceeds total VGPR allocation"); - MCKernelDescriptor::bits_set( - KD.compute_pgm_rsrc3, - MCConstantExpr::create(AccumOffset / 4 - 1, getContext()), - COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, - COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, getContext()); + const MCExpr *AdjustedAccum = MCBinaryExpr::createSub( + MCBinaryExpr::createDiv( + AccumOffset, MCConstantExpr::create(4, getContext()), getContext()), + MCConstantExpr::create(1, getContext()), getContext()); + MCKernelDescriptor::bits_set(KD.compute_pgm_rsrc3, AdjustedAccum, + COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, + COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, + getContext()); } if (IVersion.Major >= 10 && IVersion.Major < 12) { @@ -5840,7 +5884,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return TokError("shared_vgpr_count directive not valid on " "wavefront size 32"); } - if (SharedVGPRCount * 2 + VGPRBlocks > 63) { + + if (VGPRBlocksEvaluatable && + (SharedVGPRCount * 2 + static_cast<uint64_t>(EvaluatedVGPRBlocks) > + 63)) { return TokError("shared_vgpr_count*2 + " "compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot " "exceed 63\n"); @@ -8353,7 +8400,7 @@ void AMDGPUAsmParser::onBeginOfFile() { /// max(expr, ...) 
/// bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { - using AGVK = AMDGPUVariadicMCExpr::VariadicKind; + using AGVK = AMDGPUMCExpr::VariantKind; if (isToken(AsmToken::Identifier)) { StringRef TokenId = getTokenStr(); @@ -8383,7 +8430,7 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { "mismatch of commas in " + Twine(TokenId) + " expression"); return true; } - Res = AMDGPUVariadicMCExpr::create(VK, Exprs, getContext()); + Res = AMDGPUMCExpr::create(VK, Exprs, getContext()); return false; } const MCExpr *Expr; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index b05834e5803a..3b8d94b74400 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -399,12 +399,10 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> { RegisterOperand tfeVDataOp = - !if(!eq(RC.Size, 32), AVLdSt_64, - !if(!eq(RC.Size, 64), AVLdSt_96, - !if(!eq(RC.Size, 96), AVLdSt_128, - !if(!eq(RC.Size, 128), AVLdSt_160, - RegisterOperand<VReg_1> // Invalid register. 
- )))); + !cond(!eq(RC.Size, 32) : AVLdSt_64, + !eq(RC.Size, 64) : AVLdSt_96, + !eq(RC.Size, 96) : AVLdSt_128, + !eq(RC.Size, 128) : AVLdSt_160); RegisterOperand ret = !if(isTFE, tfeVDataOp, getLdStRegisterOperand<RC>.ret); } @@ -534,7 +532,7 @@ multiclass MUBUF_Pseudo_Load_Pats_Common<string BaseInst, ValueType load_vt = i3 } multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag>{ - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst, load_vt, ld>; } defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst # "_VBUFFER", load_vt, ld>; @@ -631,7 +629,7 @@ multiclass MUBUF_Pseudo_Store_Pats_Common<string BaseInst, ValueType store_vt = } multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst, store_vt, st>; } defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst # "_VBUFFER", store_vt, st>; @@ -1151,27 +1149,21 @@ let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag >; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < "buffer_atomic_fmin", VGPR_32, f32, null_frag >; defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < "buffer_atomic_fmax", VGPR_32, f32, null_frag >; - } let SubtargetPredicate = isGFX6GFX7GFX10 in { - defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag >; -defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmin_x2", VReg_64, f64, null_frag ->; -defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmax_x2", VReg_64, f64, null_frag ->; - } let 
SubtargetPredicate = HasD16LoadStore in { @@ -1235,12 +1227,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16 >; -let OtherPredicates = [HasAtomicFaddRtnInsts] in +let SubtargetPredicate = HasAtomicFaddRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< "buffer_atomic_add_f32", VGPR_32, f32, null_frag >; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; @@ -1249,7 +1241,9 @@ let SubtargetPredicate = isGFX12Plus in { defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics < "buffer_atomic_cond_sub_u32", VGPR_32, i32 >; +} +let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in { let FPAtomic = 1 in defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics < "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16 @@ -1320,6 +1314,9 @@ let SubtargetPredicate = isGFX90APlus in { let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>; + + // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2 + // depending on some subtargets. defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>; defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 @@ -1421,18 +1418,22 @@ let OtherPredicates = [HasPackedD16VMem] in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. 
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; -defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">; +foreach vt = Reg32Types.types in { +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORD">; +} + +foreach vt = Reg64Types.types in { +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX2">; +} + +foreach vt = Reg96Types.types in { +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX3">; +} + +foreach vt = Reg128Types.types in { +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX4">; +} + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">; @@ -1495,6 +1496,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">; +defm : 
MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; @@ -1521,18 +1523,22 @@ let OtherPredicates = [HasPackedD16VMem] in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; -defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">; +foreach vt = Reg32Types.types in { +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORD">; +} + +foreach vt = Reg64Types.types in { +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX2">; +} + +foreach vt = Reg96Types.types in { +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX3">; +} + +foreach vt = Reg128Types.types in { +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX4">; +} + defm : 
MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; @@ -1545,7 +1551,7 @@ multiclass BufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, bi defvar Op = !cast<SDPatternOperator>(OpPrefix # !if(!eq(RtnMode, "ret"), "", "_noret") - # !if(isIntr, "", "_" # vt.Size)); + # !if(isIntr, "", "_" # vt)); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { @@ -1582,7 +1588,7 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global" # !if(!eq(RtnMode, "ret"), "", "_noret") - # "_" # vt.Size); + # "_" # vt); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass; @@ -1641,6 +1647,16 @@ defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" # } // end foreach Ty +let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { +defm : BufferAtomicPat<"atomic_load_fmin_global", f32, "BUFFER_ATOMIC_FMIN">; +defm : BufferAtomicPat<"atomic_load_fmax_global", f32, "BUFFER_ATOMIC_FMAX">; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { +defm : BufferAtomicPat<"atomic_load_fmin_global", f64, "BUFFER_ATOMIC_MIN_F64">; +defm : BufferAtomicPat<"atomic_load_fmax_global", f64, "BUFFER_ATOMIC_MAX_F64">; +} + defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">; defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">; @@ -1695,9 +1711,11 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, list<string> RtnModes = ["ret", "noret"]> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst, 
RtnModes>; } + + // FIXME: This needs a !HasUnrestrictedSOffset predicate defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst # "_VBUFFER", RtnModes>; } @@ -1728,24 +1746,29 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; -let OtherPredicates = [HasAtomicCSubNoRtnInsts] in +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>; +let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16">; +} + let SubtargetPredicate = isGFX12Plus in { - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">; defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>; +} - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>; +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in { +defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>; } -let OtherPredicates = [isGFX6GFX7GFX10Plus] in { +let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; } -let SubtargetPredicate = isGFX6GFX7GFX10 in { - defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_FMIN_X2">; - defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_FMAX_X2">; + +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, 
"BUFFER_ATOMIC_MIN_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; } class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag < @@ -1799,33 +1822,28 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, defm : BufferAtomicPatterns_NO_RTN_Common<name, vt, opcode # "_VBUFFER">; } -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { - let SubtargetPredicate = isGFX9Only in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; - - let SubtargetPredicate = isGFX12Plus in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["noret"]>; -} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; +} // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts -let OtherPredicates = [HasAtomicFaddRtnInsts] in +let SubtargetPredicate = HasAtomicFaddRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { - let SubtargetPredicate = isGFX9Only in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; - - let SubtargetPredicate = isGFX12Plus in - defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>; -} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, 
"BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; +} // End SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts -let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in { +let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; +} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 + +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; -} // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 +} //End let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> { foreach RtnMode = ["ret", "noret"] in { @@ -1897,7 +1915,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri } multiclass SIBufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst>; } defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst # "_VBUFFER">; @@ -1948,7 +1966,7 @@ multiclass MUBUFLoad_PatternOffset_Common <string Instr, ValueType vt, multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt, PatFrag ld> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : MUBUFLoad_PatternOffset_Common<Instr, vt, ld>; } defm : MUBUFLoad_PatternOffset_Common<Instr # "_VBUFFER", vt, ld>; @@ -2189,7 +2207,7 @@ multiclass MTBUF_LoadIntrinsicPat_Common<SDPatternOperator name, ValueType vt, multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = 
[HasUnrestrictedSOffset] in { defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode, memoryVt>; } defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>; @@ -2204,7 +2222,7 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; -let OtherPredicates = [HasUnpackedD16VMem] in { +let SubtargetPredicate = HasUnpackedD16VMem in { defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">; @@ -2212,7 +2230,7 @@ let OtherPredicates = [HasUnpackedD16VMem] in { defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. 
-let OtherPredicates = [HasPackedD16VMem] in { +let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">; @@ -2261,7 +2279,7 @@ multiclass MTBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt, multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { - let SubtargetPredicate = HasUnrestrictedSOffset in { + let OtherPredicates = [HasUnrestrictedSOffset] in { defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode, memoryVt>; } defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>; @@ -2276,7 +2294,7 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY" defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; -let OtherPredicates = [HasUnpackedD16VMem] in { +let SubtargetPredicate = HasUnpackedD16VMem in { defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">; @@ -2284,7 +2302,7 @@ let OtherPredicates = [HasUnpackedD16VMem] in { defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. 
-let OtherPredicates = [HasPackedD16VMem] in { +let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">; @@ -2296,6 +2314,12 @@ let OtherPredicates = [HasPackedD16VMem] in { // Target-specific instruction encodings. //===----------------------------------------------------------------------===// +// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the +// specific pseudo (bothen in this case) since any of them will work. +class get_BUF_ps<string name> { + string Mnemonic = !cast<BUF_Pseudo>(name # "_OFFSET").Mnemonic; +} + //===----------------------------------------------------------------------===// // Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// @@ -2327,8 +2351,8 @@ multiclass MUBUF_Real_gfx11<bits<8> op, string real_name = !cast<MUBUF_Pseudo>(N } } -class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> : - Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef> { +class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef, string asmName> : + Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef, asmName> { let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); @@ -2338,9 +2362,10 @@ class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> : let Inst{55} = ps.tfe; } -multiclass MUBUF_Real_gfx10<bits<8> op> { - defvar ps = !cast<MUBUF_Pseudo>(NAME); - def _gfx10 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> { +multiclass MUBUF_Real_gfx10<bits<8> op, string psName = NAME, + string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> { + defvar ps = !cast<MUBUF_Pseudo>(psName); + def _gfx10 : 
Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10, asmName> { let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); let Inst{25} = op{7}; let AssemblerPredicate = isGFX10Only; @@ -2348,9 +2373,10 @@ multiclass MUBUF_Real_gfx10<bits<8> op> { } } -multiclass MUBUF_Real_gfx6_gfx7<bits<8> op> { - defvar ps = !cast<MUBUF_Pseudo>(NAME); - def _gfx6_gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> { +multiclass MUBUF_Real_gfx6_gfx7<bits<8> op, string psName = NAME, + string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> { + defvar ps = !cast<MUBUF_Pseudo>(psName); + def _gfx6_gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, asmName> { let Inst{15} = ps.addr64; let AssemblerPredicate = isGFX6GFX7; let DecoderNamespace = "GFX6GFX7"; @@ -2359,7 +2385,7 @@ multiclass MUBUF_Real_gfx6_gfx7<bits<8> op> { multiclass MUBUF_Real_gfx6<bits<8> op> { defvar ps = !cast<MUBUF_Pseudo>(NAME); - def _gfx6 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> { + def _gfx6 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, ps.Mnemonic> { let Inst{15} = ps.addr64; let AssemblerPredicate = isGFX6; let DecoderNamespace = "GFX6"; @@ -2368,7 +2394,7 @@ multiclass MUBUF_Real_gfx6<bits<8> op> { multiclass MUBUF_Real_gfx7<bits<8> op> { defvar ps = !cast<MUBUF_Pseudo>(NAME); - def _gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> { + def _gfx7 : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI, ps.Mnemonic> { let Inst{15} = ps.addr64; let AssemblerPredicate = isGFX7Only; let DecoderNamespace = "GFX7"; @@ -2445,9 +2471,15 @@ class VBUFFER_Real_gfx12<bits<8> op, BUF_Pseudo ps, string real_name> : multiclass VBUFFER_MUBUF_Real_gfx12<bits<8> op, string real_name> { defvar ps = !cast<MUBUF_Pseudo>(NAME); def _gfx12 : VBUFFER_Real_gfx12<op, ps, real_name> { - // Set the last bit of format to 1 to avoid round-trip issues, as some tools + // Set the format 
field to be 1 to avoid round-trip issues, as some tools // print BUF_FMT_INVALID for format 0. - let Inst{55} = 0b1; + let Inst{61-55} = 0b0000001; + } + // Have a version of the instruction to disassemble to for any other + // format field values. + def _gfx12_format : VBUFFER_Real<op, ps, real_name> { + let AsmVariantName = "NonParsable"; + let DecoderNamespace = "GFX12"; } } @@ -2463,12 +2495,6 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> { // MUBUF - GFX11, GFX12. //===----------------------------------------------------------------------===// -// Shortcut to default Mnemonic from BUF_Pseudo. Hides the cast to the -// specific pseudo (bothen in this case) since any of them will work. -class get_BUF_ps<string name> { - string Mnemonic = !cast<BUF_Pseudo>(name # "_BOTHEN").Mnemonic; -} - // gfx11 instruction that accept both old and new assembler name. class Mnem_gfx11_gfx12 <string mnemonic, string real_name> : AMDGPUMnemonicAlias<mnemonic, real_name> { @@ -2690,18 +2716,20 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op, bit isTFE = 0> { defm _LDS_BOTHEN : MUBUF_Real_gfx10<op>; } } -multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> { - defm _BOTHEN_RTN : MUBUF_Real_gfx10<op>; - defm _IDXEN_RTN : MUBUF_Real_gfx10<op>; - defm _OFFEN_RTN : MUBUF_Real_gfx10<op>; - defm _OFFSET_RTN : MUBUF_Real_gfx10<op>; +multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op, string psName = NAME, + string asmName = !cast<MUBUF_Pseudo>(psName).Mnemonic> { + defm _BOTHEN_RTN : MUBUF_Real_gfx10<op, psName#"_BOTHEN_RTN", asmName>; + defm _IDXEN_RTN : MUBUF_Real_gfx10<op, psName#"_IDXEN_RTN", asmName>; + defm _OFFEN_RTN : MUBUF_Real_gfx10<op, psName#"_OFFEN_RTN", asmName>; + defm _OFFSET_RTN : MUBUF_Real_gfx10<op, psName#"_OFFSET_RTN", asmName>; } -multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> : - MUBUF_Real_Atomics_RTN_gfx10<op> { - defm _BOTHEN : MUBUF_Real_gfx10<op>; - defm _IDXEN : MUBUF_Real_gfx10<op>; - defm _OFFEN : MUBUF_Real_gfx10<op>; - defm 
_OFFSET : MUBUF_Real_gfx10<op>; +multiclass MUBUF_Real_Atomics_gfx10<bits<8> op, string psName = NAME, + string asmName = get_BUF_ps<psName>.Mnemonic> : + MUBUF_Real_Atomics_RTN_gfx10<op, psName, asmName> { + defm _BOTHEN : MUBUF_Real_gfx10<op, psName#"_BOTHEN", asmName>; + defm _IDXEN : MUBUF_Real_gfx10<op, psName#"_IDXEN", asmName>; + defm _OFFEN : MUBUF_Real_gfx10<op, psName#"_OFFEN", asmName>; + defm _OFFSET : MUBUF_Real_gfx10<op, psName#"_OFFSET", asmName>; } defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; @@ -2756,18 +2784,18 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op, bit isTFE = 0> { defm _LDS_BOTHEN : MUBUF_Real_gfx6_gfx7<op>; } } -multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> { - defm _ADDR64 : MUBUF_Real_gfx6_gfx7<op>; - defm _BOTHEN : MUBUF_Real_gfx6_gfx7<op>; - defm _IDXEN : MUBUF_Real_gfx6_gfx7<op>; - defm _OFFEN : MUBUF_Real_gfx6_gfx7<op>; - defm _OFFSET : MUBUF_Real_gfx6_gfx7<op>; +multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op, string psName, string asmName> { + defm _ADDR64 : MUBUF_Real_gfx6_gfx7<op, psName#"_ADDR64", asmName>; + defm _BOTHEN : MUBUF_Real_gfx6_gfx7<op, psName#"_BOTHEN", asmName>; + defm _IDXEN : MUBUF_Real_gfx6_gfx7<op, psName#"_IDXEN", asmName>; + defm _OFFEN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFEN", asmName>; + defm _OFFSET : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFSET", asmName>; - defm _ADDR64_RTN : MUBUF_Real_gfx6_gfx7<op>; - defm _BOTHEN_RTN : MUBUF_Real_gfx6_gfx7<op>; - defm _IDXEN_RTN : MUBUF_Real_gfx6_gfx7<op>; - defm _OFFEN_RTN : MUBUF_Real_gfx6_gfx7<op>; - defm _OFFSET_RTN : MUBUF_Real_gfx6_gfx7<op>; + defm _ADDR64_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_ADDR64_RTN", asmName>; + defm _BOTHEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_BOTHEN_RTN", asmName>; + defm _IDXEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_IDXEN_RTN", asmName>; + defm _OFFEN_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFEN_RTN", asmName>; + defm _OFFSET_RTN : MUBUF_Real_gfx6_gfx7<op, psName#"_OFFSET_RTN", asmName>; 
} multiclass MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<8> op> : @@ -2782,8 +2810,10 @@ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<bits<8> op> { defm _TFE : MUBUF_Real_AllAddr_Lds_Helper_gfx6_gfx7_gfx10<op, 1>; } -multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op> : - MUBUF_Real_Atomics_gfx6_gfx7<op>, MUBUF_Real_Atomics_gfx10<op>; +multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op, string psName = NAME, + string asmName = get_BUF_ps<psName>.Mnemonic> : + MUBUF_Real_Atomics_gfx6_gfx7<op, psName, asmName>, + MUBUF_Real_Atomics_gfx10<op, psName, asmName>; // FIXME-GFX6: Following instructions are available only on GFX6. //defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomics_gfx6 <0x034>; @@ -2843,8 +2873,8 @@ defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>; // FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7. defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; -defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; -defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; +defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f, "BUFFER_ATOMIC_MIN_F64", "buffer_atomic_fmin_x2">; +defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060, "BUFFER_ATOMIC_MAX_F64", "buffer_atomic_fmax_x2">; defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_gfx10<0x034>; @@ -3066,9 +3096,9 @@ multiclass MUBUF_Real_vi_gfx90a<bits<7> op, bit isTFE = 0> : MUBUF_Real_vi<op> { } if ps.FPAtomic then { - let SubtargetPredicate = isGFX90AOnly, - AssemblerPredicate = isGFX90AOnly in - defm NAME : MUBUF_Real_gfx90a<op, 0>; + let AssemblerPredicate = isGFX90AOnly in + defm NAME : MUBUF_Real_gfx90a<op, 0>; + def _gfx940 : MUBUF_Real_gfx940<op, ps>; } } @@ -3251,10 +3281,7 @@ defm BUFFER_WBINVL1_VOL : MUBUF_Real_vi <0x3f>; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; - -let 
SubtargetPredicate = HasAtomicFaddNoRtnInsts in { defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; -} // End SubtargetPredicate = HasAtomicFaddNoRtnInsts let SubtargetPredicate = isGFX90APlus in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 19bb4300531c..219246b71fe8 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -965,16 +965,16 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt)>; } let OtherPredicates = [HasGDS] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt), /* complexity */ 0, /* gds */ 1>; } } @@ -983,24 +983,24 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_m0_"#vt)>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>; + !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt)>; def : 
DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; + !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>; } let OtherPredicates = [HasGDS] in { def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + !cast<PatFrag>(frag#"_region_m0_"#vt), /* complexity */ 0, /* gds */ 1>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + !cast<PatFrag>(frag#"_region_m0_noret_"#vt), /* complexity */ 1, /* gds */ 1>; } } @@ -1019,23 +1019,23 @@ class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt)>; def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size), + !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>; } let OtherPredicates = [HasGDS] in { - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt), /* complexity */ 0, /* gds */ 1>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + def : DSAtomicCmpXChgSwapped<noRetInst, vt, 
!cast<PatFrag>(frag#"_region_m0_noret_"#vt), /* complexity */ 1, /* gds */ 1>; } } @@ -1053,14 +1053,14 @@ class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt)>; def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; + !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>; let OtherPredicates = [HasGDS] in { - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt), /* complexity */ 0, /* gds */ 1>; - def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt), /* complexity */ 1, /* gds */ 1>; } } @@ -1082,6 +1082,12 @@ defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_U32, DS_MAX_U32, i32, "atomic_load_umax defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_F32, DS_MIN_F32, f32, "atomic_load_fmin">; defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax">; + +let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { +defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">; +defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">; +} + let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">; } @@ -1119,9 +1125,9 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp } // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = HasLdsAtomicAddF64 in { -def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>; 
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_f64>; let AddedComplexity = 1 in -def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>; +def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_f64>; class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < @@ -1135,18 +1141,7 @@ def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret } let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { -def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>; -let AddedComplexity = 1 in -def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>; -def : GCNPat < - (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)), - (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) ->; -let AddedComplexity = 1 in -def : GCNPat < - (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), - (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) ->; +defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">; } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts let OtherPredicates = [HasGDS] in diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 05063c6c321a..76a559c9443b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -21,6 +21,7 @@ #include "SIDefines.h" #include "SIRegisterInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm-c/DisassemblerTypes.h" #include "llvm/BinaryFormat/ELF.h" @@ -52,6 +53,13 @@ AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, // ToDo: AMDGPUDisassembler supports only VI ISA. 
if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus()) report_fatal_error("Disassembly not yet supported for subtarget"); + + for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions()) + createConstantSymbolExpr(Symbol, Code); + + UCVersionW64Expr = createConstantSymbolExpr("UC_VERSION_W64_BIT", 0x2000); + UCVersionW32Expr = createConstantSymbolExpr("UC_VERSION_W32_BIT", 0x4000); + UCVersionMDPExpr = createConstantSymbolExpr("UC_VERSION_MDP_BIT", 0x8000); } void AMDGPUDisassembler::setABIVersion(unsigned Version) { @@ -421,6 +429,13 @@ DECODE_SDWA(Src32) DECODE_SDWA(Src16) DECODE_SDWA(VopcDst) +static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm, + uint64_t /* Addr */, + const MCDisassembler *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + return addOperand(Inst, DAsm->decodeVersionImm(Imm)); +} + #include "AMDGPUGenDisassemblerTables.inc" //===----------------------------------------------------------------------===// @@ -1727,6 +1742,41 @@ MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const { return MCOperand::createImm(Val); } +MCOperand AMDGPUDisassembler::decodeVersionImm(unsigned Imm) const { + using VersionField = AMDGPU::EncodingField<7, 0>; + using W64Bit = AMDGPU::EncodingBit<13>; + using W32Bit = AMDGPU::EncodingBit<14>; + using MDPBit = AMDGPU::EncodingBit<15>; + using Encoding = AMDGPU::EncodingFields<VersionField, W64Bit, W32Bit, MDPBit>; + + auto [Version, W64, W32, MDP] = Encoding::decode(Imm); + + // Decode into a plain immediate if any unused bits are raised. 
+ if (Encoding::encode(Version, W64, W32, MDP) != Imm) + return MCOperand::createImm(Imm); + + const auto &Versions = AMDGPU::UCVersion::getGFXVersions(); + auto I = find_if(Versions, + [Version = Version](const AMDGPU::UCVersion::GFXVersion &V) { + return V.Code == Version; + }); + MCContext &Ctx = getContext(); + const MCExpr *E; + if (I == Versions.end()) + E = MCConstantExpr::create(Version, Ctx); + else + E = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(I->Symbol), Ctx); + + if (W64) + E = MCBinaryExpr::createOr(E, UCVersionW64Expr, Ctx); + if (W32) + E = MCBinaryExpr::createOr(E, UCVersionW32Expr, Ctx); + if (MDP) + E = MCBinaryExpr::createOr(E, UCVersionMDPExpr, Ctx); + + return MCOperand::createExpr(E); +} + bool AMDGPUDisassembler::isVI() const { return STI.hasFeature(AMDGPU::FeatureVolcanicIslands); } @@ -2312,6 +2362,15 @@ Expected<bool> AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, return false; } +const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id, + int64_t Val) { + MCContext &Ctx = getContext(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Id); + assert(!Sym->isVariable()); + Sym->setVariableValue(MCConstantExpr::create(Val, Ctx)); + return MCSymbolRefExpr::create(Sym, Ctx); +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 2061d83af3da..694cd7a9bfd2 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -102,6 +102,11 @@ private: mutable bool HasLiteral; mutable std::optional<bool> EnableWavefrontSize32; unsigned CodeObjectVersion; + const MCExpr *UCVersionW64Expr; + const MCExpr *UCVersionW32Expr; + const MCExpr *UCVersionMDPExpr; + + const MCExpr 
*createConstantSymbolExpr(StringRef Id, int64_t Val); public: AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, @@ -264,6 +269,8 @@ public: MCOperand decodeSplitBarrier(unsigned Val) const; MCOperand decodeDpp8FI(unsigned Val) const; + MCOperand decodeVersionImm(unsigned Imm) const; + int getTTmpIdx(unsigned Val) const; const MCInstrInfo *getMCII() const { return MCII.get(); } diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 3767dd0b6d47..280def5440c8 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -322,25 +322,25 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$ $ptr), sub1)>; defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_NORET, - atomic_swap_global_noret_32>; + atomic_swap_global_noret_i32>; defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_NORET, - atomic_load_add_global_noret_32>; + atomic_load_add_global_noret_i32>; defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_NORET, - atomic_load_sub_global_noret_32>; + atomic_load_sub_global_noret_i32>; defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_NORET, - atomic_load_min_global_noret_32>; + atomic_load_min_global_noret_i32>; defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_NORET, - atomic_load_umin_global_noret_32>; + atomic_load_umin_global_noret_i32>; defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_NORET, - atomic_load_max_global_noret_32>; + atomic_load_max_global_noret_i32>; defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_NORET, - atomic_load_umax_global_noret_32>; + atomic_load_umax_global_noret_i32>; defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_NORET, - atomic_load_and_global_noret_32>; + atomic_load_and_global_noret_i32>; defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_NORET, - atomic_load_or_global_noret_32>; + atomic_load_or_global_noret_i32>; defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_NORET, - 
atomic_load_xor_global_noret_32>; + atomic_load_xor_global_noret_i32>; // Should be predicated on FeatureFP64 // def FMA_64 : R600_3OP < @@ -712,37 +712,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", [(truncstorei16_local i32:$src1, i32:$src0)] >; def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", - [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_add_local_i32 i32:$src0, i32:$src1))] >; def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", - [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_sub_local_i32 i32:$src0, i32:$src1))] >; def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", - [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_and_local_i32 i32:$src0, i32:$src1))] >; def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", - [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_or_local_i32 i32:$src0, i32:$src1))] >; def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", - [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_xor_local_i32 i32:$src0, i32:$src1))] >; def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", - [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_min_local_i32 i32:$src0, i32:$src1))] >; def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", - [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_max_local_i32 i32:$src0, i32:$src1))] >; def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", - [(set i32:$dst, (atomic_load_umin_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_umin_local_i32 i32:$src0, i32:$src1))] >; def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", - [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))] + [(set 
i32:$dst, (atomic_load_umax_local_i32 i32:$src0, i32:$src1))] >; def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", - [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_swap_local_i32 i32:$src0, i32:$src1))] >; def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", - [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))] + [(set i32:$dst, (atomic_cmp_swap_local_i32 i32:$src0, i32:$src1, i32:$src2))] >; def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", [(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))] diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index aab19b8adc27..98054dde398b 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -752,25 +752,29 @@ defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2", // GFX7-, GFX10-only flat instructions. let SubtargetPredicate = isGFX7GFX10 in { - defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>; +} // End SubtargetPredicate = isGFX7GFX10 -defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2", - VReg_64, f64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", - VReg_64, f64>; +// The names may be flat_atomic_fmin_x2 on some subtargets, but we +// choose this as the canonical name. 
+let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in { +defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo <"flat_atomic_min_f64", + VReg_64, f64>; -} // End SubtargetPredicate = isGFX7GFX10 +defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo <"flat_atomic_max_f64", + VReg_64, f64>; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { +defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>; +defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>; +} let SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 in { defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>; - defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>; - defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>; defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>; - defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>; - defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = HasBufferFlatGlobalAtomicsF64 let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in { @@ -972,6 +976,15 @@ defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_s defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; let SubtargetPredicate = isGFX12Plus in { + let Uses = [EXEC, M0] in { + defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; + defm GLOBAL_STORE_BLOCK : FLAT_Global_Store_Pseudo <"global_store_block", VReg_1024>; + } + let Uses = [EXEC, FLAT_SCR, M0] in { + defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VReg_1024>; + defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VReg_1024>; + } + let WaveSizePredicate = isWave32 in { let Mnemonic = "global_load_tr_b128" 
in defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>; @@ -995,10 +1008,6 @@ let SubtargetPredicate = isGFX10Plus in { FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>; defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>; - defm GLOBAL_ATOMIC_FMIN_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>; - defm GLOBAL_ATOMIC_FMAX_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; } // End SubtargetPredicate = isGFX10Plus let OtherPredicates = [HasAtomicFaddNoRtnInsts] in @@ -1105,7 +1114,7 @@ multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addr multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : - FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt.Size), vt, data_vt>; + FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>; multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt, @@ -1123,7 +1132,7 @@ multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSp multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : - FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt.Size), vt, data_vt>; + FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>; multiclass FlatAtomicPat <string inst, string node, ValueType vt, @@ -1155,8 +1164,8 @@ class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt, ValueType data_vt = vt, int complexity = 0, bit isIntr = 0> { - defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size)); - defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); + defvar rtnNode = !cast<SDPatternOperator>(node # 
!if(isIntr, "", "_" # vt)); + defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt)); let AddedComplexity = complexity in def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>; @@ -1165,21 +1174,6 @@ multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt, def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>; } -multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt, - ValueType data_vt = vt> { - defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>; -} - -multiclass FlatSignedAtomicPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix, - ValueType vt, ValueType data_vt = vt> { - defvar noRtnNode = !cast<PatFrags>(intr # "_noret_" # addrSpaceSuffix); - defvar rtnNode = !cast<PatFrags>(intr # "_" # addrSpaceSuffix); - - let AddedComplexity = 1 in - def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>; - def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>; -} - class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), (inst $vaddr, $offset) @@ -1280,11 +1274,11 @@ multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt, multiclass GlobalFLATAtomicPatsNoRtn<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : - GlobalFLATAtomicPatsNoRtnBase<inst, node # "_noret" # !if(isIntr, "", "_" # vt.Size), vt, data_vt>; + GlobalFLATAtomicPatsNoRtnBase<inst, node # "_noret" # !if(isIntr, "", "_" # vt), vt, data_vt>; multiclass GlobalFLATAtomicPatsRtn<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : - GlobalFLATAtomicPatsRtnBase<inst, node # !if(isIntr, "", "_" # vt.Size), vt, data_vt>; + GlobalFLATAtomicPatsRtnBase<inst, node # !if(isIntr, "", "_" # vt), vt, data_vt>; multiclass 
GlobalFLATAtomicPats<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : @@ -1431,6 +1425,17 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_OR_X2", "atomic_load_or_"#as, i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP_X2", "atomic_swap_"#as, i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64, v2i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>; + +let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in { +defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_"#as, f32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_"#as, f32>; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in { +defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; +} + } // end foreach as let SubtargetPredicate = isGFX12Plus in { @@ -1592,37 +1597,26 @@ let OtherPredicates = [isGFX12Plus] in { } } -let OtherPredicates = [isGFX10Plus] in { +let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; -} - -let OtherPredicates = [isGFX10GFX11] in { defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>; - -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>; } -let OtherPredicates = [isGFX10Only] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", 
"atomic_load_fmin_global", f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_fmin", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>; +let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in { +defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>; } let OtherPredicates = [isGFX12Only] in { + // FIXME: Remove these intrinsics defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; - defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; - defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; + defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; + defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; } let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { @@ -1645,37 +1639,44 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", 
"int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; } -let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; +let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; -defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>; -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>; -defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; +} + +let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in { +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; +} + +let OtherPredicates = 
[HasBufferFlatGlobalAtomicsF64] in { +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; +defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; +defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>; } let OtherPredicates = [HasFlatAtomicFaddF32Inst] in { -defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; } let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { -defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>; -defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; +defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; } let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; - +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, 
EnableFlatScratch] in { @@ -1745,8 +1746,8 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f // CI //===----------------------------------------------------------------------===// -class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> : - FLAT_Real <op, ps>, +class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps, string asmName = ps.Mnemonic> : + FLAT_Real <op, ps, asmName>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> { let AssemblerPredicate = isGFX7Only; let DecoderNamespace="GFX7"; @@ -1768,10 +1769,13 @@ def FLAT_STORE_DWORDX2_ci : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>; def FLAT_STORE_DWORDX4_ci : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>; def FLAT_STORE_DWORDX3_ci : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>; -multiclass FLAT_Real_Atomics_ci <bits<7> op> { - defvar ps = !cast<FLAT_Pseudo>(NAME); - def _ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>; - def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>; +multiclass FLAT_Real_Atomics_ci <bits<7> op, string opName = NAME, + string asmName = !cast<FLAT_Pseudo>(opName).Mnemonic> { + defvar ps = !cast<FLAT_Pseudo>(opName); + defvar ps_rtn = !cast<FLAT_Pseudo>(opName#"_RTN"); + + def _ci : FLAT_Real_ci<op, ps, asmName>; + def _RTN_ci : FLAT_Real_ci<op, ps_rtn, asmName>; } defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_ci <0x30>; @@ -1806,8 +1810,8 @@ defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_ci <0x3e>; defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_ci <0x3f>; defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_ci <0x40>; defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_ci <0x5e>; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_ci <0x5f, "FLAT_ATOMIC_MIN_F64", "flat_atomic_fmin_x2">; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, "FLAT_ATOMIC_MAX_F64", "flat_atomic_fmax_x2">; //===----------------------------------------------------------------------===// @@ 
-2089,8 +2093,8 @@ let SubtargetPredicate = isGFX940Plus in { // GFX10. //===----------------------------------------------------------------------===// -class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> : - FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> { +class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : + FLAT_Real<op, ps, opName>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> { let AssemblerPredicate = isGFX10Only; let DecoderNamespace = "GFX10"; @@ -2102,25 +2106,28 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> : let Inst{55} = 0; } - -multiclass FLAT_Real_Base_gfx10<bits<7> op> { +multiclass FLAT_Real_Base_gfx10<bits<7> op, string psName = NAME, + string asmName = !cast<FLAT_Pseudo>(psName).Mnemonic> { def _gfx10 : - FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME)>; + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName), asmName>; } -multiclass FLAT_Real_RTN_gfx10<bits<7> op> { +multiclass FLAT_Real_RTN_gfx10<bits<7> op, string psName = NAME, + string asmName = !cast<FLAT_Pseudo>(psName).Mnemonic> { def _RTN_gfx10 : - FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_RTN")>; + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_RTN"), asmName>; } -multiclass FLAT_Real_SADDR_gfx10<bits<7> op> { +multiclass FLAT_Real_SADDR_gfx10<bits<7> op, string psName = NAME, + string asmName = !cast<FLAT_Pseudo>(psName#"_SADDR").Mnemonic> { def _SADDR_gfx10 : - FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>; + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_SADDR"), asmName>; } -multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> { +multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op, string psName = NAME, + string asmName = !cast<FLAT_Pseudo>(psName#"_SADDR_RTN").Mnemonic> { def _SADDR_RTN_gfx10 : - FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>; + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(psName#"_SADDR_RTN"), asmName>; } multiclass FLAT_Real_ST_gfx10<bits<7> op> { @@ -2128,22 +2135,25 @@ multiclass 
FLAT_Real_ST_gfx10<bits<7> op> { FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")>; } -multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> : - FLAT_Real_Base_gfx10<op>, - FLAT_Real_SADDR_gfx10<op>; +multiclass FLAT_Real_AllAddr_gfx10<bits<7> op, string OpName = NAME, + string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> : + FLAT_Real_Base_gfx10<op, OpName, asmName>, + FLAT_Real_SADDR_gfx10<op, OpName, asmName>; -multiclass FLAT_Real_Atomics_gfx10<bits<7> op> : - FLAT_Real_Base_gfx10<op>, - FLAT_Real_RTN_gfx10<op>; +multiclass FLAT_Real_Atomics_gfx10<bits<7> op, string OpName = NAME, + string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> : + FLAT_Real_Base_gfx10<op, OpName, asmName>, + FLAT_Real_RTN_gfx10<op, OpName, asmName>; -multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> : - FLAT_Real_AllAddr_gfx10<op>, - FLAT_Real_RTN_gfx10<op>, - FLAT_Real_SADDR_RTN_gfx10<op>; +multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op, string OpName = NAME, + string asmName = !cast<FLAT_Pseudo>(OpName).Mnemonic> : + FLAT_Real_AllAddr_gfx10<op, OpName, asmName>, + FLAT_Real_RTN_gfx10<op, OpName, asmName>, + FLAT_Real_SADDR_RTN_gfx10<op, OpName, asmName>; -multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> : - FLAT_Real_RTN_gfx10<op>, - FLAT_Real_SADDR_RTN_gfx10<op>; +multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op, string OpName = NAME> : + FLAT_Real_RTN_gfx10<op, OpName>, + FLAT_Real_SADDR_RTN_gfx10<op, OpName>; multiclass FLAT_Real_ScratchAllAddr_gfx10<bits<7> op> : FLAT_Real_Base_gfx10<op>, @@ -2220,8 +2230,8 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_gfx10<0x05b>; defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_gfx10<0x05c>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_gfx10<0x05d>; defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x05e>; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f, "FLAT_ATOMIC_MIN_F64", "flat_atomic_fmin_x2">; 
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060, "FLAT_ATOMIC_MAX_F64", "flat_atomic_fmax_x2">; // ENC_FLAT_GLBL. @@ -2278,8 +2288,8 @@ defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05b>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05c>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>; defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>; -defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>; -defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>; +defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f, "GLOBAL_ATOMIC_MIN_F64", "global_atomic_fmin_x2">; +defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060, "GLOBAL_ATOMIC_MAX_F64", "global_atomic_fmax_x2">; defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>; defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>; @@ -2671,6 +2681,8 @@ defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_s defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; +defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>; +defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>; defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; @@ -2741,3 +2753,6 @@ defm SCRATCH_LOAD_SBYTE_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x22, "scratch_ defm SCRATCH_LOAD_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x23, "scratch_load_d16_hi_b16">; defm SCRATCH_STORE_BYTE_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x24, "scratch_store_d16_hi_b8">; defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_store_d16_hi_b16">; + +defm SCRATCH_LOAD_BLOCK : 
VSCRATCH_Real_AllAddr_gfx12<0x53>; +defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 94d93390d091..217279211531 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -116,31 +116,112 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n"); } +/// Checks whether \p SU can use the cached DAG pressure diffs to compute the +/// current register pressure. +/// +/// This works for the common case, but it has a few exceptions that have been +/// observed through trial and error: +/// - Explicit physical register operands +/// - Subregister definitions +/// +/// In both of those cases, PressureDiff doesn't represent the actual pressure, +/// and querying LiveIntervals through the RegPressureTracker is needed to get +/// an accurate value. +/// +/// We should eventually only use PressureDiff for maximum performance, but this +/// already allows 80% of SUs to take the fast path without changing scheduling +/// at all. Further changes would either change scheduling, or require a lot +/// more logic to recover an accurate pressure estimate from the PressureDiffs. +static bool canUsePressureDiffs(const SUnit &SU) { + if (!SU.isInstr()) + return false; + + // Cannot use pressure diffs for subregister defs or with physregs, it's + // imprecise in both cases. 
+ for (const auto &Op : SU.getInstr()->operands()) { + if (!Op.isReg() || Op.isImplicit()) + continue; + if (Op.getReg().isPhysical() || + (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) + return false; + } + return true; +} + +static void getRegisterPressures(bool AtTop, + const RegPressureTracker &RPTracker, SUnit *SU, + std::vector<unsigned> &Pressure, + std::vector<unsigned> &MaxPressure) { + // getDownwardPressure() and getUpwardPressure() make temporary changes to + // the tracker, so we need to pass those function a non-const copy. + RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); + if (AtTop) + TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); + else + TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); +} + void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, - unsigned VGPRPressure) { + unsigned VGPRPressure, bool IsBottomUp) { Cand.SU = SU; Cand.AtTop = AtTop; if (!DAG->isTrackingPressure()) return; - // getDownwardPressure() and getUpwardPressure() make temporary changes to - // the tracker, so we need to pass those function a non-const copy. - RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); - Pressure.clear(); MaxPressure.clear(); - if (AtTop) - TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); - else { - // FIXME: I think for bottom up scheduling, the register pressure is cached - // and can be retrieved by DAG->getPressureDif(SU). - TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + // We try to use the cached PressureDiffs in the ScheduleDAG whenever + // possible over querying the RegPressureTracker. + // + // RegPressureTracker will make a lot of LIS queries which are very + // expensive, it is considered a slow function in this context. 
+ // + // PressureDiffs are precomputed and cached, and getPressureDiff is just a + // trivial lookup into an array. It is pretty much free. + // + // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of + // PressureDiffs. + if (AtTop || !canUsePressureDiffs(*SU)) { + getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure); + } else { + // Reserve 4 slots. + Pressure.resize(4, 0); + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; + + for (const auto &Diff : DAG->getPressureDiff(SU)) { + if (!Diff.isValid()) + continue; + // PressureDiffs is always bottom-up so if we're working top-down we need + // to invert its sign. + Pressure[Diff.getPSet()] += + (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); + } + +#ifdef EXPENSIVE_CHECKS + std::vector<unsigned> CheckPressure, CheckMaxPressure; + getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure); + if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != + CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != + CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { + errs() << "Register Pressure is inaccurate when calculated through " + "PressureDiff\n" + << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] + << ", expected " + << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" + << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] + << ", expected " + << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n"; + report_fatal_error("inaccurate register pressure calculation"); + } +#endif } unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; @@ -158,7 +239,6 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= 
SGPRExcessLimit; - // FIXME: We have to enter REG-EXCESS before we reach the actual threshold // to increase the likelihood we don't go over the limits. We should improve // the analysis to look through dependencies to find the path with the least @@ -207,7 +287,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand) { + SchedCandidate &Cand, + bool IsBottomUp) { const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = 0; @@ -220,8 +301,8 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, for (SUnit *SU : Q) { SchedCandidate TryCand(ZonePolicy); - initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, - SGPRPressure, VGPRPressure); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, + VGPRPressure, IsBottomUp); // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? 
&Zone : nullptr; tryCandidate(Cand, TryCand, ZoneArg); @@ -262,7 +343,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { if (!BotCand.isValid() || BotCand.SU->isScheduled || BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); - pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, + /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(BotCand)); @@ -270,7 +352,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { if (VerifyScheduling) { SchedCandidate TCand; TCand.reset(CandPolicy()); - pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, + /*IsBottomUp=*/true); assert(TCand.SU == BotCand.SU && "Last pick result should correspond to re-picking right now"); } @@ -282,7 +365,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { if (!TopCand.isValid() || TopCand.SU->isScheduled || TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); - pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, + /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(TopCand)); @@ -290,7 +374,8 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { if (VerifyScheduling) { SchedCandidate TCand; TCand.reset(CandPolicy()); - pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, + /*IsBottomUp=*/false); assert(TCand.SU == TopCand.SU && "Last pick result should correspond to re-picking right now"); } @@ -327,7 +412,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { if (!SU) { CandPolicy NoPolicy; TopCand.reset(NoPolicy); - 
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); + pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, + /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find a candidate"); SU = TopCand.SU; } @@ -337,7 +423,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { if (!SU) { CandPolicy NoPolicy; BotCand.reset(NoPolicy); - pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); + pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, + /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find a candidate"); SU = BotCand.SU; } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 2084aae4128f..f0aea2bc4ab8 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -45,12 +45,12 @@ protected: void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand); + SchedCandidate &Cand, bool IsBottomUp); - void initCandidate(SchedCandidate &Cand, SUnit *SU, - bool AtTop, const RegPressureTracker &RPTracker, - const SIRegisterInfo *SRI, - unsigned SGPRPressure, unsigned VGPRPressure); + void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, + const RegPressureTracker &RPTracker, + const SIRegisterInfo *SRI, unsigned SGPRPressure, + unsigned VGPRPressure, bool IsBottomUp); std::vector<unsigned> Pressure; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index db5b467f2238..07ff855756ec 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -159,6 +159,10 @@ protected: bool HasFP8Insts = false; bool HasFP8ConversionInsts = false; bool HasPkFmacF16Inst = false; + bool HasAtomicFMinFMaxF32GlobalInsts = false; + bool HasAtomicFMinFMaxF64GlobalInsts = false; + bool HasAtomicFMinFMaxF32FlatInsts = false; + bool 
HasAtomicFMinFMaxF64FlatInsts = false; bool HasAtomicDsPkAdd16Insts = false; bool HasAtomicFlatPkAdd16Insts = false; bool HasAtomicFaddRtnInsts = false; @@ -167,6 +171,7 @@ protected: bool HasAtomicBufferGlobalPkAddF16Insts = false; bool HasAtomicCSubNoRtnInsts = false; bool HasAtomicGlobalPkAddBF16Inst = false; + bool HasAtomicBufferPkAddBF16Inst = false; bool HasFlatAtomicFaddF32Inst = false; bool HasDefaultComponentZero = false; bool HasDefaultComponentBroadcast = false; @@ -820,6 +825,22 @@ public: return HasPkFmacF16Inst; } + bool hasAtomicFMinFMaxF32GlobalInsts() const { + return HasAtomicFMinFMaxF32GlobalInsts; + } + + bool hasAtomicFMinFMaxF64GlobalInsts() const { + return HasAtomicFMinFMaxF64GlobalInsts; + } + + bool hasAtomicFMinFMaxF32FlatInsts() const { + return HasAtomicFMinFMaxF32FlatInsts; + } + + bool hasAtomicFMinFMaxF64FlatInsts() const { + return HasAtomicFMinFMaxF64FlatInsts; + } + bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } @@ -844,6 +865,10 @@ public: return HasAtomicGlobalPkAddBF16Inst; } + bool hasAtomicBufferPkAddBF16Inst() const { + return HasAtomicBufferPkAddBF16Inst; + } + bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } @@ -1547,6 +1572,8 @@ public: bool hasFlatScratchInit() const { return FlatScratchInit; } + bool hasPrivateSegmentSize() const { return PrivateSegmentSize; } + unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; } unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; } @@ -1611,6 +1638,8 @@ private: bool FlatScratchInit = false; + bool PrivateSegmentSize = false; + unsigned NumKernargPreloadSGPRs = 0; unsigned NumUsedUserSGPRs = 0; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 
883b6c4407fe..bb5de368810d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -43,7 +43,6 @@ void AMDGPUInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const { void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &OS) { - OS.flush(); printInstruction(MI, Address, STI, OS); printAnnotation(OS, Annot); } @@ -57,9 +56,15 @@ void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isExpr()) { + Op.getExpr()->print(O, &MAI); + return; + } + // It's possible to end up with a 32-bit literal used with a 16-bit operand // with ignored high bits. Print as 32-bit anyway in that case. - int64_t Imm = MI->getOperand(OpNo).getImm(); + int64_t Imm = Op.getImm(); if (isInt<16>(Imm) || isUInt<16>(Imm)) O << formatHex(static_cast<uint64_t>(Imm & 0xffff)); else diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index fb93f45e3e87..b3cca91f6380 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -662,6 +662,11 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128( void AMDGPUMCCodeEmitter::getMachineOpValueCommon( const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + int64_t Val; + if (MO.isExpr() && MO.getExpr()->evaluateAsAbsolute(Val)) { + Op = Val; + return; + } if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { // FIXME: If this is expression is PCRel or not should not depend on what diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index 159664faf983..83fbf4ac53d5 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -21,13 +21,11 @@ using namespace llvm; using namespace llvm::AMDGPU; -AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind, - ArrayRef<const MCExpr *> Args, - MCContext &Ctx) +AMDGPUMCExpr::AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, + MCContext &Ctx) : Kind(Kind), Ctx(Ctx) { assert(Args.size() >= 1 && "Needs a minimum of one expression."); - assert(Kind != AGVK_None && - "Cannot construct AMDGPUVariadicMCExpr of kind none."); + assert(Kind != AGVK_None && "Cannot construct AMDGPUMCExpr of kind none."); // Allocating the variadic arguments through the same allocation mechanism // that the object itself is allocated with so they end up in the same memory. @@ -40,25 +38,23 @@ AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind, this->Args = ArrayRef<const MCExpr *>(RawArgs, Args.size()); } -AMDGPUVariadicMCExpr::~AMDGPUVariadicMCExpr() { Ctx.deallocate(RawArgs); } +AMDGPUMCExpr::~AMDGPUMCExpr() { Ctx.deallocate(RawArgs); } -const AMDGPUVariadicMCExpr * -AMDGPUVariadicMCExpr::create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, - MCContext &Ctx) { - return new (Ctx) AMDGPUVariadicMCExpr(Kind, Args, Ctx); +const AMDGPUMCExpr *AMDGPUMCExpr::create(VariantKind Kind, + ArrayRef<const MCExpr *> Args, + MCContext &Ctx) { + return new (Ctx) AMDGPUMCExpr(Kind, Args, Ctx); } -const MCExpr *AMDGPUVariadicMCExpr::getSubExpr(size_t Index) const { - assert(Index < Args.size() && - "Indexing out of bounds AMDGPUVariadicMCExpr sub-expr"); +const MCExpr *AMDGPUMCExpr::getSubExpr(size_t Index) const { + assert(Index < Args.size() && "Indexing out of bounds AMDGPUMCExpr sub-expr"); return Args[Index]; } -void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS, - const MCAsmInfo *MAI) const { +void AMDGPUMCExpr::printImpl(raw_ostream &OS, 
const MCAsmInfo *MAI) const { switch (Kind) { default: - llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind."); + llvm_unreachable("Unknown AMDGPUMCExpr kind."); case AGVK_Or: OS << "or("; break; @@ -86,21 +82,19 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS, OS << ')'; } -static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1, - int64_t Arg2) { +static int64_t op(AMDGPUMCExpr::VariantKind Kind, int64_t Arg1, int64_t Arg2) { switch (Kind) { default: - llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind."); - case AMDGPUVariadicMCExpr::AGVK_Max: + llvm_unreachable("Unknown AMDGPUMCExpr kind."); + case AMDGPUMCExpr::AGVK_Max: return std::max(Arg1, Arg2); - case AMDGPUVariadicMCExpr::AGVK_Or: + case AMDGPUMCExpr::AGVK_Or: return Arg1 | Arg2; } } -bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const { auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { MCValue MCVal; if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || @@ -112,7 +106,7 @@ bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res, }; assert(Args.size() == 3 && - "AMDGPUVariadic Argument count incorrect for ExtraSGPRs"); + "AMDGPUMCExpr Argument count incorrect for ExtraSGPRs"); const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); uint64_t VCCUsed = 0, FlatScrUsed = 0, XNACKUsed = 0; @@ -129,9 +123,8 @@ bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res, return true; } -bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateTotalNumVGPR(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const { auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { MCValue MCVal; if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || @@ -142,7 +135,7 @@ 
bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res, return true; }; assert(Args.size() == 2 && - "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs"); + "AMDGPUMCExpr Argument count incorrect for TotalNumVGPRs"); const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); uint64_t NumAGPR = 0, NumVGPR = 0; @@ -158,9 +151,8 @@ bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res, return true; } -bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateAlignTo(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const { auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { MCValue MCVal; if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || @@ -172,7 +164,7 @@ bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res, }; assert(Args.size() == 2 && - "AMDGPUVariadic Argument count incorrect for AlignTo"); + "AMDGPUMCExpr Argument count incorrect for AlignTo"); uint64_t Value = 0, Align = 0; if (!TryGetMCExprValue(Args[0], Value) || !TryGetMCExprValue(Args[1], Align)) return false; @@ -181,9 +173,8 @@ bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res, return true; } -bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const { auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) { MCValue MCVal; if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) || @@ -194,7 +185,7 @@ bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res, return true; }; assert(Args.size() == 7 && - "AMDGPUVariadic Argument count incorrect for Occupancy"); + "AMDGPUMCExpr Argument count incorrect for Occupancy"); uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation, NumSGPRs, NumVGPRs; @@ -226,8 +217,9 @@ bool 
AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res, return true; } -bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl( - MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const { +bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { std::optional<int64_t> Total; switch (Kind) { @@ -258,12 +250,12 @@ bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl( return true; } -void AMDGPUVariadicMCExpr::visitUsedExpr(MCStreamer &Streamer) const { +void AMDGPUMCExpr::visitUsedExpr(MCStreamer &Streamer) const { for (const MCExpr *Arg : Args) Streamer.visitUsedExpr(*Arg); } -MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const { +MCFragment *AMDGPUMCExpr::findAssociatedFragment() const { for (const MCExpr *Arg : Args) { if (Arg->findAssociatedFragment()) return Arg->findAssociatedFragment(); @@ -275,18 +267,19 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const { /// are unresolvable but needed for further MCExprs). Derived from /// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp. 
/// -const AMDGPUVariadicMCExpr * -AMDGPUVariadicMCExpr::createExtraSGPRs(const MCExpr *VCCUsed, - const MCExpr *FlatScrUsed, - bool XNACKUsed, MCContext &Ctx) { +const AMDGPUMCExpr *AMDGPUMCExpr::createExtraSGPRs(const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, + bool XNACKUsed, + MCContext &Ctx) { return create(AGVK_ExtraSGPRs, {VCCUsed, FlatScrUsed, MCConstantExpr::create(XNACKUsed, Ctx)}, Ctx); } -const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR( - const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx) { +const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, + const MCExpr *NumVGPR, + MCContext &Ctx) { return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } @@ -295,10 +288,11 @@ const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR( /// Remove dependency on GCNSubtarget and depend only only the necessary values /// for said occupancy computation. Should match computeOccupancy implementation /// without passing \p STM on. -const AMDGPUVariadicMCExpr * -AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, - const GCNSubtarget &STM, MCContext &Ctx) { +const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc, + const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + const GCNSubtarget &STM, + MCContext &Ctx) { unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM); unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index f92350b59235..207a619d45a1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -17,7 +17,7 @@ namespace llvm { class Function; class GCNSubtarget; -/// AMDGPU target specific variadic MCExpr operations. +/// AMDGPU target specific MCExpr operations. 
/// /// Takes in a minimum of 1 argument to be used with an operation. The supported /// operations are: @@ -27,9 +27,9 @@ class GCNSubtarget; /// \note If the 'or'/'max' operations are provided only a single argument, the /// operation will act as a no-op and simply resolve as the provided argument. /// -class AMDGPUVariadicMCExpr : public MCTargetExpr { +class AMDGPUMCExpr : public MCTargetExpr { public: - enum VariadicKind { + enum VariantKind { AGVK_None, AGVK_Or, AGVK_Max, @@ -40,14 +40,13 @@ public: }; private: - VariadicKind Kind; + VariantKind Kind; MCContext &Ctx; const MCExpr **RawArgs; ArrayRef<const MCExpr *> Args; - AMDGPUVariadicMCExpr(VariadicKind Kind, ArrayRef<const MCExpr *> Args, - MCContext &Ctx); - ~AMDGPUVariadicMCExpr(); + AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); + ~AMDGPUMCExpr(); bool evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const; @@ -59,40 +58,39 @@ private: const MCFixup *Fixup) const; public: - static const AMDGPUVariadicMCExpr * - create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); + static const AMDGPUMCExpr * + create(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); - static const AMDGPUVariadicMCExpr *createOr(ArrayRef<const MCExpr *> Args, - MCContext &Ctx) { - return create(VariadicKind::AGVK_Or, Args, Ctx); + static const AMDGPUMCExpr *createOr(ArrayRef<const MCExpr *> Args, + MCContext &Ctx) { + return create(VariantKind::AGVK_Or, Args, Ctx); } - static const AMDGPUVariadicMCExpr *createMax(ArrayRef<const MCExpr *> Args, - MCContext &Ctx) { - return create(VariadicKind::AGVK_Max, Args, Ctx); + static const AMDGPUMCExpr *createMax(ArrayRef<const MCExpr *> Args, + MCContext &Ctx) { + return create(VariantKind::AGVK_Max, Args, Ctx); } - static const AMDGPUVariadicMCExpr *createExtraSGPRs(const MCExpr *VCCUsed, - const MCExpr *FlatScrUsed, - bool XNACKUsed, - MCContext &Ctx); + static const AMDGPUMCExpr 
*createExtraSGPRs(const MCExpr *VCCUsed, + const MCExpr *FlatScrUsed, + bool XNACKUsed, MCContext &Ctx); - static const AMDGPUVariadicMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR, - const MCExpr *NumVGPR, - MCContext &Ctx); + static const AMDGPUMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR, + const MCExpr *NumVGPR, + MCContext &Ctx); - static const AMDGPUVariadicMCExpr * + static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) { - return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx); + return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc, - const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, - const GCNSubtarget &STM, - MCContext &Ctx); + static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, + const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + const GCNSubtarget &STM, + MCContext &Ctx); - VariadicKind getKind() const { return Kind; } + VariantKind getKind() const { return Kind; } const MCExpr *getSubExpr(size_t Index) const; void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index e805e964ffe4..531031b58034 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -319,8 +319,9 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, - const MCKernelDescriptor &KD, uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr) { + const MCKernelDescriptor &KD, const MCExpr *NextVGPR, + const MCExpr *NextSGPR, const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) { IsaVersion IVersion = getIsaVersion(STI.getCPU()); const MCAsmInfo *MAI = 
getContext().getAsmInfo(); @@ -339,16 +340,25 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( OS << '\n'; }; + auto EmitMCExpr = [&](const MCExpr *Value) { + int64_t evaluatableValue; + if (Value->evaluateAsAbsolute(evaluatableValue)) { + OS << static_cast<uint64_t>(evaluatableValue); + } else { + Value->print(OS, MAI); + } + }; + OS << "\t\t.amdhsa_group_segment_fixed_size "; - KD.group_segment_fixed_size->print(OS, MAI); + EmitMCExpr(KD.group_segment_fixed_size); OS << '\n'; OS << "\t\t.amdhsa_private_segment_fixed_size "; - KD.private_segment_fixed_size->print(OS, MAI); + EmitMCExpr(KD.private_segment_fixed_size); OS << '\n'; OS << "\t\t.amdhsa_kernarg_size "; - KD.kernarg_size->print(OS, MAI); + EmitMCExpr(KD.kernarg_size); OS << '\n'; PrintField( @@ -433,8 +443,13 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( ".amdhsa_system_vgpr_workitem_id"); // These directives are required. - OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n'; - OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n'; + OS << "\t\t.amdhsa_next_free_vgpr "; + EmitMCExpr(NextVGPR); + OS << '\n'; + + OS << "\t\t.amdhsa_next_free_sgpr "; + EmitMCExpr(NextSGPR); + OS << '\n'; if (AMDGPU::isGFX90A(STI)) { // MCExpr equivalent of taking the (accum_offset + 1) * 4. 
@@ -447,19 +462,19 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( accum_bits = MCBinaryExpr::createMul( accum_bits, MCConstantExpr::create(4, getContext()), getContext()); OS << "\t\t.amdhsa_accum_offset "; - int64_t IVal; - if (accum_bits->evaluateAsAbsolute(IVal)) { - OS << static_cast<uint64_t>(IVal); - } else { - accum_bits->print(OS, MAI); - } + EmitMCExpr(accum_bits); OS << '\n'; } - if (!ReserveVCC) - OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; - if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI)) - OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n'; + OS << "\t\t.amdhsa_reserve_vcc "; + EmitMCExpr(ReserveVCC); + OS << '\n'; + + if (IVersion.Major >= 7 && !hasArchitectedFlatScratch(STI)) { + OS << "\t\t.amdhsa_reserve_flat_scratch "; + EmitMCExpr(ReserveFlatScr); + OS << '\n'; + } switch (CodeObjectVersion) { default: @@ -915,8 +930,9 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, - const MCKernelDescriptor &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) { + const MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, + const MCExpr *NextSGPR, const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) { auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index e5c90060cb5d..bf1538c71d15 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -94,8 +94,9 @@ public: virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, - uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool 
ReserveFlatScr) {} + const MCExpr *NextVGPR, const MCExpr *NextSGPR, + const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) {} static StringRef getArchNameFromElfMach(unsigned ElfMach); static unsigned getElfMach(StringRef GPU); @@ -151,8 +152,9 @@ public: void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, - uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr) override; + const MCExpr *NextVGPR, const MCExpr *NextSGPR, + const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -207,8 +209,9 @@ public: void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, - uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr) override; + const MCExpr *NextVGPR, const MCExpr *NextSGPR, + const MCExpr *ReserveVCC, + const MCExpr *ReserveFlatScr) override; }; } #endif diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp index 22d0594e2b86..56a23e26b8d9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp @@ -21,7 +21,6 @@ using namespace llvm; void R600InstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &O) { - O.flush(); printInstruction(MI, Address, O); printAnnotation(O, Annot); } diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index 0a96c643d9bd..1a73fdf028c9 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -113,8 +113,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - 
AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -140,9 +140,9 @@ public: FuncRep = &MF; MLI = &getAnalysis<MachineLoopInfo>(); LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); - MDT = &getAnalysis<MachineDominatorTree>(); - LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr);); - PDT = &getAnalysis<MachinePostDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); + LLVM_DEBUG(MDT->print(dbgs());); + PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); LLVM_DEBUG(PDT->print(dbgs());); prepare(); run(); @@ -1629,8 +1629,8 @@ void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 77935cb4cde1..8bac570d59d4 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -103,8 +103,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); 
AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index 59e274787590..64185db02ec1 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -35,8 +35,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index a00ca625fc73..68c5f23c8e11 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -162,8 +162,8 @@ public: StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -173,7 +173,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) @@ -611,8 +611,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); - + MDT = 
&getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 5c411a095587..7bf6a635158e 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1519,6 +1519,9 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { case AMDGPU::V_MAX_F64_e64: case AMDGPU::V_MAX_NUM_F64_e64: case AMDGPU::V_PK_MAX_F16: { + if (MI.mayRaiseFPException()) + return nullptr; + if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) return nullptr; @@ -1565,6 +1568,9 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { if (TII->getClampMask(*Def) != TII->getClampMask(MI)) return false; + if (Def->mayRaiseFPException()) + return false; + MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp); if (!DefClamp) return false; @@ -1650,7 +1656,9 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 || Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 || Op == AMDGPU::V_MUL_F16_fake16_e64) && - MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign)) + MFI->getMode().FP64FP16Denormals.Output != + DenormalMode::PreserveSign) || + MI.mayRaiseFPException()) return std::pair(nullptr, SIOutMods::NONE); const MachineOperand *RegOp = nullptr; @@ -1725,6 +1733,9 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE) return false; + if (Def->mayRaiseFPException()) + return false; + // Clamp is applied after omod. If the source already has clamp set, don't // fold it. 
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4d8667affdb4..83bfb622ee52 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -791,8 +791,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16}) // Split vector operations. setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, - ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, - ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, + ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, + ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, ISD::SSUBSAT}, VT, Custom); @@ -859,19 +859,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, - MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8}, + MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, + MVT::i8}, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, - {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16, - MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16, - MVT::i16, MVT::i8, MVT::i128}, + {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16, + MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16, + MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16, + MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::INTRINSIC_VOID, - {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16, - MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16, - MVT::i8, MVT::i128}, + {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16, + MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16, + MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, + MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); @@ -942,6 +945,8 @@ SITargetLowering::SITargetLowering(const 
TargetMachine &TM, ISD::ATOMIC_LOAD_UMIN, ISD::ATOMIC_LOAD_UMAX, ISD::ATOMIC_LOAD_FADD, + ISD::ATOMIC_LOAD_FMIN, + ISD::ATOMIC_LOAD_FMAX, ISD::ATOMIC_LOAD_UINC_WRAP, ISD::ATOMIC_LOAD_UDEC_WRAP, ISD::INTRINSIC_VOID, @@ -1109,29 +1114,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } -static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) { +static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, + const DataLayout &DL, Type *Ty, + unsigned MaxNumLanes) { assert(MaxNumLanes != 0); + LLVMContext &Ctx = Ty->getContext(); if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements()); - return EVT::getVectorVT(Ty->getContext(), - EVT::getEVT(VT->getElementType()), + return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()), NumElts); } - return EVT::getEVT(Ty); + return TLI.getValueType(DL, Ty); } // Peek through TFE struct returns to only use the data size. -static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) { +static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, + const DataLayout &DL, Type *Ty, + unsigned MaxNumLanes) { auto *ST = dyn_cast<StructType>(Ty); if (!ST) - return memVTFromLoadIntrData(Ty, MaxNumLanes); + return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes); // TFE intrinsics return an aggregate type. 
assert(ST->getNumContainedTypes() == 2 && ST->getContainedType(1)->isIntegerTy(32)); - return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes); + return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes); } /// Map address space 7 to MVT::v5i32 because that's its in-memory @@ -1200,9 +1209,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOVolatile; Info.flags |= MachineMemOperand::MODereferenceable; if (ME.onlyReadsMemory()) { - unsigned MaxNumLanes = 4; - if (RsrcIntr->IsImage) { + unsigned MaxNumLanes = 4; + const AMDGPU::ImageDimIntrinsicInfo *Intr = AMDGPU::getImageDimIntrinsicInfo(IntrID); const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = @@ -1215,9 +1224,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask); } - } - Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes); + Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(), + CI.getType(), MaxNumLanes); + } else { + Info.memVT = + memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(), + std::numeric_limits<unsigned>::max()); + } // FIXME: What does alignment mean for an image? Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1229,9 +1243,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (RsrcIntr->IsImage) { unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); unsigned DMaskLanes = DMask == 0 ? 
1 : llvm::popcount(DMask); - Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes); + Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy, + DMaskLanes); } else - Info.memVT = EVT::getEVT(DataTy); + Info.memVT = getValueType(MF.getDataLayout(), DataTy); Info.flags |= MachineMemOperand::MOStore; } else { @@ -1265,7 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, switch (IntrID) { case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1280,19 +1294,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } - case Intrinsic::amdgcn_buffer_atomic_fadd: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); - Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; - Info.align.reset(); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - - const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); - if (!Vol || !Vol->isZero()) - Info.flags |= MachineMemOperand::MOVolatile; - - return true; - } case Intrinsic::amdgcn_ds_add_gs_reg_rtn: case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1449,7 +1450,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_ordered_add: @@ -1610,6 +1610,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; } + if ((AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && + AM.BaseOffs < 0) { + // Scalar (non-buffer) loads can only use a negative offset if + // soffset+offset is non-negative. 
Since the compiler can only prove that + // in a few special cases, it is safer to claim that negative offsets are + // not supported. + return false; + } + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. return true; @@ -2468,6 +2478,12 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(FlatScratchInitReg); } + if (UserSGPRInfo.hasPrivateSegmentSize()) { + Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI); + MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentSizeReg); + } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read // these from the dispatch pointer. } @@ -5811,6 +5827,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerTRAP(Op, DAG); case ISD::DEBUGTRAP: return lowerDEBUGTRAP(Op, DAG); + case ISD::ABS: case ISD::FABS: case ISD::FNEG: case ISD::FCANONICALIZE: @@ -6097,6 +6114,184 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); } +static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + unsigned ValSize = VT.getSizeInBits(); + unsigned IID = N->getConstantOperandVal(0); + bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16; + SDLoc SL(N); + MVT IntVT = MVT::getIntegerVT(ValSize); + + auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, + SDValue Src2, MVT ValT) -> SDValue { + SmallVector<SDValue, 8> Operands; + switch (IID) { + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + Operands.push_back(N->getOperand(6)); + Operands.push_back(N->getOperand(5)); + Operands.push_back(N->getOperand(4)); + [[fallthrough]]; + case Intrinsic::amdgcn_writelane: + Operands.push_back(Src2); + [[fallthrough]]; + case Intrinsic::amdgcn_readlane: + 
Operands.push_back(Src1); + [[fallthrough]]; + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane64: + Operands.push_back(Src0); + break; + default: + llvm_unreachable("unhandled lane op"); + } + + Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32)); + std::reverse(Operands.begin(), Operands.end()); + + if (SDNode *GL = N->getGluedNode()) { + assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); + GL = GL->getOperand(0).getNode(); + Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, + SDValue(GL, 0))); + } + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands); + }; + + SDValue Src0 = N->getOperand(1); + SDValue Src1, Src2; + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || + IsPermLane16) { + Src1 = N->getOperand(2); + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) + Src2 = N->getOperand(3); + } + + if (ValSize == 32) { + // Already legal + return SDValue(); + } + + if (ValSize < 32) { + bool IsFloat = VT.isFloatingPoint(); + Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, + SL, MVT::i32); + + if (IsPermLane16) { + Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1, + SL, MVT::i32); + } + + if (IID == Intrinsic::amdgcn_writelane) { + Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, + SL, MVT::i32); + } + + SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); + SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); + return IsFloat ? 
DAG.getBitcast(VT, Trunc) : Trunc; + } + + if (ValSize % 32 != 0) + return SDValue(); + + auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { + EVT VT = N->getValueType(0); + unsigned NE = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + SmallVector<SDValue, 8> Scalars; + unsigned NumOperands = N->getNumOperands(); + SmallVector<SDValue, 4> Operands(NumOperands); + SDNode *GL = N->getGluedNode(); + + // only handle convergencectrl_glue + assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); + + for (unsigned i = 0; i != NE; ++i) { + for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e; + ++j) { + SDValue Operand = N->getOperand(j); + EVT OperandVT = Operand.getValueType(); + if (OperandVT.isVector()) { + // A vector operand; extract a single element. + EVT OperandEltVT = OperandVT.getVectorElementType(); + Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT, + Operand, DAG.getVectorIdxConstant(i, SL)); + } else { + // A scalar operand; just use it as is. 
+ Operands[j] = Operand; + } + } + + if (GL) + Operands[NumOperands - 1] = + DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, + SDValue(GL->getOperand(0).getNode(), 0)); + + Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands)); + } + + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE); + return DAG.getBuildVector(VecVT, SL, Scalars); + }; + + if (VT.isVector()) { + switch (MVT::SimpleValueType EltTy = + VT.getVectorElementType().getSimpleVT().SimpleTy) { + case MVT::i32: + case MVT::f32: { + SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT()); + return unrollLaneOp(LaneOp.getNode()); + } + case MVT::i16: + case MVT::f16: + case MVT::bf16: { + MVT SubVecVT = MVT::getVectorVT(EltTy, 2); + SmallVector<SDValue, 4> Pieces; + SDValue Src0SubVec, Src1SubVec, Src2SubVec; + for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) { + Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + if (IsPermLane16) + Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + if (IID == Intrinsic::amdgcn_writelane) + Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + Pieces.push_back( + IsPermLane16 + ? 
createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) + : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); + EltIdx += 2; + } + return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); + } + default: + // Handle all other cases by bitcasting to i32 vectors + break; + } + } + + MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32); + Src0 = DAG.getBitcast(VecVT, Src0); + + if (IsPermLane16) + Src1 = DAG.getBitcast(VecVT, Src1); + + if (IID == Intrinsic::amdgcn_writelane) + Src2 = DAG.getBitcast(VecVT, Src2); + + SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); + SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode()); + return DAG.getBitcast(VT, UnrolledLaneOp); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -8563,6 +8758,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_addrspacecast_nonnull: return lowerADDRSPACECAST(Op, DAG); + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlane64: + return lowerLaneOp(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -8609,12 +8811,6 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, M->getMemOperand()); } -// Return a value to use for the idxen operand by examining the vindex operand. -static unsigned getIdxEn(SDValue VIndex) { - // No need to set idxen if vindex is known to be zero. - return isNullConstant(VIndex) ? 
0 : 1; -} - SDValue SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { @@ -8703,78 +8899,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_ds_fadd: { - MemSDNode *M = cast<MemSDNode>(Op); - unsigned Opc; - switch (IntrID) { - case Intrinsic::amdgcn_ds_fadd: - Opc = ISD::ATOMIC_LOAD_FADD; - break; - } - - return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), - M->getOperand(0), M->getOperand(2), M->getOperand(3), - M->getMemOperand()); - } case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast<MemSDNode>(Op); - unsigned Opc; - switch (IntrID) { - case Intrinsic::amdgcn_ds_fmin: - Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; - break; - case Intrinsic::amdgcn_ds_fmax: - Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; - break; - default: - llvm_unreachable("Unknown intrinsic!"); - } - SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value - }; - - return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: { - unsigned Glc = Op.getConstantOperandVal(5); - unsigned Slc = Op.getConstantOperandVal(6); - unsigned IdxEn = getIdxEn(Op.getOperand(3)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); - - unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); - auto *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, - M->getMemOperand()); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN + : ISD::ATOMIC_LOAD_FMAX; + return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0), + M->getOperand(2), M->getOperand(3), + M->getMemOperand()); } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: @@ -8825,35 +8957,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } - case Intrinsic::amdgcn_tbuffer_load: { - MemSDNode *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); - unsigned Dfmt = Op.getConstantOperandVal(7); - unsigned Nfmt = Op.getConstantOperandVal(8); - unsigned Glc = Op.getConstantOperandVal(9); - unsigned Slc = Op.getConstantOperandVal(10); - unsigned IdxEn = getIdxEn(Op.getOperand(3)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - SOffset, // soffset - Op.getOperand(6), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen - }; - - if (LoadVT.getScalarType() == MVT::f16) - return 
adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, M->getMemOperand(), - DAG); - } case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -8908,94 +9011,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op->getVTList(), Ops, LoadVT, M->getMemOperand(), DAG); } - case Intrinsic::amdgcn_buffer_atomic_swap: - case Intrinsic::amdgcn_buffer_atomic_add: - case Intrinsic::amdgcn_buffer_atomic_sub: - case Intrinsic::amdgcn_buffer_atomic_csub: - case Intrinsic::amdgcn_buffer_atomic_smin: - case Intrinsic::amdgcn_buffer_atomic_umin: - case Intrinsic::amdgcn_buffer_atomic_smax: - case Intrinsic::amdgcn_buffer_atomic_umax: - case Intrinsic::amdgcn_buffer_atomic_and: - case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: - case Intrinsic::amdgcn_buffer_atomic_fadd: { - unsigned Slc = Op.getConstantOperandVal(6); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - - EVT VT = Op.getValueType(); - - auto *M = cast<MemSDNode>(Op); - unsigned Opcode = 0; - - switch (IntrID) { - case Intrinsic::amdgcn_buffer_atomic_swap: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; - break; - case Intrinsic::amdgcn_buffer_atomic_add: - Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; - break; - case Intrinsic::amdgcn_buffer_atomic_sub: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; - break; - case 
Intrinsic::amdgcn_buffer_atomic_csub: - Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB; - break; - case Intrinsic::amdgcn_buffer_atomic_smin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; - break; - case Intrinsic::amdgcn_buffer_atomic_umin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; - break; - case Intrinsic::amdgcn_buffer_atomic_smax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; - break; - case Intrinsic::amdgcn_buffer_atomic_umax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; - break; - case Intrinsic::amdgcn_buffer_atomic_and: - Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; - break; - case Intrinsic::amdgcn_buffer_atomic_or: - Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; - break; - case Intrinsic::amdgcn_buffer_atomic_xor: - Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; - break; - case Intrinsic::amdgcn_buffer_atomic_fadd: - Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; - break; - default: - llvm_unreachable("unhandled atomic opcode"); - } - - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, - M->getMemOperand()); - } case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); - case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: - return lowerRawBufferAtomicIntrin(Op, DAG, - AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); - case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: - return lowerStructBufferAtomicIntrin(Op, DAG, - AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); @@ -9092,29 +9113,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); - 
case Intrinsic::amdgcn_buffer_atomic_cmpswap: { - unsigned Slc = Op.getConstantOperandVal(7); - unsigned IdxEn = getIdxEn(Op.getOperand(5)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - Op.getOperand(5), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); - - EVT VT = Op.getValueType(); - auto *M = cast<MemSDNode>(Op); - - return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, - Op->getVTList(), Ops, VT, M->getMemOperand()); - } case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); @@ -9313,22 +9311,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: { - Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; + Opcode = ISD::ATOMIC_LOAD_FMIN; break; } case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: { - Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; + Opcode = ISD::ATOMIC_LOAD_FMAX; break; } default: llvm_unreachable("unhandled atomic opcode"); } - return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), - M->getVTList(), Ops, M->getMemoryVT(), - M->getMemOperand()); + return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), + Ops, M->getMemOperand()); } case Intrinsic::amdgcn_s_get_barrier_state: { SDValue Chain = Op->getOperand(0); @@ -9557,34 +9554,6 @@ SDValue 
SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(); }; - case Intrinsic::amdgcn_tbuffer_store: { - SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); - if (IsD16) - VData = handleD16VData(VData, DAG); - unsigned Dfmt = Op.getConstantOperandVal(8); - unsigned Nfmt = Op.getConstantOperandVal(9); - unsigned Glc = Op.getConstantOperandVal(10); - unsigned Slc = Op.getConstantOperandVal(11); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Op.getOperand(5), // voffset - Op.getOperand(6), // soffset - Op.getOperand(7), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : - AMDGPUISD::TBUFFER_STORE_FORMAT; - MemSDNode *M = cast<MemSDNode>(Op); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } case Intrinsic::amdgcn_struct_tbuffer_store: case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { @@ -9642,42 +9611,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_buffer_store: - case Intrinsic::amdgcn_buffer_store_format: { - SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); - if (IsD16) - VData = handleD16VData(VData, DAG); - unsigned Glc = Op.getConstantOperandVal(6); - unsigned Slc = Op.getConstantOperandVal(7); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Chain, - VData, - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by 
setBufferOffsets - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - - unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? - AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; - Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; - MemSDNode *M = cast<MemSDNode>(Op); - - // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics - EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) - return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); - - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } - case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_raw_ptr_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: @@ -10083,8 +10016,8 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( return {N0, SDValue(C1, 0)}; } -// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the -// three offsets (voffset, soffset and instoffset) into the SDValue[3] array +// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store +// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. 
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, @@ -10215,7 +10148,7 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType, SDLoc DL, SDValue Ops[], MemSDNode *M) const { - if (VDataType == MVT::f16) + if (VDataType == MVT::f16 || VDataType == MVT::bf16) Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]); SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); @@ -16063,8 +15996,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case ISD::INTRINSIC_W_CHAIN: return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1)); case AMDGPUISD::ATOMIC_CMP_SWAP: - case AMDGPUISD::ATOMIC_LOAD_FMIN: - case AMDGPUISD::ATOMIC_LOAD_FMAX: case AMDGPUISD::BUFFER_ATOMIC_SWAP: case AMDGPUISD::BUFFER_ATOMIC_ADD: case AMDGPUISD::BUFFER_ATOMIC_SUB: @@ -16080,7 +16011,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: - case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: // Target-specific read-modify-write atomics are sources of divergence. 
@@ -16173,6 +16103,26 @@ static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) { << " operation at memory scope " << MemScope; } +static bool isHalf2OrBFloat2(Type *Ty) { + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + Type *EltTy = VT->getElementType(); + return VT->getNumElements() == 2 && + (EltTy->isHalfTy() || EltTy->isBFloatTy()); + } + + return false; +} + +static bool isHalf2(Type *Ty) { + FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); + return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); +} + +static bool isBFloat2(Type *Ty) { + FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); + return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -16231,7 +16181,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { : AtomicExpansionKind::CmpXChg; } - // TODO: Handle v2f16/v2bf16 cases for gfx940 + if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty)) + return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; } @@ -16239,10 +16191,36 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { AS != AMDGPUAS::BUFFER_FAT_POINTER) return AtomicExpansionKind::CmpXChg; - // TODO: gfx940 supports v2f16 and v2bf16 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy())) return AtomicExpansionKind::None; + if (AS == AMDGPUAS::FLAT_ADDRESS) { + // gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) + return AtomicExpansionKind::None; + } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { + // gfx90a, gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) + return AtomicExpansionKind::None; 
+ + // gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) + return AtomicExpansionKind::None; + } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { + // gfx90a, gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) + return AtomicExpansionKind::None; + + // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for + // buffer. gfx12 does have the buffer version. + if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty)) + return AtomicExpansionKind::None; + } + if (unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; @@ -16284,17 +16262,51 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::CmpXChg; } case AtomicRMWInst::FMin: - case AtomicRMWInst::FMax: + case AtomicRMWInst::FMax: { + Type *Ty = RMW->getType(); + + // LDS float and double fmin/fmax were always supported. + if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy())) + return AtomicExpansionKind::None; + + if (unsafeFPAtomicsDisabled(RMW->getFunction())) + return AtomicExpansionKind::CmpXChg; + + // Always expand system scope fp atomics. + if (HasSystemScope) + return AtomicExpansionKind::CmpXChg; + + // For flat and global cases: + // float, double in gfx7. Manual claims denormal support. + // Removed in gfx8. + // float, double restored in gfx10. + // double removed again in gfx11, so only f32 for gfx11/gfx12. + // + // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no + // f32. 
+ // + // FIXME: Check scope and fine grained memory + if (AS == AMDGPUAS::FLAT_ADDRESS) { + if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || + AS == AMDGPUAS::BUFFER_FAT_POINTER) { + if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } + + return AtomicExpansionKind::CmpXChg; + } case AtomicRMWInst::Min: case AtomicRMWInst::Max: case AtomicRMWInst::UMin: case AtomicRMWInst::UMax: { if (AMDGPU::isFlatGlobalAddrSpace(AS) || AS == AMDGPUAS::BUFFER_FAT_POINTER) { - if (RMW->getType()->isFloatTy() && - unsafeFPAtomicsDisabled(RMW->getFunction())) - return AtomicExpansionKind::CmpXChg; - // Always expand system scope min/max atomics. if (HasSystemScope) return AtomicExpansionKind::CmpXChg; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 4c02bb1b306e..1f198a92c0fa 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -253,9 +253,9 @@ public: bool shouldExpandVectorDynExt(SDNode *N) const; private: - // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the - // three offsets (voffset, soffset and instoffset) into the SDValue[3] array - // pointed to by Offsets. + // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store + // the three offsets (voffset, soffset and instoffset) into the SDValue[3] + // array pointed to by Offsets. 
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, Align Alignment = Align(4)) const; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 230443313d72..4c53a081cdb2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -641,7 +641,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<MachineLoopInfo>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); AU.addUsedIfAvailable<AAResultsWrapperPass>(); AU.addPreserved<AAResultsWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -1118,7 +1118,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) @@ -2398,7 +2398,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MLI = &getAnalysis<MachineLoopInfo>(); - PDT = &getAnalysis<MachinePostDominatorTree>(); + PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) AA = &AAR->getAAResults(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d8e21da8019a..cc1b9ac0c9ec 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2519,12 +2519,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } - case AMDGPU::ENTER_PSEUDO_WM: - case AMDGPU::EXIT_PSEUDO_WM: { - // These do nothing. - MI.eraseFromParent(); - break; - } case AMDGPU::SI_RETURN: { const MachineFunction *MF = MBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); @@ -3978,7 +3972,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src0) .add(*Src1) - .addImm(Imm); + .addImm(Imm) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -3997,7 +3992,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src0) .addImm(Imm) - .add(*Src2); + .add(*Src2) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -4018,7 +4014,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Dst) .add(*Src1) .addImm(Imm) - .add(*Src2); + .add(*Src2) + .setMIFlags(MI.getFlags()); updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); @@ -4054,7 +4051,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Src2Mods ? Src2Mods->getImm() : 0) .add(*Src2) .addImm(Clamp ? Clamp->getImm() : 0) - .addImm(Omod ? Omod->getImm() : 0); + .addImm(Omod ? Omod->getImm() : 0) + .setMIFlags(MI.getFlags()); if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) MIB.addImm(OpSel ? 
OpSel->getImm() : 0); updateLiveVariables(LV, MI, *MIB); @@ -5657,24 +5655,9 @@ unsigned SIInstrInfo::buildExtractSubReg( DebugLoc DL = MI->getDebugLoc(); Register SubReg = MRI.createVirtualRegister(SubRC); - if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(SuperReg.getReg(), 0, SubIdx); - return SubReg; - } - - // Just in case the super register is itself a sub-register, copy it to a new - // value so we don't need to worry about merging its subreg index with the - // SubIdx passed to this function. The register coalescer should be able to - // eliminate this extra copy. - Register NewSuperReg = MRI.createVirtualRegister(SuperRC); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) - .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); - + unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(NewSuperReg, 0, SubIdx); - + .addReg(SuperReg.getReg(), 0, NewSubIdx); return SubReg; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 40289f2addfd..c64b3a7c356f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -72,14 +72,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> ]>; -def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - -def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - // load_d16_{lo|hi} ptr, tied_input def SIload_d16 : SDTypeProfile<1, 2, [ SDTCisPtrTy<1>, @@ -222,7 +214,6 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; defm SIbuffer_atomic_csub : SDBufferAtomic 
<"AMDGPUISD::BUFFER_ATOMIC_CSUB">; defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; -defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">; defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">; @@ -315,13 +306,6 @@ class isIntType<ValueType SrcVT> { } //===----------------------------------------------------------------------===// -// PatFrags for global memory operations -//===----------------------------------------------------------------------===// - -defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>; -defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>; - -//===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. // This is for SDNodes and PatFrag for local loads and stores to // enable s_mov_b32 m0, -1 to be glued to the memory instructions. 
@@ -709,15 +693,24 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, >; let AddressSpaces = StoreAddress_local.AddrSpaces in { - defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), - IsInt>; + + if IsInt then { + defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + } else { + defm _local_m0 : binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>; + defm _local_m0 : noret_binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>; + } } let AddressSpaces = StoreAddress_region.AddrSpaces in { - defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), - IsInt>; + if IsInt then { + defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + } else { + defm _region_m0 : binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>; + defm _region_m0 : noret_binary_atomic_op_fp <!cast<SDNode>(NAME#"_glue")>; + } } } @@ -734,8 +727,8 @@ defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>; -defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>; -defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>; +defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 0, SDTAtomic2_f32, 0>; +defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 0, SDTAtomic2_f32, 0>; def as_i1timm : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); @@ -2233,13 +2226,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, // Return an AGPR+VGPR operand class for the given VGPR 
register class. class getLdStRegisterOperand<RegisterClass RC> { RegisterOperand ret = - !if(!eq(RC.Size, 32), AVLdSt_32, - !if(!eq(RC.Size, 64), AVLdSt_64, - !if(!eq(RC.Size, 96), AVLdSt_96, - !if(!eq(RC.Size, 128), AVLdSt_128, - !if(!eq(RC.Size, 160), AVLdSt_160, - RegisterOperand<VReg_1> // invalid register - ))))); + !cond(!eq(RC.Size, 32) : AVLdSt_32, + !eq(RC.Size, 64) : AVLdSt_64, + !eq(RC.Size, 96) : AVLdSt_96, + !eq(RC.Size, 128) : AVLdSt_128, + !eq(RC.Size, 160) : AVLdSt_160, + !eq(RC.Size, 1024) : AVLdSt_1024); } class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32, @@ -2271,6 +2263,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit EnableClamp = _EnableClamp; field bit IsTrue16 = 0; field bit IsRealTrue16 = 0; + field bit IsInvalidSingleUseConsumer = 0; + field bit IsInvalidSingleUseProducer = 0; field ValueType DstVT = ArgVT[0]; field ValueType Src0VT = ArgVT[1]; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index c1b844f844c3..835f44f9d0d6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -217,21 +217,6 @@ def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$m def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>; } // End usesCustomInserter = 1 -// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes. -def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> { - let Uses = [EXEC]; - let Defs = [EXEC]; - let hasSideEffects = 0; - let mayLoad = 0; - let mayStore = 0; -} - -def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> { - let hasSideEffects = 0; - let mayLoad = 0; - let mayStore = 0; -} - // Pseudo instructions used for @llvm.fptrunc.round upward // and @llvm.fptrunc.round downward. 
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD @@ -252,16 +237,22 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), // restoring it after we're done. let Defs = [SCC], isConvergent = 1 in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VSrc_b32: $src, VSrc_b32:$inactive), - [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { -} + (ins VSrc_b32: $src, VSrc_b32:$inactive), []>; def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VSrc_b64: $src, VSrc_b64:$inactive), - [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { -} + (ins VSrc_b64: $src, VSrc_b64:$inactive), []>; } // End Defs = [SCC] +foreach vt = Reg32Types.types in { +def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)), + (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>; +} + +foreach vt = Reg64Types.types in { +def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)), + (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>; +} + def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>; @@ -3398,7 +3389,7 @@ def : GCNPat< // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. 
def : GCNPat< - (int_amdgcn_readfirstlane (i32 imm:$src)), + (i32 (int_amdgcn_readfirstlane (i32 imm:$src))), (S_MOV_B32 SReg_32:$src) >; @@ -3872,11 +3863,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction { let mayStore = 1; } -let Namespace = "AMDGPU" in { -def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP; -def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP; -} - class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, @@ -3901,7 +3887,6 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; -def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction; diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index abb72e8e63c3..afc6353ec811 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -48,8 +48,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -60,7 +60,7 @@ char SILateBranchLowering::ID = 0; INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", 
false, false) @@ -149,7 +149,7 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 5dc3457b5bfa..75a1575f2180 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -149,7 +149,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addUsedIfAvailable<LiveIntervals>(); // Should preserve the same set that TwoAddressInstructions does. - AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); AU.addPreservedID(LiveVariablesID); @@ -764,7 +764,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { LIS = getAnalysisIfAvailable<LiveIntervals>(); // This doesn't actually need LiveVariables, but we can preserve them. LV = getAnalysisIfAvailable<LiveVariables>(); - MDT = getAnalysisIfAvailable<MachineDominatorTree>(); + auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); + MDT = MDTWrapper ? 
&MDTWrapper->getDomTree() : nullptr; MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 32dad0c425c0..a9ee74dec120 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -51,8 +51,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -399,8 +399,8 @@ private: INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, false) @@ -445,8 +445,9 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { MachineFunctionProperties::Property::Selected)) return false; - Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(), - &getAnalysis<MachinePostDominatorTree>()); + Vreg1LoweringHelper Helper( + &TheMF, &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(), + &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree()); bool Changed = false; Changed |= Helper.lowerCopiesFromI1(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 072c5aedc220..d9db0f7a4f53 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -83,7 +83,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, if (CC != CallingConv::AMDGPU_Gfx) 
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; - // TODO: Pick a high register, and shift down, similar to a kernel. FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; @@ -233,6 +232,12 @@ Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { return ArgInfo.FlatScratchInit.getRegister(); } +Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) { + ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR()); + NumUserSGPRs += 1; + return ArgInfo.PrivateSegmentSize.getRegister(); +} + Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9fe02e24c8a1..7af5e7388f84 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -752,6 +752,7 @@ public: Register addKernargSegmentPtr(const SIRegisterInfo &TRI); Register addDispatchID(const SIRegisterInfo &TRI); Register addFlatScratchInit(const SIRegisterInfo &TRI); + Register addPrivateSegmentSize(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); Register addLDSKernelId(); SmallVectorImpl<MCRegister> * diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 8204a70e72d9..18d66e419152 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -148,10 +148,10 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveVariables>(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); AU.addPreserved<LiveVariables>(); - 
AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -618,7 +618,7 @@ char SIOptimizeVGPRLiveRange::ID = 0; INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE, "SI Optimize VGPR LiveRange", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE, @@ -635,7 +635,7 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); Loops = &getAnalysis<MachineLoopInfo>(); LV = &getAnalysis<LiveVariables>(); MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 1fadd8ce45b1..f47731bf6aac 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -37,20 +37,22 @@ STATISTIC(NumSDWAInstructionsPeepholed, namespace { +bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST, + const SIInstrInfo *TII); class SDWAOperand; class SDWADstOperand; -class SIPeepholeSDWA : public MachineFunctionPass { -public: - using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; +using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; +using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>; +class SIPeepholeSDWA : public MachineFunctionPass { private: MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; const SIInstrInfo *TII; MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; - MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches; + 
SDWAOperandsMap PotentialMatches; SmallVector<MachineInstr *, 8> ConvertedInstructions; std::optional<int64_t> foldToImm(const MachineOperand &Op) const; @@ -65,7 +67,6 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); - bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); @@ -93,7 +94,9 @@ public: virtual ~SDWAOperand() = default; - virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) = 0; virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; MachineOperand *getTargetOperand() const { return Target; } @@ -126,7 +129,9 @@ public: : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} - MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getSrcSel() const { return SrcSel; } @@ -153,7 +158,9 @@ public: SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} - MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getDstSel() const { return DstSel; } @@ -327,7 +334,33 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, 
return Mods; } -MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { +MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches) { + if (PotentialMatches != nullptr) { + // Fill out the map for all uses if all can be converted + MachineOperand *Reg = getReplacedOperand(); + if (!Reg->isReg() || !Reg->isDef()) + return nullptr; + + for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) + // Check that all instructions that use Reg can be converted + if (!isConvertibleToSDWA(UseMI, ST, TII)) + return nullptr; + + // Now that it's guaranteed all uses are legal, iterate over the uses again + // to add them for later conversion. + for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { + // Should not get a subregister here + assert(isSameReg(UseMO, *Reg)); + + SDWAOperandsMap &potentialMatchesMap = *PotentialMatches; + MachineInstr *UseMI = UseMO.getParent(); + potentialMatchesMap[UseMI].push_back(this); + } + return nullptr; + } + // For SDWA src operand potential instruction is one that use register // defined by parent instruction MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); @@ -420,7 +453,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { return true; } -MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { +MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches) { // For SDWA dst operand potential instruction is one that defines register // that this operand uses MachineRegisterInfo *MRI = getMRI(); @@ -919,8 +954,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI); } -bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, - const GCNSubtarget &ST) const { +namespace { +bool 
isConvertibleToSDWA(MachineInstr &MI, + const GCNSubtarget &ST, + const SIInstrInfo* TII) { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); if (TII->isSDWA(Opc)) @@ -980,6 +1017,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, return true; } +} // namespace bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands) { @@ -1215,7 +1253,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { matchSDWAOperands(MBB); for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); if (PotentialMI && (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) @@ -1228,8 +1266,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { + MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches); + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index 398f870a9f53..5837dbeb3f98 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -165,19 +165,15 @@ SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM || - Opc == AMDGPU::ENTER_PSEUDO_WM) { + if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) { dbgs() << "Entering "; } else { - assert(Opc == 
AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM || - Opc == AMDGPU::EXIT_PSEUDO_WM); + assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM); dbgs() << "Exiting "; } if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) { dbgs() << "Strict WWM "; - } else if (Opc == AMDGPU::ENTER_PSEUDO_WM || Opc == AMDGPU::EXIT_PSEUDO_WM) { - dbgs() << "Pseudo WWM/WQM "; } else { assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM); dbgs() << "Strict WQM "; @@ -230,16 +226,14 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { } if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM || - MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM || - MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) { + MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) { LLVM_DEBUG(printWWMInfo(MI)); InWWM = true; continue; } if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM || - MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM || - MI.getOpcode() == AMDGPU::EXIT_PSEUDO_WM) { + MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) { LLVM_DEBUG(printWWMInfo(MI)); InWWM = false; } diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 0d40816cdd4b..212edff09783 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -161,45 +161,6 @@ static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift, return Val; } -uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const { - int64_t VBlocks, SBlocks; - VGPRBlocks->evaluateAsAbsolute(VBlocks); - SGPRBlocks->evaluateAsAbsolute(SBlocks); - - uint64_t Reg = S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) | - S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)) | - getComputePGMRSrc1Reg(*this, ST); - - return Reg; -} - -uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, - const GCNSubtarget &ST) const { - if (AMDGPU::isCompute(CC)) { - return getComputePGMRSrc1(ST); - } - int64_t VBlocks, SBlocks; - 
VGPRBlocks->evaluateAsAbsolute(VBlocks); - SGPRBlocks->evaluateAsAbsolute(SBlocks); - - return getPGMRSrc1Reg(*this, CC, ST) | - S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) | - S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)); -} - -uint64_t SIProgramInfo::getComputePGMRSrc2() const { - int64_t ScratchEn; - ScratchEnable->evaluateAsAbsolute(ScratchEn); - return ScratchEn | getComputePGMRSrc2Reg(*this); -} - -uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const { - if (AMDGPU::isCompute(CC)) - return getComputePGMRSrc2(); - - return 0; -} - const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const { uint64_t Reg = getComputePGMRSrc1Reg(*this, ST); diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index e66e5a194c8b..c358a2d9db10 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -98,16 +98,12 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { void reset(const MachineFunction &MF); /// Compute the value of the ComputePGMRsrc1 register. - uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const; - uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const; const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const; const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, MCContext &Ctx) const; /// Compute the value of the ComputePGMRsrc2 register. 
- uint64_t getComputePGMRSrc2() const; - uint64_t getPGMRSrc2(CallingConv::ID CC) const; const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const; const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const; }; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 4b5f9bdd82b8..4c5e60c873bb 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3157,7 +3157,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const { - auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); + auto &MDT = LIS->getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); SlotIndex UseIdx = LIS->getInstructionIndex(Use); SlotIndex DefIdx; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index caac7126068e..f1d9aec16363 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -586,7 +586,9 @@ class RegisterTypes<list<ValueType> reg_types> { def Reg16Types : RegisterTypes<[i16, f16, bf16]>; def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>; -def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>; +def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>; +def Reg96Types : RegisterTypes<[v3i32, v3f32]>; +def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>; let HasVGPR = 1 in { // VOP3 and VINTERP can access 256 lo and 256 hi registers. 
@@ -744,7 +746,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, let BaseClassOrder = 10000; } -def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32, +def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; @@ -815,7 +817,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v let HasSGPR = 1; } -def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32, +def SGPR_64 : SIRegisterClass<"AMDGPU", Reg64Types.types, 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 1; @@ -905,8 +907,8 @@ multiclass SRegClass<int numRegs, } } -defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>; +defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>; defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; @@ -958,8 +960,8 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4], (add VGPR_64)>; -defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; -defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>; +defm VReg_96 : VRegClass<3, Reg96Types.types, (add VGPR_96)>; +defm VReg_128 : VRegClass<4, Reg128Types.types, (add VGPR_128)>; defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; @@ -1342,6 +1344,7 @@ def AVLdSt_64 : AVLdStOperand<AV_64, "OPW64">; def 
AVLdSt_96 : AVLdStOperand<AV_96, "OPW96">; def AVLdSt_128 : AVLdStOperand<AV_128, "OPW128">; def AVLdSt_160 : AVLdStOperand<AV_160, "OPW160">; +def AVLdSt_1024 : AVLdStOperand<AV_1024, "OPW1024">; //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 647fae904d39..79bcf5e8cd30 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -45,7 +45,6 @@ public: bool isKImmOperand(const MachineOperand &Src) const; bool isKUImmOperand(const MachineOperand &Src) const; bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; - bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const; void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; void shrinkScalarCompare(MachineInstr &MI) const; void shrinkMIMG(MachineInstr &MI) const; @@ -183,15 +182,36 @@ bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, return false; } -/// \returns true if the constant in \p Src should be replaced with a bitreverse -/// of an inline immediate. -bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src, - int32_t &ReverseImm) const { - if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) - return false; +/// \returns the opcode of an instruction a move immediate of the constant \p +/// Src can be replaced with if the constant is replaced with \p ModifiedImm. +/// i.e. +/// +/// If the bitreverse of a constant is an inline immediate, reverse the +/// immediate and return the bitreverse opcode. +/// +/// If the bitwise negation of a constant is an inline immediate, reverse the +/// immediate and return the bitwise not opcode. 
+static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII, + const MachineOperand &Src, + int32_t &ModifiedImm, bool Scalar) { + if (TII->isInlineConstant(Src)) + return 0; + int32_t SrcImm = static_cast<int32_t>(Src.getImm()); + + if (!Scalar) { + // We could handle the scalar case with here, but we would need to check + // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth + // it, as the reasonable values are already covered by s_movk_i32. + ModifiedImm = ~SrcImm; + if (TII->isInlineConstant(APInt(32, ModifiedImm))) + return AMDGPU::V_NOT_B32_e32; + } + + ModifiedImm = reverseBits<int32_t>(SrcImm); + if (TII->isInlineConstant(APInt(32, ModifiedImm))) + return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32; - ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm())); - return ReverseImm >= -16 && ReverseImm <= 64; + return 0; } /// Copy implicit register operands from specified instruction to this @@ -801,10 +821,12 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // XXX - not exactly a check for post-regalloc run. 
MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) { - int32_t ReverseImm; - if (isReverseInlineImm(Src, ReverseImm)) { - MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); - Src.setImm(ReverseImm); + int32_t ModImm; + unsigned ModOpcode = + canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false); + if (ModOpcode != 0) { + MI.setDesc(TII->get(ModOpcode)); + Src.setImm(static_cast<int64_t>(ModImm)); continue; } } @@ -863,13 +885,15 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && Dst.getReg().isPhysical()) { - int32_t ReverseImm; + unsigned ModOpc; + int32_t ModImm; if (isKImmOperand(Src)) { MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); Src.setImm(SignExtend64(Src.getImm(), 32)); - } else if (isReverseInlineImm(Src, ReverseImm)) { - MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); - Src.setImm(ReverseImm); + } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm, + /*Scalar=*/true))) { + MI.setDesc(TII->get(ModOpc)); + Src.setImm(static_cast<int64_t>(ModImm)); } } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 913942dda19d..742fd397ff9e 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -215,8 +215,6 @@ private: MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, bool IsWQM); MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); - void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry, - MachineInstr *Exit); void lowerBlock(MachineBasicBlock &MBB); void processBlock(MachineBasicBlock &MBB, bool IsEntry); @@ -241,8 +239,8 @@ public: AU.addRequired<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); - AU.addPreserved<MachineDominatorTree>(); - AU.addPreserved<MachinePostDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); + 
AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -259,8 +257,8 @@ char SIWholeQuadMode::ID = 0; INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) @@ -785,7 +783,7 @@ MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, if (MDT) MDT->getBase().applyUpdates(DTUpdates); if (PDT) - PDT->getBase().applyUpdates(DTUpdates); + PDT->applyUpdates(DTUpdates); // Link blocks MachineInstr *MI = @@ -1025,31 +1023,6 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, return NewTerm; } -// Convert a strict mode transition to a pseudo transition. -// This still pre-allocates registers to prevent clobbering, -// but avoids any EXEC mask changes. -void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB, - MachineInstr *Entry, - MachineInstr *Exit) { - assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM); - assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM); - - Register SaveOrig = Entry->getOperand(0).getReg(); - - MachineInstr *NewEntry = - BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM)); - MachineInstr *NewExit = - BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM)); - - LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit); - Exit->eraseFromParent(); - - LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry); - Entry->eraseFromParent(); - - LIS->removeInterval(SaveOrig); -} - // Replace (or supplement) instructions accessing live mask. 
// This can only happen once all the live mask registers have been created // and the execute state (WQM/StrictWWM/Exact) of instructions is known. @@ -1066,12 +1039,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { SmallVector<MachineInstr *, 4> SplitPoints; char State = BI.InitialState; - MachineInstr *StrictEntry = nullptr; for (MachineInstr &MI : llvm::make_early_inc_range( llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) { - char PreviousState = State; - if (StateTransition.count(&MI)) State = StateTransition[&MI]; @@ -1084,20 +1054,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: SplitPoint = lowerKillF32(MBB, MI); break; - case AMDGPU::ENTER_STRICT_WQM: - StrictEntry = PreviousState == StateWQM ? &MI : nullptr; - break; - case AMDGPU::EXIT_STRICT_WQM: - if (State == StateWQM && StrictEntry) { - // Transition WQM -> StrictWQM -> WQM detected. - lowerPseudoStrictMode(MBB, StrictEntry, &MI); - } - StrictEntry = nullptr; - break; - case AMDGPU::ENTER_STRICT_WWM: - case AMDGPU::EXIT_STRICT_WWM: - StrictEntry = nullptr; - break; default: break; } @@ -1251,11 +1207,6 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, } LIS->InsertMachineInstrInMaps(*MI); StateTransition[MI] = StrictStateNeeded; - - // Mark block as needing lower so it will be checked for unnecessary transitions. - auto BII = Blocks.find(&MBB); - if (BII != Blocks.end()) - BII->second.NeedsLowering = true; } void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, @@ -1687,8 +1638,11 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); - MDT = getAnalysisIfAvailable<MachineDominatorTree>(); - PDT = getAnalysisIfAvailable<MachinePostDominatorTree>(); + auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); + MDT = MDTWrapper ? 
&MDTWrapper->getDomTree() : nullptr; + auto *PDTWrapper = + getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); + PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr; if (ST->isWave32()) { AndOpc = AMDGPU::S_AND_B32; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index aee518680a60..64f33199545a 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -215,6 +215,11 @@ let isMoveImm = 1 in { } // End Uses = [SCC] } // End isMoveImm = 1 +// Variant of S_MOV_B32 used for reading from volatile registers like +// SRC_POPS_EXITING_WAVE_ID. +let hasSideEffects = 1 in +def S_MOV_B32_sideeffects : SOP1_32 <"s_mov_b32">; + let Defs = [SCC] in { def S_NOT_B32 : SOP1_32 <"s_not_b32", [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))] @@ -1196,11 +1201,15 @@ let SubtargetPredicate = isGFX9Plus in { } } // End SubtargetPredicate = isGFX9Plus +def VersionImm : S16ImmOperand { + let DecoderMethod = "decodeVersionImm"; +} + let SubtargetPredicate = isGFX10Plus in { def S_VERSION : SOPK_Pseudo< "s_version", (outs), - (ins s16imm:$simm16), + (ins VersionImm:$simm16), "$simm16"> { let has_sdst = 0; } @@ -1876,6 +1885,12 @@ let SubtargetPredicate = isNotGFX9Plus in { def : GetFPModePat<fpmode_mask_gfx6plus>; } +let SubtargetPredicate = isGFX9GFX10 in +def : GCNPat< + (int_amdgcn_pops_exiting_wave_id), + (S_MOV_B32_sideeffects (i32 SRC_POPS_EXITING_WAVE_ID)) +>; + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 2e1db1665b9c..3af536dac473 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -669,5 +669,20 @@ const char* const IdSymbolic[] = { } // namespace 
VGPRIndexMode +namespace UCVersion { + +ArrayRef<GFXVersion> getGFXVersions() { + // GFX6, GFX8 and GFX9 don't support s_version and there are no + // UC_VERSION_GFX* codes for them. + static const GFXVersion Versions[] = {{"UC_VERSION_GFX7", 0}, + {"UC_VERSION_GFX10", 4}, + {"UC_VERSION_GFX11", 6}, + {"UC_VERSION_GFX12", 9}}; + + return Versions; +} + +} // namespace UCVersion + } // namespace AMDGPU } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index 069134a7ae7f..c84c1a7dc18c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -116,6 +116,17 @@ extern const char* const IdSymbolic[]; } // namespace VGPRIndexMode +namespace UCVersion { + +struct GFXVersion { + StringLiteral Symbol; + unsigned Code; +}; + +ArrayRef<GFXVersion> getGFXVersions(); + +} // namespace UCVersion + } // namespace AMDGPU } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 4b34fb27632a..9886235121d2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -159,6 +159,12 @@ namespace llvm { namespace AMDGPU { +/// \returns true if the target supports signed immediate offset for SMRD +/// instructions. +bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { + return isGFX9Plus(ST); +} + /// \returns True if \p STI is AMDHSA. 
bool isHsaAbi(const MCSubtargetInfo &STI) { return STI.getTargetTriple().getOS() == Triple::AMDHSA; @@ -373,10 +379,18 @@ struct VOPTrue16Info { bool IsTrue16; }; +struct SingleUseExceptionInfo { + uint16_t Opcode; + bool IsInvalidSingleUseConsumer; + bool IsInvalidSingleUseProducer; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL +#define GET_SingleUseExceptionTable_DECL +#define GET_SingleUseExceptionTable_IMPL #define GET_SMInfoTable_DECL #define GET_SMInfoTable_IMPL #define GET_VOP1InfoTable_DECL @@ -582,9 +596,7 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { } bool isGenericAtomic(unsigned Opc) { - return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || - Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || - Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP || + return Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN || @@ -608,6 +620,16 @@ bool isTrue16Inst(unsigned Opc) { return Info ? Info->IsTrue16 : false; } +bool isInvalidSingleUseConsumerInst(unsigned Opc) { + const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); + return Info && Info->IsInvalidSingleUseConsumer; +} + +bool isInvalidSingleUseProducerInst(unsigned Opc) { + const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); + return Info && Info->IsInvalidSingleUseProducer; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? 
Info->Opcode3Addr : ~0u; @@ -2803,10 +2825,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) { return isGCN3Encoding(ST) || isGFX10Plus(ST); } -static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { - return isGFX9Plus(ST); -} - bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset) { if (isGFX12Plus(ST)) @@ -2841,7 +2859,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, } std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, - int64_t ByteOffset, bool IsBuffer) { + int64_t ByteOffset, bool IsBuffer, + bool HasSOffset) { + // For unbuffered smem loads, it is illegal for the Immediate Offset to be + // negative if the resulting (Offset + (M0 or SOffset or zero) is negative. + // Handle case where SOffset is not present. + if (!IsBuffer && !HasSOffset && ByteOffset < 0 && hasSMRDSignedImmOffset(ST)) + return std::nullopt; + if (isGFX12Plus(ST)) // 24 bit signed offsets return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset) : std::nullopt; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index cf8236b8e23b..af2f0bc1a630 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -360,6 +360,10 @@ struct EncodingField { static ValueType decode(uint64_t Encoded) { return Encoded; } }; +// Represents a single bit in an encoded value. +template <unsigned Bit, unsigned D = 0> +using EncodingBit = EncodingField<Bit, Bit, D>; + // A helper for encoding and decoding multiple fields. template <typename... Fields> struct EncodingFields { static constexpr uint64_t encode(Fields... 
Values) { @@ -857,6 +861,12 @@ LLVM_READONLY bool isTrue16Inst(unsigned Opc); LLVM_READONLY +bool isInvalidSingleUseConsumerInst(unsigned Opc); + +LLVM_READONLY +bool isInvalidSingleUseProducerInst(unsigned Opc); + +LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); LLVM_READONLY @@ -1297,6 +1307,7 @@ bool hasVOPD(const MCSubtargetInfo &STI); bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI); int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); unsigned hasKernargPreload(const MCSubtargetInfo &STI); +bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); @@ -1469,7 +1480,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset); /// S_LOAD instructions have a signed offset, on other subtargets it is /// unsigned. S_BUFFER has an unsigned offset for all subtargets. std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, - int64_t ByteOffset, bool IsBuffer); + int64_t ByteOffset, bool IsBuffer, + bool HasSOffset = false); /// \return The encoding that can be used for a 32-bit literal offset in an SMRD /// instruction. This is only useful on CI.s diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp new file mode 100644 index 000000000000..a4f4a9ed5da4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.cpp @@ -0,0 +1,61 @@ +//===- AMDGPUDelayedMCExpr.cpp - Delayed MCExpr resolve ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUDelayedMCExpr.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" + +using namespace llvm; + +static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, + MCValue Val) { + msgpack::Document *Doc = DN.getDocument(); + switch (Type) { + default: + return Doc->getEmptyNode(); + case msgpack::Type::Int: + return Doc->getNode(static_cast<int64_t>(Val.getConstant())); + case msgpack::Type::UInt: + return Doc->getNode(static_cast<uint64_t>(Val.getConstant())); + case msgpack::Type::Boolean: + return Doc->getNode(static_cast<bool>(Val.getConstant())); + } +} + +void DelayedMCExprs::assignDocNode(msgpack::DocNode &DN, msgpack::Type Type, + const MCExpr *ExprValue) { + MCValue Res; + if (ExprValue->evaluateAsRelocatable(Res, nullptr, nullptr)) { + if (Res.isAbsolute()) { + DN = getNode(DN, Type, Res); + return; + } + } + + DelayedExprs.push_back(Expr{DN, Type, ExprValue}); +} + +bool DelayedMCExprs::resolveDelayedExpressions() { + while (!DelayedExprs.empty()) { + Expr DE = DelayedExprs.front(); + MCValue Res; + + if (!DE.ExprValue->evaluateAsRelocatable(Res, nullptr, nullptr) || + !Res.isAbsolute()) + return false; + + DelayedExprs.pop_front(); + DE.DN = getNode(DE.DN, DE.Type, Res); + } + + return true; +} + +void DelayedMCExprs::clear() { DelayedExprs.clear(); } + +bool DelayedMCExprs::empty() { return DelayedExprs.empty(); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h new file mode 100644 index 000000000000..8c9cda3a1bdd --- /dev/null +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUDelayedMCExpr.h @@ -0,0 +1,39 @@ +//===- AMDGPUDelayedMCExpr.h - Delayed MCExpr resolve -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H + +#include "llvm/BinaryFormat/MsgPackDocument.h" +#include <deque> + +namespace llvm { +class MCExpr; + +class DelayedMCExprs { + struct Expr { + msgpack::DocNode &DN; + msgpack::Type Type; + const MCExpr *ExprValue; + Expr(msgpack::DocNode &DN, msgpack::Type Type, const MCExpr *ExprValue) + : DN(DN), Type(Type), ExprValue(ExprValue) {} + }; + + std::deque<Expr> DelayedExprs; + +public: + bool resolveDelayedExpressions(); + void assignDocNode(msgpack::DocNode &DN, msgpack::Type Type, + const MCExpr *ExprValue); + void clear(); + bool empty(); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUDELAYEDMCEXPR_H diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index 0fa67c559cb2..a53bf70d7771 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -20,6 +20,7 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCExpr.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/EndianStream.h" @@ -137,12 +138,22 @@ void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, unsigned Val) { setRegister(getRsrc1Reg(CC), Val); } +void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, const MCExpr *Val, + MCContext &Ctx) { + setRegister(getRsrc1Reg(CC), Val, Ctx); +} + // Set the rsrc2 register in the metadata for a particular shader stage. // In fact this ORs the value into any previous setting of the register. 
void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, unsigned Val) { setRegister(getRsrc1Reg(CC) + 1, Val); } +void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, const MCExpr *Val, + MCContext &Ctx) { + setRegister(getRsrc1Reg(CC) + 1, Val, Ctx); +} + // Set the SPI_PS_INPUT_ENA register in the metadata. // In fact this ORs the value into any previous setting of the register. void AMDGPUPALMetadata::setSpiPsInputEna(unsigned Val) { @@ -182,6 +193,40 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, unsigned Val) { N = N.getDocument()->getNode(Val); } +// Set a register in the metadata. +// In fact this ORs the value into any previous setting of the register. +void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val, + MCContext &Ctx) { + if (!isLegacy()) { + // In the new MsgPack format, ignore register numbered >= 0x10000000. It + // is a PAL ABI pseudo-register in the old non-MsgPack format. + if (Reg >= 0x10000000) + return; + } + auto &N = getRegisters()[MsgPackDoc.getNode(Reg)]; + auto ExprIt = REM.find(Reg); + + if (ExprIt != REM.end()) { + Val = MCBinaryExpr::createOr(Val, ExprIt->getSecond(), Ctx); + // This conditional may be redundant most of the time, but the alternate + // setRegister(unsigned, unsigned) could've been called while the + // conditional returns true (i.e., Reg exists in REM). + if (N.getKind() == msgpack::Type::UInt) { + const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx); + Val = MCBinaryExpr::createOr(Val, NExpr, Ctx); + } + ExprIt->getSecond() = Val; + } else if (N.getKind() == msgpack::Type::UInt) { + const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx); + Val = MCBinaryExpr::createOr(Val, NExpr, Ctx); + int64_t Unused; + if (!Val->evaluateAsAbsolute(Unused)) + REM[Reg] = Val; + (void)Unused; + } + DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val); +} + // Set the entry point name for one shader. 
void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) { if (isLegacy()) @@ -207,11 +252,29 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, const MCExpr *Val, + MCContext &Ctx) { + if (isLegacy()) { + // Old non-msgpack format. + unsigned NumUsedVgprsKey = getScratchSizeKey(CC) + + PALMD::Key::VS_NUM_USED_VGPRS - + PALMD::Key::VS_SCRATCH_SIZE; + setRegister(NumUsedVgprsKey, Val, Ctx); + return; + } + // Msgpack format. + setHwStage(CC, ".vgpr_count", msgpack::Type::UInt, Val); +} + // Set the number of used agprs in the metadata. void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".agpr_count"] = Val; } +void AMDGPUPALMetadata::setNumUsedAgprs(unsigned CC, const MCExpr *Val) { + setHwStage(CC, ".agpr_count", msgpack::Type::UInt, Val); +} + // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. @@ -228,6 +291,20 @@ void AMDGPUPALMetadata::setNumUsedSgprs(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".sgpr_count"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setNumUsedSgprs(unsigned CC, const MCExpr *Val, + MCContext &Ctx) { + if (isLegacy()) { + // Old non-msgpack format. + unsigned NumUsedSgprsKey = getScratchSizeKey(CC) + + PALMD::Key::VS_NUM_USED_SGPRS - + PALMD::Key::VS_SCRATCH_SIZE; + setRegister(NumUsedSgprsKey, Val, Ctx); + return; + } + // Msgpack format. + setHwStage(CC, ".sgpr_count", msgpack::Type::UInt, Val); +} + // Set the scratch size in the metadata. 
void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) { if (isLegacy()) { @@ -239,6 +316,17 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setScratchSize(unsigned CC, const MCExpr *Val, + MCContext &Ctx) { + if (isLegacy()) { + // Old non-msgpack format. + setRegister(getScratchSizeKey(CC), Val, Ctx); + return; + } + // Msgpack format. + setHwStage(CC, ".scratch_memory_size", msgpack::Type::UInt, Val); +} + // Set the stack frame size of a function in the metadata. void AMDGPUPALMetadata::setFunctionScratchSize(StringRef FnName, unsigned Val) { auto Node = getShaderFunction(FnName); @@ -259,6 +347,12 @@ void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName, Node[".vgpr_count"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName, + const MCExpr *Val) { + auto Node = getShaderFunction(FnName); + DelayedExprs.assignDocNode(Node[".vgpr_count"], msgpack::Type::UInt, Val); +} + // Set the number of used vgprs in the metadata. void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName, unsigned Val) { @@ -266,6 +360,12 @@ void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName, Node[".sgpr_count"] = MsgPackDoc.getNode(Val); } +void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName, + const MCExpr *Val) { + auto Node = getShaderFunction(FnName); + DelayedExprs.assignDocNode(Node[".sgpr_count"], msgpack::Type::UInt, Val); +} + // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. 
void AMDGPUPALMetadata::setWave32(unsigned CC) { @@ -662,6 +762,7 @@ void AMDGPUPALMetadata::toString(std::string &String) { String.clear(); if (!BlobType) return; + ResolvedAll = DelayedExprs.resolveDelayedExpressions(); raw_string_ostream Stream(String); if (isLegacy()) { if (MsgPackDoc.getRoot().getKind() == msgpack::Type::Nil) @@ -711,6 +812,7 @@ void AMDGPUPALMetadata::toString(std::string &String) { // a .note record of the specified AMD type. Returns an empty blob if // there is no PAL metadata, void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) { + ResolvedAll = DelayedExprs.resolveDelayedExpressions(); if (Type == ELF::NT_AMD_PAL_METADATA) toLegacyBlob(Blob); else if (Type) @@ -906,11 +1008,17 @@ void AMDGPUPALMetadata::setLegacy() { // Erase all PAL metadata. void AMDGPUPALMetadata::reset() { MsgPackDoc.clear(); + REM.clear(); + DelayedExprs.clear(); Registers = MsgPackDoc.getEmptyNode(); HwStages = MsgPackDoc.getEmptyNode(); ShaderFunctions = MsgPackDoc.getEmptyNode(); } +bool AMDGPUPALMetadata::resolvedAllMCExpr() { + return ResolvedAll && DelayedExprs.empty(); +} + unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) { assert(idx < 2 && "illegal index to PAL version - should be 0 (major) or 1 (minor)"); @@ -942,6 +1050,11 @@ void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) { getHwStage(CC)[field] = Val; } +void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, + msgpack::Type Type, const MCExpr *Val) { + DelayedExprs.assignDocNode(getHwStage(CC)[field], Type, Val); +} + void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) { getComputeRegisters()[field] = Val; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index 158f766d0485..e05532afed2f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -13,7 +13,10 @@ #ifndef 
LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H +#include "AMDGPUDelayedMCExpr.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/BinaryFormat/MsgPackDocument.h" +#include "llvm/MC/MCContext.h" namespace llvm { @@ -21,6 +24,10 @@ class Module; class StringRef; class AMDGPUPALMetadata { +public: + using RegisterExprMap = DenseMap<unsigned, const MCExpr *>; + +private: unsigned BlobType = 0; msgpack::Document MsgPackDoc; msgpack::DocNode Registers; @@ -32,6 +39,10 @@ class AMDGPUPALMetadata { msgpack::DocNode ComputeRegisters; msgpack::DocNode GraphicsRegisters; + DelayedMCExprs DelayedExprs; + RegisterExprMap REM; + bool ResolvedAll = true; + public: // Read the amdgpu.pal.metadata supplied by the frontend, ready for // per-function modification. @@ -45,10 +56,12 @@ public: // Set the rsrc1 register in the metadata for a particular shader stage. // In fact this ORs the value into any previous setting of the register. void setRsrc1(unsigned CC, unsigned Val); + void setRsrc1(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the rsrc2 register in the metadata for a particular shader stage. // In fact this ORs the value into any previous setting of the register. void setRsrc2(unsigned CC, unsigned Val); + void setRsrc2(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the SPI_PS_INPUT_ENA register in the metadata. // In fact this ORs the value into any previous setting of the register. @@ -64,6 +77,7 @@ public: // Set a register in the metadata. // In fact this ORs the value into any previous setting of the register. void setRegister(unsigned Reg, unsigned Val); + void setRegister(unsigned Reg, const MCExpr *Val, MCContext &Ctx); // Set the entry point name for one shader. void setEntryPoint(unsigned CC, StringRef Name); @@ -72,18 +86,22 @@ public: // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of vgprs to allocate. 
void setNumUsedVgprs(unsigned CC, unsigned Val); + void setNumUsedVgprs(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the number of used agprs in the metadata. This is an optional advisory // record for logging etc; void setNumUsedAgprs(unsigned CC, unsigned Val); + void setNumUsedAgprs(unsigned CC, const MCExpr *Val); // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. void setNumUsedSgprs(unsigned CC, unsigned Val); + void setNumUsedSgprs(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the scratch size in the metadata. void setScratchSize(unsigned CC, unsigned Val); + void setScratchSize(unsigned CC, const MCExpr *Val, MCContext &Ctx); // Set the stack frame size of a function in the metadata. void setFunctionScratchSize(StringRef FnName, unsigned Val); @@ -97,11 +115,13 @@ public: // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of vgprs to allocate. void setFunctionNumUsedVgprs(StringRef FnName, unsigned Val); + void setFunctionNumUsedVgprs(StringRef FnName, const MCExpr *Val); // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. void setFunctionNumUsedSgprs(StringRef FnName, unsigned Val); + void setFunctionNumUsedSgprs(StringRef FnName, const MCExpr *Val); // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. 
@@ -138,6 +158,8 @@ public: void setHwStage(unsigned CC, StringRef field, unsigned Val); void setHwStage(unsigned CC, StringRef field, bool Val); + void setHwStage(unsigned CC, StringRef field, msgpack::Type Type, + const MCExpr *Val); void setComputeRegisters(StringRef field, unsigned Val); void setComputeRegisters(StringRef field, bool Val); @@ -156,6 +178,8 @@ public: // Erase all PAL metadata. void reset(); + bool resolvedAllMCExpr(); + private: // Return whether the blob type is legacy PAL metadata. bool isLegacy() const; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp index eaee1a2a9739..720d5a1853db 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp @@ -14,6 +14,7 @@ #include "AMDKernelCodeT.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" +#include "Utils/SIDefinesUtils.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCContext.h" @@ -220,43 +221,6 @@ static int get_amd_kernel_code_t_FieldIndex(StringRef name) { return map.lookup(name) - 1; // returns -1 if not found } -static constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) { - unsigned Shift = 0; - unsigned Mask = 0; - - Mask = ~Value; - for (; !(Mask & 1); Shift++, Mask >>= 1) { - } - - return std::make_pair(Shift, Mask); -} - -static const MCExpr *MaskShiftSet(const MCExpr *Val, uint32_t Mask, - uint32_t Shift, MCContext &Ctx) { - if (Mask) { - const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); - Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); - } - if (Shift) { - const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); - Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx); - } - return Val; -} - -static const MCExpr *MaskShiftGet(const MCExpr *Val, uint32_t Mask, - uint32_t Shift, MCContext &Ctx) { - if (Shift) { - const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); 
- Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx); - } - if (Mask) { - const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); - Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); - } - return Val; -} - class PrintField { public: template <typename T, T AMDGPUMCKernelCodeT::*ptr, @@ -305,10 +269,10 @@ static ArrayRef<PrintFx> getPrinterTable() { const MCExpr *Value; \ if (PGMType == 0) { \ Value = \ - MaskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \ + maskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \ } else { \ Value = \ - MaskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \ + maskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \ } \ int64_t Val; \ if (Value->evaluateAsAbsolute(Val)) \ @@ -392,7 +356,7 @@ static ArrayRef<ParseFx> getParserTable() { if (!parseExpr(MCParser, Value, Err)) \ return false; \ auto [Shift, Mask] = getShiftMask(Complement); \ - Value = MaskShiftSet(Value, Mask, Shift, Ctx); \ + Value = maskShiftSet(Value, Mask, Shift, Ctx); \ const MCExpr *Compl = MCConstantExpr::create(Complement, Ctx); \ if (PGMType == 0) { \ C.compute_pgm_resource1_registers = MCBinaryExpr::createAnd( \ @@ -542,7 +506,7 @@ void AMDGPUMCKernelCodeT::EmitKernelCodeT(MCStreamer &OS, MCContext &Ctx) { const MCExpr *CodeProps = MCConstantExpr::create(code_properties, Ctx); CodeProps = MCBinaryExpr::createOr( CodeProps, - MaskShiftSet(is_dynamic_callstack, + maskShiftSet(is_dynamic_callstack, (1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1, AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, Ctx), Ctx); diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt index 2f4ce8eaf1d6..09b8da9f5dd4 100644 --- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_component_library(LLVMAMDGPUUtils AMDGPUAsmUtils.cpp AMDGPUBaseInfo.cpp + AMDGPUDelayedMCExpr.cpp AMDGPUMemoryUtils.cpp 
AMDGPUPALMetadata.cpp AMDKernelCodeTUtils.cpp diff --git a/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h b/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h new file mode 100644 index 000000000000..64d21de12c26 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/Utils/SIDefinesUtils.h @@ -0,0 +1,79 @@ +//===-- SIDefines.h - SI Helper Functions -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +/// \file - utility functions for the SIDefines and its common uses. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H + +#include "llvm/MC/MCExpr.h" +#include <utility> + +namespace llvm { +class MCContext; +namespace AMDGPU { + +/// Deduce the least significant bit aligned shift and mask values for a binary +/// Complement \p Value (as they're defined in SIDefines.h as C_*) as a returned +/// pair<shift, mask>. That is to say \p Value == ~(mask << shift) +/// +/// For example, given C_00B848_FWD_PROGRESS (i.e., 0x7FFFFFFF) from +/// SIDefines.h, this will return the pair as (31,1). +constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) { + unsigned Shift = 0; + unsigned Mask = 0; + + Mask = ~Value; + for (; !(Mask & 1); Shift++, Mask >>= 1) { + } + + return std::make_pair(Shift, Mask); +} + +/// Provided with the MCExpr * \p Val, uint32 \p Mask and \p Shift, will return +/// the masked and left shifted, in said order of operations, MCExpr * created +/// within the MCContext \p Ctx. 
+/// +/// For example, given MCExpr *Val, Mask == 0xf, Shift == 6 the returned MCExpr +/// * will be the equivalent of (Val & 0xf) << 6 +inline const MCExpr *maskShiftSet(const MCExpr *Val, uint32_t Mask, + uint32_t Shift, MCContext &Ctx) { + if (Mask) { + const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); + Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); + } + if (Shift) { + const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); + Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx); + } + return Val; +} + +/// Provided with the MCExpr * \p Val, uint32 \p Mask and \p Shift, will return +/// the right shifted and masked, in said order of operations, MCExpr * created +/// within the MCContext \p Ctx. +/// +/// For example, given MCExpr *Val, Mask == 0xf, Shift == 6 the returned MCExpr +/// * will be the equivalent of (Val >> 6) & 0xf +inline const MCExpr *maskShiftGet(const MCExpr *Val, uint32_t Mask, + uint32_t Shift, MCContext &Ctx) { + if (Shift) { + const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); + Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx); + } + if (Mask) { + const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); + Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); + } + return Val; +} + +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_SIDEFINESUTILS_H diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b96c41c1e12a..2c0d61ee4afa 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies { !if(P.HasOMod, [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))] + [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))] ) ); } @@ -249,9 +249,15 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, 
untyped]> { // FIXME: Specify SchedRW for READFIRSTLANE_B32 // TODO: There is VOP3 encoding also def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE, - getVOP1Pat<int_amdgcn_readfirstlane, - VOP_READFIRSTLANE>.ret, 1> { + [], 1> { let isConvergent = 1; + let IsInvalidSingleUseConsumer = 1; +} + +foreach vt = Reg32Types.types in { + def : GCNPat<(vt (int_amdgcn_readfirstlane (vt VRegOrLdsSrc_32:$src0))), + (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0)) + >; } let isReMaterializable = 1 in { @@ -362,6 +368,7 @@ defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>; def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC32 = VRegSrc_32; let Src0RC64 = VRegSrc_32; + let IsInvalidSingleUseConsumer = 1; } // Special case because there are no true output operands. Hack vdst @@ -405,8 +412,12 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un let EmitDst = 1; // force vdst emission } -def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>; -def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32>; +let IsInvalidSingleUseProducer = 1 in { + def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>; + def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32> { + let IsInvalidSingleUseConsumer = 1; + } +} let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in { // v_movreld_b32 is a special case because the destination output @@ -535,6 +546,7 @@ let SubtargetPredicate = isGFX9Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; + let IsInvalidSingleUseConsumer = 1; } let isReMaterializable = 1 in @@ -699,6 +711,8 @@ let SubtargetPredicate = isGFX10Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; + let IsInvalidSingleUseConsumer = 1; + let IsInvalidSingleUseProducer = 1; } } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus @@ -718,15 +732,22 @@ def V_ACCVGPR_MOV_B32 : 
VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1 let SubtargetPredicate = isGFX11Plus in { // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, - getVOP1Pat<int_amdgcn_permlane64, - VOP_MOVRELS>.ret, - /*VOP1Only=*/ 1>; + [], /*VOP1Only=*/ 1> { + let IsInvalidSingleUseConsumer = 1; + let IsInvalidSingleUseProducer = 1; + } defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; } // End SubtargetPredicate = isGFX11Plus +foreach vt = Reg32Types.types in { + def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)), + (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0))) + >; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index ccb5b33dbdc4..9989752c2f6b 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -779,15 +779,25 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, } // End isCommutable = 1 // These are special and do not read the exec mask. 
-let isConvergent = 1, Uses = []<Register> in { -def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, - [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>; +let isConvergent = 1, Uses = []<Register>, IsInvalidSingleUseConsumer = 1 in { +def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>; let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { -def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, - [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>; +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> { + let IsInvalidSingleUseProducer = 1; + } } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 +foreach vt = Reg32Types.types in { + def : GCNPat<(vt (int_amdgcn_readlane vt:$src0, i32:$src1)), + (V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1) + >; + + def : GCNPat<(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)), + (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2) + >; +} + let isReMaterializable = 1 in { defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 479c0aaf0174..efa8e9c74d44 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -13,9 +13,11 @@ def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> { let Outs64 = (outs DstRC.RegClass:$vdst); let HasExtVOP3DPP = 0; let HasExtDPP = 0; + let IsSingle = 1; } def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> { let Outs64 = (outs DstRC.RegClass:$vdst); + let IsSingle = 1; } } @@ -105,7 +107,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod, } class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { - + let IsSingle = 1; let HasOMod = 
!ne(DstVT.Value, f16.Value); let HasHigh = 1; @@ -155,12 +157,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l } // End SubtargetPredicate = isNotGFX12Plus } // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteIntMul] in { +let SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 in { defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF<VOP_I32_I32_I32>, DivergentBinFrag<mul>>; defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu>; defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>; defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>; -} // End SchedRW = [WriteIntMul] +} // End SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>; @@ -258,9 +260,9 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d let isReMaterializable = 1 in defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; -let Constraints = "@earlyclobber $vdst" in { +let Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 in { defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; -} // End Constraints = "@earlyclobber $vdst" +} // End Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 let isReMaterializable = 1 in { @@ -275,14 +277,16 @@ let SchedRW = [Write64Bit] in { defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>; } // End SubtargetPredicate = isGFX6GFX7 + let IsInvalidSingleUseConsumer = 1 in { let SubtargetPredicate = isGFX8Plus in { defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>; defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>; - } // 
End SubtargetPredicate = isGFX8Plus + } // End SubtargetPredicate = isGFX8Plus, , IsInvalidSingleUseConsumer = 1 let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in { defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>; } // End SubtargetPredicate = isGFX8GFX9GFX10GFX11 + } // End IsInvalidSingleUseConsumer = 1 } // End SchedRW = [Write64Bit] } // End isReMaterializable = 1 @@ -307,14 +311,14 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> { let HasModifiers = 0; } -let SubtargetPredicate = isGFX7Plus in { +let SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] -} // End SubtargetPredicate = isGFX7Plus +} // End SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 -let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { +let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 in { let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; @@ -324,7 +328,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } -} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] +} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 let FPDPRounding = 1 in { @@ -838,9 +842,9 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">, 
GISDNodeXFormEquiv<opsel_i1timm>; class PermlanePat<SDPatternOperator permlane, - Instruction inst> : GCNPat< - (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, - timm:$fi, timm:$bc), + Instruction inst, ValueType vt> : GCNPat< + (vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc)), (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; @@ -859,13 +863,15 @@ let SubtargetPredicate = isGFX10Plus in { } // End isCommutable = 1, isReMaterializable = 1 def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>; - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 in { defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; - } // End $vdst = $vdst_in, DisableEncoding $vdst_in + } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 - def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64>; - def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>; + foreach vt = Reg32Types.types in { + def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>; + def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>; + } defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>; defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>; @@ -1275,11 +1281,12 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; - -let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { - defm V_WRITELANE_B32 : 
VOP3_Real_No_Suffix_gfx10<0x361>; -} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) +let IsInvalidSingleUseConsumer = 1 in { + defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; + let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 in { + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>; + } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32: $src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 +} // End IsInvalidSingleUseConsumer = 1 let SubtargetPredicate = isGFX10Before1030 in { defm V_MUL_LO_I32 : VOP3_Real_gfx10<0x16b>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4c78bd94458d..4cab15435199 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -90,7 +90,7 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> { let isReMaterializable = 1 in { let isCommutable = 1 in { defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; -defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>, imad>; let FPDPRounding = 1 in { defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>; @@ -382,15 +382,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", AMDGPUfdot2, 1/*ExplicitClamp*/>; let OtherPredicates = [HasDot7Insts] in { -defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", - VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; +let IsInvalidSingleUseConsumer = 1 in { + defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", + VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; +} defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>; 
} // End OtherPredicates = [HasDot7Insts] let OtherPredicates = [HasDot1Insts] in { -defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", - VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; +let IsInvalidSingleUseConsumer = 1 in { + defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", + VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; +} defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>; } // End OtherPredicates = [HasDot1Insts] diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 372c4f533629..3bcee28a2cb7 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -435,8 +435,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>; -multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : - VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; +let IsInvalidSingleUseConsumer = 1 in { + multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; +} multiclass VOPCX_F16<string opName, string revOp = opName> { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -465,8 +467,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> { multiclass VOPCX_I32 <string opName, string revOp = opName> : VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>; -multiclass VOPCX_I64 <string opName, string revOp = opName> : - VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>; +let IsInvalidSingleUseConsumer = 1 in { + multiclass VOPCX_I64 <string opName, string revOp = opName> : + VOPCX_Pseudos 
<opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>; +} //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 5d1573d8dec1..2b05165cc94b 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -17,6 +17,8 @@ class LetDummies { bit isReMaterializable; bit isAsCheapAsAMove; bit FPDPRounding; + bit IsInvalidSingleUseConsumer; + bit IsInvalidSingleUseProducer; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -81,6 +83,8 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins, string Mnemonic = opName; Instruction Opcode = !cast<Instruction>(NAME); bit IsTrue16 = P.IsTrue16; + bit IsInvalidSingleUseConsumer = P.IsInvalidSingleUseConsumer; + bit IsInvalidSingleUseProducer = P.IsInvalidSingleUseProducer; VOPProfile Pfl = P; string AsmOperands; @@ -175,6 +179,8 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> : class VOP_Real<VOP_Pseudo ps> { Instruction Opcode = !cast<Instruction>(NAME); bit IsSingle = ps.Pfl.IsSingle; + bit IsInvalidSingleUseConsumer = ps.Pfl.IsInvalidSingleUseConsumer; + bit IsInvalidSingleUseProducer = ps.Pfl.IsInvalidSingleUseProducer; } class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> : @@ -823,17 +829,11 @@ class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], dag Ins = P.InsDPP, string asmOps = P.AsmDPP> : - InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>, - VOP <OpName>, - SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> { - - let isPseudo = 1; - let isCodeGenOnly = 1; + VOP_Pseudo<OpName, "_dpp", P, P.OutsDPP, Ins, asmOps, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let UseNamedOperandTable = 1; let VALU = 1; let DPP = 1; 
@@ -846,7 +846,6 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); let isConvergent = 1; - string Mnemonic = OpName; string AsmOperands = asmOps; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); @@ -857,7 +856,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); let DecoderNamespace = "GFX8"; - VOPProfile Pfl = P; + let IsInvalidSingleUseConsumer = !not(VINTERP); + let IsInvalidSingleUseProducer = !not(VINTERP); } class VOP3_DPP_Pseudo <string OpName, VOPProfile P> : @@ -1725,3 +1725,12 @@ def VOPTrue16Table : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getTrue16OpcodeHelper"; } + +def SingleUseExceptionTable : GenericTable { + let FilterClass = "VOP_Pseudo"; + let CppTypeName = "SingleUseExceptionInfo"; + let Fields = ["Opcode", "IsInvalidSingleUseConsumer", "IsInvalidSingleUseProducer"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getSingleUseExceptionHelper"; +} diff --git a/llvm/lib/Target/ARC/ARCBranchFinalize.cpp b/llvm/lib/Target/ARC/ARCBranchFinalize.cpp index 0e3e4d34aa6a..9d616e103f17 100644 --- a/llvm/lib/Target/ARC/ARCBranchFinalize.cpp +++ b/llvm/lib/Target/ARC/ARCBranchFinalize.cpp @@ -61,7 +61,7 @@ char ARCBranchFinalize::ID = 0; INITIALIZE_PASS_BEGIN(ARCBranchFinalize, "arc-branch-finalize", "ARC finalize branches", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(ARCBranchFinalize, "arc-branch-finalize", "ARC finalize branches", false, false) diff --git a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp index e7a0b352db8d..36f811c0aa00 100644 --- a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp +++ b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp @@ -60,8 +60,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { 
AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } bool runOnMachineFunction(MachineFunction &MF) override; @@ -119,7 +119,7 @@ private: char ARCOptAddrMode::ID = 0; INITIALIZE_PASS_BEGIN(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false, false) @@ -508,7 +508,7 @@ bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) { AST = &MF.getSubtarget<ARCSubtarget>(); AII = AST->getInstrInfo(); MRI = &MF.getRegInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); bool Changed = false; for (auto &MBB : MF) diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 627148b73c4f..e81e6bb69758 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6060,6 +6060,8 @@ ARMBaseInstrInfo::getOutliningCandidateInfo( RepeatedSequenceLocs.size() * Costs.CallDefault) { RepeatedSequenceLocs = CandidatesWithoutStackFixups; FrameID = MachineOutlinerNoLRSave; + if (RepeatedSequenceLocs.size() < 2) + return std::nullopt; } else SetCandidateCallInfo(MachineOutlinerDefault, Costs.CallDefault); } diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 9579053943f9..90f5c6c40b49 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -229,7 +229,7 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - 
AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -399,7 +399,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { isPositionIndependentOrROPI = STI->getTargetLowering()->isPositionIndependent() || STI->isROPI(); AFI = MF->getInfo<ARMFunctionInfo>(); - DT = &getAnalysis<MachineDominatorTree>(); + DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); isThumb = AFI->isThumbFunction(); isThumb1 = AFI->isThumb1OnlyFunction(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index e3270471981c..4a7da3bf9744 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -156,6 +156,17 @@ static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; +static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, + SelectionDAG &DAG, const SDLoc &DL) { + assert(Arg.ArgVT.isScalarInteger()); + assert(Arg.ArgVT.bitsLT(MVT::i32)); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value); + SDValue Ext = + DAG.getNode(Arg.Flags.isSExt() ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + MVT::i32, Trunc); + return Ext; +} + void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { if (VT != PromotedLdStVT) { setOperationAction(ISD::LOAD, VT, Promote); @@ -365,6 +376,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FTAN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); @@ -875,6 +887,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); setOperationAction(ISD::FSIN, MVT::v2f64, Expand); setOperationAction(ISD::FCOS, MVT::v2f64, Expand); + setOperationAction(ISD::FTAN, MVT::v2f64, Expand); setOperationAction(ISD::FPOW, MVT::v2f64, Expand); setOperationAction(ISD::FLOG, MVT::v2f64, Expand); setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); @@ -897,6 +910,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); + setOperationAction(ISD::FTAN, MVT::v4f32, Expand); setOperationAction(ISD::FPOW, MVT::v4f32, Expand); setOperationAction(ISD::FLOG, MVT::v4f32, Expand); setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); @@ -914,6 +928,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); setOperationAction(ISD::FSIN, MVT::v2f32, Expand); setOperationAction(ISD::FCOS, MVT::v2f32, Expand); + setOperationAction(ISD::FTAN, MVT::v2f32, Expand); setOperationAction(ISD::FPOW, MVT::v2f32, Expand); setOperationAction(ISD::FLOG, MVT::v2f32, Expand); setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); @@ -1540,6 +1555,7 @@ ARMTargetLowering::ARMTargetLowering(const 
TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); setOperationAction(ISD::FSIN, MVT::f16, Promote); setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FTAN, MVT::f16, Promote); setOperationAction(ISD::FSINCOS, MVT::f16, Promote); setOperationAction(ISD::FPOWI, MVT::f16, Promote); setOperationAction(ISD::FPOW, MVT::f16, Promote); @@ -1578,6 +1594,20 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } + // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has + // it, but it's just a wrapper around ldexp. + if (Subtarget->isTargetWindows()) { + for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) + if (isOperationExpand(Op, MVT::f32)) + setOperationAction(Op, MVT::f32, Promote); + } + + // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16 + // isn't legal. + for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) + if (isOperationExpand(Op, MVT::f16)) + setOperationAction(Op, MVT::f16, Promote); + // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine( @@ -1710,7 +1740,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(ARMISD::BCC_i64) MAKE_CASE(ARMISD::FMSTAT) MAKE_CASE(ARMISD::CMOV) - MAKE_CASE(ARMISD::SUBS) MAKE_CASE(ARMISD::SSAT) MAKE_CASE(ARMISD::USAT) MAKE_CASE(ARMISD::ASRL) @@ -2189,7 +2218,7 @@ SDValue ARMTargetLowering::LowerCallResult( SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, - SDValue ThisVal) const { + SDValue ThisVal, bool isCmseNSCall) const { // Assign locations to each value returned by this call. 
SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, @@ -2267,6 +2296,15 @@ SDValue ARMTargetLowering::LowerCallResult( (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); + // On CMSE Non-secure Calls, call results (returned values) whose bitwidth + // is less than 32 bits must be sign- or zero-extended after the call for + // security reasons. Although the ABI mandates an extension done by the + // callee, the latter cannot be trusted to follow the rules of the ABI. + const ISD::InputArg &Arg = Ins[VA.getValNo()]; + if (isCmseNSCall && Arg.ArgVT.isScalarInteger() && + VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32)) + Val = handleCMSEValue(Val, Arg, DAG, dl); + InVals.push_back(Val); } @@ -2878,7 +2916,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // return. return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG, InVals, isThisReturn, - isThisReturn ? OutVals[0] : SDValue()); + isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall); } /// HandleByVal - Every parameter *after* a byval parameter is passed @@ -4481,8 +4519,6 @@ SDValue ARMTargetLowering::LowerFormalArguments( *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); - SmallVector<SDValue, 16> ArgValues; - SDValue ArgValue; Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; @@ -4537,6 +4573,7 @@ SDValue ARMTargetLowering::LowerFormalArguments( // Arguments stored in registers. 
if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); + SDValue ArgValue; if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { // f64 and vector types are split up into multiple registers or @@ -4600,16 +4637,6 @@ SDValue ARMTargetLowering::LowerFormalArguments( case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); break; - case CCValAssign::SExt: - ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); - break; - case CCValAssign::ZExt: - ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); - break; } // f16 arguments have their size extended to 4 bytes and passed as if they @@ -4619,6 +4646,15 @@ SDValue ARMTargetLowering::LowerFormalArguments( (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); + // On CMSE Entry Functions, formal integer arguments whose bitwidth is + // less than 32 bits must be sign- or zero-extended in the callee for + // security reasons. Although the ABI mandates an extension done by the + // caller, the latter cannot be trusted to follow the rules of the ABI. + const ISD::InputArg &Arg = Ins[VA.getValNo()]; + if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() && + RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32)) + ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl); + InVals.push_back(ArgValue); } else { // VA.isRegLoc() // Only arguments passed on the stack should make it here. @@ -13709,7 +13745,7 @@ static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, // t2: i64 = build_pair t1, t1:1 // t3: i64 = add t2, y // Otherwise we try to push the add up above VADDLVAx, to potentially allow - // the add to be simplified seperately. + // the add to be simplified separately. 
// We also need to check for sext / zext and commutitive adds. auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA, SDValue NB) { @@ -18470,9 +18506,9 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { } else if (CC == ARMCC::NE && !isNullConstant(RHS) && (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { // This seems pointless but will allow us to combine it further below. - // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 + // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1 SDValue Sub = - DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); + DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, Sub.getValue(1), SDValue()); Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, @@ -18484,9 +18520,9 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { // This seems pointless but will allow us to combine it further below // Note that we change == for != as this is the dual for the case above. - // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 + // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1 SDValue Sub = - DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); + DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, Sub.getValue(1), SDValue()); Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, @@ -18498,21 +18534,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { // On Thumb1, the DAG above may be further combined if z is a power of 2 // (z == 2 ^ K). 
- // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> + // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 -> // t1 = (USUBO (SUB x, y), 1) // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1) // Result = if K != 0 then (SHL t2:0, K) else t2:0 // // This also handles the special case of comparing against zero; it's - // essentially, the same pattern, except there's no SUBS: + // essentially, the same pattern, except there's no SUBC: // CMOV x, z, !=, (CMPZ x, 0) -> // t1 = (USUBO x, 1) // t2 = (USUBO_CARRY x, t1:0, t1:1) // Result = if K != 0 then (SHL t2:0, K) else t2:0 const APInt *TrueConst; if (Subtarget->isThumb1Only() && CC == ARMCC::NE && - ((FalseVal.getOpcode() == ARMISD::SUBS && - FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || + ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS && + FalseVal.getOperand(1) == RHS) || (FalseVal == LHS && isNullConstant(RHS))) && (TrueConst = isPowerOf2Constant(TrueVal))) { SDVTList VTs = DAG.getVTList(VT, MVT::i32); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index ed4df7edd16e..a255e9b6fc36 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -95,7 +95,6 @@ class VectorType; FMSTAT, // ARM fmstat instruction. CMOV, // ARM conditional move instructions. - SUBS, // Flag-setting subtraction. 
SSAT, // Signed saturation USAT, // Unsigned saturation @@ -244,7 +243,7 @@ class VectorType; VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask VADDLVApu, VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply - VMLAVu, // them and add the results together, returning an i32 of their sum + VMLAVu, // them and add the results together, returning an i32 of the sum VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask VMLAVpu, VMLALVs, // Same as VMLAV but with i64, returning the low and @@ -895,7 +894,7 @@ class VectorType; const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, - SDValue ThisVal) const; + SDValue ThisVal, bool isCmseNSCall) const; bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 1f7bd8dd3121..c6bcad8e2a82 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -160,7 +160,6 @@ def ARMintretglue : SDNode<"ARMISD::INTRET_GLUE", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, [SDNPInGlue]>; -def ARMsubs : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>; def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; @@ -3879,14 +3878,6 @@ let isAdd = 1 in defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>; defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>; -def : ARMPat<(ARMsubs GPR:$Rn, mod_imm:$imm), (SUBSri $Rn, mod_imm:$imm)>; -def : ARMPat<(ARMsubs GPR:$Rn, GPR:$Rm), (SUBSrr $Rn, $Rm)>; -def : ARMPat<(ARMsubs GPR:$Rn, so_reg_imm:$shift), - (SUBSrsi $Rn, so_reg_imm:$shift)>; -def : ARMPat<(ARMsubs GPR:$Rn, so_reg_reg:$shift), - (SUBSrsr $Rn, so_reg_reg:$shift)>; - - let isAdd = 1 in defm ADC : AI1_adde_sube_irs<0b0101, "adc", 
ARMadde, 1>; defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>; diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td index e7f405993513..2ad78f8cd8d4 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1400,12 +1400,6 @@ let hasPostISelHook = 1, Defs = [CPSR] in { Sched<[WriteALU]>; } - -def : T1Pat<(ARMsubs tGPR:$Rn, tGPR:$Rm), (tSUBSrr $Rn, $Rm)>; -def : T1Pat<(ARMsubs tGPR:$Rn, imm0_7:$imm3), (tSUBSi3 $Rn, imm0_7:$imm3)>; -def : T1Pat<(ARMsubs tGPR:$Rn, imm0_255:$imm8), (tSUBSi8 $Rn, imm0_255:$imm8)>; - - // Sign-extend byte def tSXTB : // A8.6.222 T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index f227d68deeb8..e133dbeba365 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2438,12 +2438,6 @@ defm t2SUB : T2I_bin_ii12rs<0b101, "sub", sub>; defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>; defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>; -def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_imm:$imm), - (t2SUBSri $Rn, t2_so_imm:$imm)>; -def : T2Pat<(ARMsubs GPRnopc:$Rn, rGPR:$Rm), (t2SUBSrr $Rn, $Rm)>; -def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_reg:$ShiftedRm), - (t2SUBSrs $Rn, t2_so_reg:$ShiftedRm)>; - defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1, 1>; defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube, 0, 1>; diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index 00a29f8ecb23..660f351bae64 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -157,8 +157,10 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) .legalForCartesianProduct({s32, s64}, {s32}); - getActionDefinitionsBuilder({G_GET_FPENV, 
G_SET_FPENV}).legalFor({s32}); + getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_GET_FPMODE}) + .legalFor({s32}); getActionDefinitionsBuilder(G_RESET_FPENV).alwaysLegal(); + getActionDefinitionsBuilder(G_SET_FPMODE).customFor({s32}); } else { getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV}) .libcallFor({s32, s64}); @@ -187,6 +189,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV}) .libcall(); + getActionDefinitionsBuilder({G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE}) + .libcall(); } // Just expand whatever loads and stores are left. @@ -439,6 +443,21 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, *ConstantInt::get(Ctx, AsInteger)); break; } + case G_SET_FPMODE: { + // New FPSCR = (FPSCR & FPStatusBits) | (Modes & ~FPStatusBits) + LLT FPEnvTy = LLT::scalar(32); + auto FPEnv = MRI.createGenericVirtualRegister(FPEnvTy); + Register Modes = MI.getOperand(0).getReg(); + MIRBuilder.buildGetFPEnv(FPEnv); + auto StatusBitMask = MIRBuilder.buildConstant(FPEnvTy, ARM::FPStatusBits); + auto StatusBits = MIRBuilder.buildAnd(FPEnvTy, FPEnv, StatusBitMask); + auto NotStatusBitMask = + MIRBuilder.buildConstant(FPEnvTy, ~ARM::FPStatusBits); + auto FPModeBits = MIRBuilder.buildAnd(FPEnvTy, Modes, NotStatusBitMask); + auto NewFPSCR = MIRBuilder.buildOr(FPEnvTy, StatusBits, FPModeBits); + MIRBuilder.buildSetFPEnv(NewFPSCR); + break; + } } MI.eraseFromParent(); diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 4a5b672f862b..e5e817f1ed9a 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2161,8 +2161,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + 
AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -2186,7 +2186,7 @@ char ARMPreAllocLoadStoreOpt::ID = 0; INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt", ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt", ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) @@ -2204,7 +2204,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); MRI = &Fn.getRegInfo(); - DT = &getAnalysis<MachineDominatorTree>(); + DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MF = &Fn; AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp index a6fdece10ba4..9234881c9407 100644 --- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -466,6 +466,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case G_GET_FPENV: case G_SET_FPENV: + case G_GET_FPMODE: OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); break; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index a8c6cd99633f..b66a41d06062 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -68,9 +68,6 @@ static cl::opt<bool> ForceFastISel("arm-force-fast-isel", cl::init(false), cl::Hidden); -static cl::opt<bool> EnableSubRegLiveness("arm-enable-subreg-liveness", - cl::init(false), cl::Hidden); - /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. 
ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, @@ -385,8 +382,6 @@ bool ARMSubtarget::enableMachineScheduler() const { } bool ARMSubtarget::enableSubRegLiveness() const { - if (EnableSubRegLiveness.getNumOccurrences()) - return EnableSubRegLiveness; // Enable SubRegLiveness for MVE to better optimize s subregs for mqpr regs // and q subregs for qqqqpr regs. return hasMVEIntegerOps(); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 7db2e8ee7e6f..4dc4d28724ef 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -199,17 +199,21 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { PatternMatch::m_Value(ArgArg)))) { return IC.replaceInstUsesWith(II, ArgArg); } - if (!II.getMetadata(LLVMContext::MD_range)) { - Type *IntTy32 = Type::getInt32Ty(II.getContext()); - Metadata *M[] = { - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))}; - II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); - II.setMetadata(LLVMContext::MD_noundef, - MDNode::get(II.getContext(), std::nullopt)); - return &II; + + if (II.getMetadata(LLVMContext::MD_range)) + break; + + ConstantRange Range(APInt(32, 0), APInt(32, 0x10000)); + + if (auto CurrentRange = II.getRange()) { + Range = Range.intersectWith(*CurrentRange); + if (Range == CurrentRange) + break; } - break; + + II.addRangeRetAttr(Range); + II.addRetAttr(Attribute::NoUndef); + return &II; } case Intrinsic::arm_mve_vadc: case Intrinsic::arm_mve_vadc_predicated: { diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index de7449a400a7..50a59ce76763 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -328,8 +328,9 @@ void 
ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx, // execute-only section in the object. MCSectionELF *TextSection = static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); - if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions()) { - for (auto &F : TextSection->getFragmentList()) + bool IsExecOnly = Sec.getFlags() & ELF::SHF_ARM_PURECODE; + if (IsExecOnly && !TextSection->hasInstructions()) { + for (auto &F : *TextSection) if (auto *DF = dyn_cast<MCDataFragment>(&F)) if (!DF->getContents().empty()) return; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index afd7dccbeca9..31b577b9c301 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -426,6 +426,8 @@ private: // Reset state between object emissions void reset() override; + void finish() override; + public: ARMTargetELFStreamer(MCStreamer &S) : ARMTargetStreamer(S), CurrentVendor("aeabi") {} @@ -459,8 +461,6 @@ public: ~ARMELFStreamer() override = default; - void finishImpl() override; - // ARM exception handling directives void emitFnStart(); void emitFnEnd(); @@ -479,7 +479,7 @@ public: MCObjectStreamer::emitFill(NumBytes, FillValue, Loc); } - void changeSection(MCSection *Section, const MCExpr *Subsection) override { + void changeSection(MCSection *Section, uint32_t Subsection) override { LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo); MCELFStreamer::changeSection(Section, Subsection); auto LastMappingSymbol = LastMappingSymbols.find(Section); @@ -487,7 +487,7 @@ public: LastEMSInfo = std::move(LastMappingSymbol->second); return; } - LastEMSInfo.reset(new ElfMappingSymbolInfo(SMLoc(), nullptr, 0)); + LastEMSInfo.reset(new ElfMappingSymbolInfo); } /// This function is the one used to emit instruction data into the ELF @@ -555,7 +555,7 @@ public: if (!LastEMSInfo->hasInfo()) return; ElfMappingSymbolInfo *EMS 
= LastEMSInfo.get(); - EmitMappingSymbol("$d", EMS->Loc, EMS->F, EMS->Offset); + emitMappingSymbol("$d", *EMS->F, EMS->Offset); EMS->resetInfo(); } @@ -625,17 +625,14 @@ private: }; struct ElfMappingSymbolInfo { - explicit ElfMappingSymbolInfo(SMLoc Loc, MCFragment *F, uint64_t O) - : Loc(Loc), F(F), Offset(O), State(EMS_None) {} void resetInfo() { F = nullptr; Offset = 0; } bool hasInfo() { return F != nullptr; } - SMLoc Loc; - MCFragment *F; - uint64_t Offset; - ElfMappingSymbol State; + MCDataFragment *F = nullptr; + uint64_t Offset = 0; + ElfMappingSymbol State = EMS_None; }; void emitDataMappingSymbol() { @@ -648,8 +645,7 @@ private: auto *DF = dyn_cast_or_null<MCDataFragment>(getCurrentFragment()); if (!DF) return; - EMS->Loc = SMLoc(); - EMS->F = getCurrentFragment(); + EMS->F = DF; EMS->Offset = DF->getContents().size(); LastEMSInfo->State = EMS_Data; return; @@ -683,11 +679,10 @@ private: Symbol->setBinding(ELF::STB_LOCAL); } - void EmitMappingSymbol(StringRef Name, SMLoc Loc, MCFragment *F, - uint64_t Offset) { + void emitMappingSymbol(StringRef Name, MCDataFragment &F, uint64_t Offset) { auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol( Name + "." 
+ Twine(MappingSymbolCounter++))); - emitLabelAtPos(Symbol, Loc, F, Offset); + emitLabelAtPos(Symbol, SMLoc(), F, Offset); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); } @@ -1118,12 +1113,9 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; } -void ARMELFStreamer::finishImpl() { - MCTargetStreamer &TS = *getTargetStreamer(); - ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); - ATS.finishAttributeSection(); - - MCELFStreamer::finishImpl(); +void ARMTargetELFStreamer::finish() { + ARMTargetStreamer::finish(); + finishAttributeSection(); } void ARMELFStreamer::reset() { diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index c9bbc41ac13b..4882e8533caf 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -59,8 +59,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -93,7 +93,7 @@ INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE, "ARM MVE TailPred and VPT Optimisations pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE, "ARM MVE TailPred and VPT Optimisations pass", false, false) @@ -1065,7 +1065,8 @@ bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); MRI = &Fn.getRegInfo(); MachineLoopInfo *MLI = 
&getAnalysis<MachineLoopInfo>(); - MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *DT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n" << "********** Function: " << Fn.getName() << '\n'); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 7732cb08e24d..8346b1c119ee 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -1346,11 +1346,11 @@ static void analyzeReturnValues(const SmallVectorImpl<ArgT> &Args, ArrayRef<MCPhysReg> RegList8; ArrayRef<MCPhysReg> RegList16; if (Tiny) { - RegList8 = ArrayRef(RegList8Tiny, std::size(RegList8Tiny)); - RegList16 = ArrayRef(RegList16Tiny, std::size(RegList16Tiny)); + RegList8 = ArrayRef(RegList8Tiny); + RegList16 = ArrayRef(RegList16Tiny); } else { - RegList8 = ArrayRef(RegList8AVR, std::size(RegList8AVR)); - RegList16 = ArrayRef(RegList16AVR, std::size(RegList16AVR)); + RegList8 = ArrayRef(RegList8AVR); + RegList16 = ArrayRef(RegList16AVR); } // GCC-ABI says that the size is rounded up to the next even number, diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index 88b1989ef917..6cfbf9c83dc3 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -761,28 +761,16 @@ let isBranch = 1, isTerminator = 1 in { // Conditional skipping on GPR register bits, and // conditional skipping on IO register bits. 
let isBarrier = 1 in { - def SBRCRrB : FRdB<0b10, (outs), - (ins GPR8 - : $rd, i8imm - : $b), - "sbrc\t$rd, $b", []>; + def SBRCRrB : FRdB<0b10, (outs), (ins GPR8:$rd, i8imm:$b), "sbrc\t$rd, $b", + []>; - def SBRSRrB : FRdB<0b11, (outs), - (ins GPR8 - : $rd, i8imm - : $b), - "sbrs\t$rd, $b", []>; - - def SBICAb : FIOBIT<0b01, (outs), - (ins imm_port5 - : $addr, i8imm - : $b), + def SBRSRrB : FRdB<0b11, (outs), (ins GPR8:$rd, i8imm:$b), "sbrs\t$rd, $b", + []>; + + def SBICAb : FIOBIT<0b01, (outs), (ins imm_port5:$addr, i8imm:$b), "sbic\t$addr, $b", []>; - def SBISAb : FIOBIT<0b11, (outs), - (ins imm_port5 - : $addr, i8imm - : $b), + def SBISAb : FIOBIT<0b11, (outs), (ins imm_port5:$addr, i8imm:$b), "sbis\t$addr, $b", []>; } @@ -790,18 +778,12 @@ let isBranch = 1, isTerminator = 1 in { let Uses = [SREG] in { // BRBS s, k // Branch if `s` flag in status register is set. - def BRBSsk : FSK<0, (outs), - (ins i8imm - : $s, relbrtarget_7 - : $k), + def BRBSsk : FSK<0, (outs), (ins i8imm:$s, relbrtarget_7:$k), "brbs\t$s, $k", []>; // BRBC s, k // Branch if `s` flag in status register is clear. - def BRBCsk : FSK<1, (outs), - (ins i8imm - : $s, relbrtarget_7 - : $k), + def BRBCsk : FSK<1, (outs), (ins i8imm:$s, relbrtarget_7:$k), "brbc\t$s, $k", []>; } } @@ -852,53 +834,29 @@ def : InstAlias<"brid\t$k", (BRBCsk 7, relbrtarget_7 : $k)>; // Based on status register. We cannot simplify these into instruction aliases // because we also need to be able to specify a pattern to match for ISel. 
let isBranch = 1, isTerminator = 1, Uses = [SREG] in { - def BREQk : FBRsk<0, 0b001, (outs), - (ins relbrtarget_7 - : $k), - "breq\t$k", [(AVRbrcond bb - : $k, AVR_COND_EQ)]>; - - def BRNEk : FBRsk<1, 0b001, (outs), - (ins relbrtarget_7 - : $k), - "brne\t$k", [(AVRbrcond bb - : $k, AVR_COND_NE)]>; - - def BRSHk : FBRsk<1, 0b000, (outs), - (ins relbrtarget_7 - : $k), - "brsh\t$k", [(AVRbrcond bb - : $k, AVR_COND_SH)]>; - - def BRLOk : FBRsk<0, 0b000, (outs), - (ins relbrtarget_7 - : $k), - "brlo\t$k", [(AVRbrcond bb - : $k, AVR_COND_LO)]>; - - def BRMIk : FBRsk<0, 0b010, (outs), - (ins relbrtarget_7 - : $k), - "brmi\t$k", [(AVRbrcond bb - : $k, AVR_COND_MI)]>; - - def BRPLk : FBRsk<1, 0b010, (outs), - (ins relbrtarget_7 - : $k), - "brpl\t$k", [(AVRbrcond bb - : $k, AVR_COND_PL)]>; - - def BRGEk : FBRsk<1, 0b100, (outs), - (ins relbrtarget_7 - : $k), - "brge\t$k", [(AVRbrcond bb - : $k, AVR_COND_GE)]>; - - def BRLTk : FBRsk<0, 0b100, (outs), - (ins relbrtarget_7 - : $k), - "brlt\t$k", [(AVRbrcond bb - : $k, AVR_COND_LT)]>; + def BREQk : FBRsk<0, 0b001, (outs), (ins relbrtarget_7:$k), "breq\t$k", + [(AVRbrcond bb:$k, AVR_COND_EQ)]>; + + def BRNEk : FBRsk<1, 0b001, (outs), (ins relbrtarget_7:$k), "brne\t$k", + [(AVRbrcond bb:$k, AVR_COND_NE)]>; + + def BRSHk : FBRsk<1, 0b000, (outs), (ins relbrtarget_7:$k), "brsh\t$k", + [(AVRbrcond bb:$k, AVR_COND_SH)]>; + + def BRLOk : FBRsk<0, 0b000, (outs), (ins relbrtarget_7:$k), "brlo\t$k", + [(AVRbrcond bb:$k, AVR_COND_LO)]>; + + def BRMIk : FBRsk<0, 0b010, (outs), (ins relbrtarget_7:$k), "brmi\t$k", + [(AVRbrcond bb:$k, AVR_COND_MI)]>; + + def BRPLk : FBRsk<1, 0b010, (outs), (ins relbrtarget_7:$k), "brpl\t$k", + [(AVRbrcond bb:$k, AVR_COND_PL)]>; + + def BRGEk : FBRsk<1, 0b100, (outs), (ins relbrtarget_7:$k), "brge\t$k", + [(AVRbrcond bb:$k, AVR_COND_GE)]>; + + def BRLTk : FBRsk<0, 0b100, (outs), (ins relbrtarget_7:$k), "brlt\t$k", + [(AVRbrcond bb:$k, AVR_COND_LT)]>; } 
//===----------------------------------------------------------------------===// @@ -906,62 +864,37 @@ let isBranch = 1, isTerminator = 1, Uses = [SREG] in { //===----------------------------------------------------------------------===// // 8 and 16-bit register move instructions. let hasSideEffects = 0 in { - def MOVRdRr : FRdRr<0b0010, 0b11, - (outs GPR8 - : $rd), - (ins GPR8 - : $rr), + def MOVRdRr : FRdRr<0b0010, 0b11, (outs GPR8:$rd), (ins GPR8:$rr), "mov\t$rd, $rr", []>; - def MOVWRdRr : FMOVWRdRr<(outs DREGS - : $rd), - (ins DREGS - : $rr), - "movw\t$rd, $rr", []>, + def MOVWRdRr : FMOVWRdRr<(outs DREGS:$rd), (ins DREGS:$rr), "movw\t$rd, $rr", + []>, Requires<[HasMOVW]>; } // Load immediate values into registers. let isReMaterializable = 1 in { - def LDIRdK : FRdK<0b1110, - (outs LD8 - : $rd), - (ins imm_ldi8 - : $k), - "ldi\t$rd, $k", [(set i8 - : $rd, imm - : $k)]>; + def LDIRdK : FRdK<0b1110, (outs LD8:$rd), (ins imm_ldi8:$k), "ldi\t$rd, $k", + [(set i8:$rd, imm:$k)]>; // LDIW Rd+1:Rd, K+1:K // // Expands to: // ldi Rd, K // ldi Rd+1, K+1 - def LDIWRdK : Pseudo<(outs DLDREGS - : $dst), - (ins i16imm - : $src), - "ldiw\t$dst, $src", [(set i16 - : $dst, imm - : $src)]>; + def LDIWRdK : Pseudo<(outs DLDREGS:$dst), (ins i16imm:$src), + "ldiw\t$dst, $src", [(set i16:$dst, imm:$src)]>; } // Load from data space into register. let canFoldAsLoad = 1, isReMaterializable = 1 in { - def LDSRdK : F32DM<0b0, - (outs GPR8 - : $rd), - (ins imm16 - : $k), - "lds\t$rd, $k", [(set i8 - : $rd, (load imm - : $k))]>, + def LDSRdK : F32DM<0b0, (outs GPR8:$rd), (ins imm16:$k), "lds\t$rd, $k", + [(set i8:$rd, (load imm:$k))]>, Requires<[HasSRAM, HasNonTinyEncoding]>; // Load from data space into register, which is only available on AVRTiny. 
def LDSRdKTiny : FLDSSTSTINY<0b0, (outs LD8:$rd), (ins imm7tiny:$k), - "lds\t$rd, $k", - [(set i8:$rd, (load imm:$k))]>, + "lds\t$rd, $k", [(set i8:$rd, (load imm:$k))]>, Requires<[HasSRAM, HasTinyEncoding]>; // LDSW Rd+1:Rd, K+1:K @@ -969,26 +902,16 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // Expands to: // lds Rd, (K+1:K) // lds Rd+1 (K+1:K) + 1 - def LDSWRdK : Pseudo<(outs DREGS - : $dst), - (ins i16imm - : $src), - "ldsw\t$dst, $src", [(set i16 - : $dst, (load imm - : $src))]>, + def LDSWRdK : Pseudo<(outs DREGS:$dst), (ins i16imm:$src), "ldsw\t$dst, $src", + [(set i16:$dst, (load imm:$src))]>, Requires<[HasSRAM, HasNonTinyEncoding]>; } // Indirect loads. let canFoldAsLoad = 1, isReMaterializable = 1 in { - def LDRdPtr : FSTLD<0, 0b00, - (outs GPR8 - : $reg), - (ins LDSTPtrReg - : $ptrreg), - "ld\t$reg, $ptrreg", [(set GPR8 - : $reg, (load i16 - : $ptrreg))]>, + def LDRdPtr : FSTLD<0, 0b00, (outs GPR8:$reg), (ins LDSTPtrReg:$ptrreg), + "ld\t$reg, $ptrreg", + [(set GPR8:$reg, (load i16:$ptrreg))]>, Requires<[HasSRAM]>; // LDW Rd+1:Rd, P @@ -1001,13 +924,8 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // ld Rd+1, P+ // subiw P, 2 let Constraints = "@earlyclobber $reg" in def LDWRdPtr - : Pseudo<(outs DREGS - : $reg), - (ins PTRDISPREGS - : $ptrreg), - "ldw\t$reg, $ptrreg", [(set i16 - : $reg, (load i16 - : $ptrreg))]>, + : Pseudo<(outs DREGS:$reg), (ins PTRDISPREGS:$ptrreg), + "ldw\t$reg, $ptrreg", [(set i16:$reg, (load i16:$ptrreg))]>, Requires<[HasSRAM]>; } @@ -1027,21 +945,12 @@ let mayLoad = 1, hasSideEffects = 0, // Expands to: // ld Rd, P+ // ld Rd+1, P+ - def LDWRdPtrPi : Pseudo<(outs DREGS - : $reg, PTRREGS - : $base_wb), - (ins PTRREGS - : $ptrreg), - "ldw\t$reg, $ptrreg+", []>, + def LDWRdPtrPi : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb), + (ins PTRREGS:$ptrreg), "ldw\t$reg, $ptrreg+", []>, Requires<[HasSRAM]>; - def LDRdPtrPd : FSTLD<0, 0b10, - (outs GPR8 - : $reg, PTRREGS - : $base_wb), - (ins LDSTPtrReg - : $ptrreg), - 
"ld\t$reg, -$ptrreg", []>, + def LDRdPtrPd : FSTLD<0, 0b10, (outs GPR8:$reg, PTRREGS:$base_wb), + (ins LDSTPtrReg:$ptrreg), "ld\t$reg, -$ptrreg", []>, Requires<[HasSRAM]>; // LDW Rd+1:Rd, -P @@ -1049,27 +958,17 @@ let mayLoad = 1, hasSideEffects = 0, // Expands to: // ld Rd+1, -P // ld Rd, -P - def LDWRdPtrPd : Pseudo<(outs DREGS - : $reg, PTRREGS - : $base_wb), - (ins PTRREGS - : $ptrreg), - "ldw\t$reg, -$ptrreg", []>, + def LDWRdPtrPd : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb), + (ins PTRREGS:$ptrreg), "ldw\t$reg, -$ptrreg", []>, Requires<[HasSRAM]>; } // Load indirect with displacement operations. let canFoldAsLoad = 1, isReMaterializable = 1 in { - def LDDRdPtrQ - : FSTDLDD<0, - (outs GPR8 - : $reg), - (ins memri - : $memri), - "ldd\t$reg, $memri", [(set i8 - : $reg, (load addr - : $memri))]>, - Requires<[HasSRAM, HasNonTinyEncoding]>; + def LDDRdPtrQ : FSTDLDD<0, (outs GPR8:$reg), (ins memri:$memri), + "ldd\t$reg, $memri", + [(set i8:$reg, (load addr:$memri))]>, + Requires<[HasSRAM, HasNonTinyEncoding]>; // LDDW Rd+1:Rd, P+q // @@ -1081,15 +980,11 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // ld Rd, P+ // ld Rd+1, P+ // subiw P, q+2 - let Constraints = "@earlyclobber $dst" in def LDDWRdPtrQ - : Pseudo<(outs DREGS - : $dst), - (ins memri - : $memri), - "lddw\t$dst, $memri", [(set i16 - : $dst, (load addr - : $memri))]>, - Requires<[HasSRAM]>; + let Constraints = "@earlyclobber $dst" in + def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst), (ins memri:$memri), + "lddw\t$dst, $memri", + [(set i16:$dst, (load addr:$memri))]>, + Requires<[HasSRAM]>; // An identical pseudo instruction to LDDWRdPtrQ, expect restricted to the Y // register and without the @earlyclobber flag. @@ -1107,35 +1002,23 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // The pseudo expansion pass trivially expands this into LDDWRdPtrQ. // // This instruction may be removed once PR13375 is fixed. 
- let mayLoad = 1, - hasSideEffects = 0 in def LDDWRdYQ : Pseudo<(outs DREGS - : $dst), - (ins memri - : $memri), - "lddw\t$dst, $memri", []>, - Requires<[HasSRAM]>; + let mayLoad = 1, hasSideEffects = 0 in + def LDDWRdYQ : Pseudo<(outs DREGS:$dst), (ins memri:$memri), + "lddw\t$dst, $memri", []>, + Requires<[HasSRAM]>; } class AtomicLoad<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC> - : Pseudo<(outs DRC - : $rd), - (ins PTRRC - : $rr), - "atomic_op", [(set DRC - : $rd, (Op i16 - : $rr))]>; + : Pseudo<(outs DRC:$rd), (ins PTRRC:$rr), "atomic_op", + [(set DRC:$rd, (Op i16:$rr))]>; class AtomicStore<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC> - : Pseudo<(outs), - (ins PTRRC - : $rd, DRC - : $rr), - "atomic_op", [(Op DRC:$rr, i16:$rd)]>; + : Pseudo<(outs), (ins PTRRC:$rd, DRC:$rr), "atomic_op", + [(Op DRC:$rr, i16:$rd)]>; class AtomicLoadOp<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC> - : Pseudo<(outs DRC:$rd), - (ins PTRRC:$rr, DRC:$operand), - "atomic_op", [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>; + : Pseudo<(outs DRC:$rd), (ins PTRRC:$rr, DRC:$operand), "atomic_op", + [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>; // Atomic instructions // =================== @@ -1156,28 +1039,24 @@ class AtomicLoadOp8<PatFrag Op> : AtomicLoadOp<Op, GPR8, PTRREGS>; class AtomicLoadOp16<PatFrag Op> : AtomicLoadOp<Op, DREGS, PTRDISPREGS>; let usesCustomInserter=1 in { - def AtomicLoadAdd8 : AtomicLoadOp8<atomic_load_add_8>; - def AtomicLoadAdd16 : AtomicLoadOp16<atomic_load_add_16>; - def AtomicLoadSub8 : AtomicLoadOp8<atomic_load_sub_8>; - def AtomicLoadSub16 : AtomicLoadOp16<atomic_load_sub_16>; - def AtomicLoadAnd8 : AtomicLoadOp8<atomic_load_and_8>; - def AtomicLoadAnd16 : AtomicLoadOp16<atomic_load_and_16>; - def AtomicLoadOr8 : AtomicLoadOp8<atomic_load_or_8>; - def AtomicLoadOr16 : AtomicLoadOp16<atomic_load_or_16>; - def AtomicLoadXor8 : AtomicLoadOp8<atomic_load_xor_8>; - def AtomicLoadXor16 : AtomicLoadOp16<atomic_load_xor_16>; + def AtomicLoadAdd8 : 
AtomicLoadOp8<atomic_load_add_i8>; + def AtomicLoadAdd16 : AtomicLoadOp16<atomic_load_add_i16>; + def AtomicLoadSub8 : AtomicLoadOp8<atomic_load_sub_i8>; + def AtomicLoadSub16 : AtomicLoadOp16<atomic_load_sub_i16>; + def AtomicLoadAnd8 : AtomicLoadOp8<atomic_load_and_i8>; + def AtomicLoadAnd16 : AtomicLoadOp16<atomic_load_and_i16>; + def AtomicLoadOr8 : AtomicLoadOp8<atomic_load_or_i8>; + def AtomicLoadOr16 : AtomicLoadOp16<atomic_load_or_i16>; + def AtomicLoadXor8 : AtomicLoadOp8<atomic_load_xor_i8>; + def AtomicLoadXor16 : AtomicLoadOp16<atomic_load_xor_i16>; } + def AtomicFence : Pseudo<(outs), (ins), "atomic_fence", [(atomic_fence timm, timm)]>; // Indirect store from register to data space. -def STSKRr : F32DM<0b1, (outs), - (ins imm16 - : $k, GPR8 - : $rd), - "sts\t$k, $rd", [(store i8 - : $rd, imm - : $k)]>, +def STSKRr : F32DM<0b1, (outs), (ins imm16:$k, GPR8:$rd), "sts\t$k, $rd", + [(store i8:$rd, imm:$k)]>, Requires<[HasSRAM, HasNonTinyEncoding]>; // Store from register to data space, which is only available on AVRTiny. @@ -1190,25 +1069,15 @@ def STSKRrTiny : FLDSSTSTINY<0b1, (outs), (ins imm7tiny:$k, LD8:$rd), // Expands to: // sts Rr+1, (K+1:K) + 1 // sts Rr, (K+1:K) -def STSWKRr : Pseudo<(outs), - (ins i16imm - : $dst, DREGS - : $src), - "stsw\t$dst, $src", [(store i16 - : $src, imm - : $dst)]>, +def STSWKRr : Pseudo<(outs), (ins i16imm:$dst, DREGS:$src), + "stsw\t$dst, $src", [(store i16:$src, imm:$dst)]>, Requires<[HasSRAM, HasNonTinyEncoding]>; // Indirect stores. // ST P, Rr // Stores the value of Rr into the location addressed by pointer P. 
-def STPtrRr : FSTLD<1, 0b00, (outs), - (ins LDSTPtrReg - : $ptrreg, GPR8 - : $reg), - "st\t$ptrreg, $reg", [(store GPR8 - : $reg, i16 - : $ptrreg)]>, +def STPtrRr : FSTLD<1, 0b00, (outs), (ins LDSTPtrReg:$ptrreg, GPR8:$reg), + "st\t$ptrreg, $reg", [(store GPR8:$reg, i16:$ptrreg)]>, Requires<[HasSRAM]>; // STW P, Rr+1:Rr @@ -1221,13 +1090,8 @@ def STPtrRr : FSTLD<1, 0b00, (outs), // st P+, Rr // st P+, Rr+1 // subiw P, q+2 -def STWPtrRr : Pseudo<(outs), - (ins PTRDISPREGS - : $ptrreg, DREGS - : $reg), - "stw\t$ptrreg, $reg", [(store i16 - : $reg, i16 - : $ptrreg)]>, +def STWPtrRr : Pseudo<(outs), (ins PTRDISPREGS:$ptrreg, DREGS:$reg), + "stw\t$ptrreg, $reg", [(store i16:$reg, i16:$ptrreg)]>, Requires<[HasSRAM]>; // Indirect stores (with postincrement or predecrement). @@ -1236,18 +1100,11 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in { // ST P+, Rr // Stores the value of Rr into the location addressed by pointer P. // Post increments P. - def STPtrPiRr : FSTLD<1, 0b01, - (outs LDSTPtrReg - : $base_wb), - (ins LDSTPtrReg - : $ptrreg, GPR8 - : $reg, i8imm - : $offs), - "st\t$ptrreg+, $reg", [(set i16 - : $base_wb, (post_store GPR8 - : $reg, i16 - : $ptrreg, imm - : $offs))]>, + def STPtrPiRr : FSTLD<1, 0b01, (outs LDSTPtrReg:$base_wb), + (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs), + "st\t$ptrreg+, $reg", + [(set i16:$base_wb, (post_store GPR8:$reg, i16:$ptrreg, + imm:$offs))]>, Requires<[HasSRAM]>; // STW P+, Rr+1:Rr @@ -1257,34 +1114,22 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in { // Expands to: // st P+, Rr // st P+, Rr+1 - def STWPtrPiRr : Pseudo<(outs PTRREGS - : $base_wb), - (ins PTRREGS - : $ptrreg, DREGS - : $trh, i8imm - : $offs), - "stw\t$ptrreg+, $trh", [(set PTRREGS - : $base_wb, (post_store DREGS - : $trh, PTRREGS - : $ptrreg, imm - : $offs))]>, + def STWPtrPiRr : Pseudo<(outs PTRREGS:$base_wb), + (ins PTRREGS:$ptrreg, DREGS:$trh, i8imm:$offs), + "stw\t$ptrreg+, $trh", + [(set PTRREGS:$base_wb, + 
(post_store DREGS:$trh, PTRREGS:$ptrreg, + imm:$offs))]>, Requires<[HasSRAM]>; // ST -P, Rr // Stores the value of Rr into the location addressed by pointer P. // Pre decrements P. - def STPtrPdRr : FSTLD<1, 0b10, - (outs LDSTPtrReg - : $base_wb), - (ins LDSTPtrReg - : $ptrreg, GPR8 - : $reg, i8imm - : $offs), - "st\t-$ptrreg, $reg", [(set i16 - : $base_wb, (pre_store GPR8 - : $reg, i16 - : $ptrreg, imm - : $offs))]>, + def STPtrPdRr : FSTLD<1, 0b10, (outs LDSTPtrReg:$base_wb), + (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs), + "st\t-$ptrreg, $reg", + [(set i16: $base_wb, + (pre_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>, Requires<[HasSRAM]>; // STW -P, Rr+1:Rr @@ -1294,17 +1139,11 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in { // Expands to: // st -P, Rr+1 // st -P, Rr - def STWPtrPdRr : Pseudo<(outs PTRREGS - : $base_wb), - (ins PTRREGS - : $ptrreg, DREGS - : $reg, i8imm - : $offs), - "stw\t-$ptrreg, $reg", [(set PTRREGS - : $base_wb, (pre_store i16 - : $reg, i16 - : $ptrreg, imm - : $offs))]>, + def STWPtrPdRr : Pseudo<(outs PTRREGS:$base_wb), + (ins PTRREGS:$ptrreg, DREGS:$reg, i8imm:$offs), + "stw\t-$ptrreg, $reg", + [(set PTRREGS:$base_wb, + (pre_store i16:$reg, i16:$ptrreg, imm:$offs))]>, Requires<[HasSRAM]>; } @@ -1312,13 +1151,8 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in { // STD P+q, Rr // Stores the value of Rr into the location addressed by pointer P with a // displacement of q. Does not modify P. 
-def STDPtrQRr : FSTDLDD<1, (outs), - (ins memri - : $memri, GPR8 - : $reg), - "std\t$memri, $reg", [(store i8 - : $reg, addr - : $memri)]>, +def STDPtrQRr : FSTDLDD<1, (outs), (ins memri:$memri, GPR8:$reg), + "std\t$memri, $reg", [(store i8:$reg, addr:$memri)]>, Requires<[HasSRAM, HasNonTinyEncoding]>; // STDW P+q, Rr+1:Rr @@ -1333,13 +1167,8 @@ def STDPtrQRr : FSTDLDD<1, (outs), // st P+, Rr // st P+, Rr+1 // subiw P, q+2 -def STDWPtrQRr : Pseudo<(outs), - (ins memri - : $memri, DREGS - : $src), - "stdw\t$memri, $src", [(store i16 - : $src, addr - : $memri)]>, +def STDWPtrQRr : Pseudo<(outs), (ins memri:$memri, DREGS:$src), + "stdw\t$memri, $src", [(store i16:$src, addr:$memri)]>, Requires<[HasSRAM]>; // Load program memory operations. diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 66c57952a7f1..55989f5eb6a3 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -807,7 +807,7 @@ class XADD<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode> let Constraints = "$dst = $val" in { let Predicates = [BPFNoALU32] in { - def XADDW : XADD<BPF_W, "u32", atomic_load_add_32>; + def XADDW : XADD<BPF_W, "u32", atomic_load_add_i32>; } } @@ -897,23 +897,23 @@ class XFALU32<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr, let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { - def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_32>; - def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_32>; - def XFORW32 : XFALU32<BPF_W, BPF_OR, "u32", "or", atomic_load_or_32>; - def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_32>; + def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_i32>; + def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_i32>; + def XFORW32 : XFALU32<BPF_W, BPF_OR, "u32", "or", atomic_load_or_i32>; + def XFXORW32 : XFALU32<BPF_W, BPF_XOR, 
"u32", "xor", atomic_load_xor_i32>; } - def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_64>; - def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_64>; - def XFORD : XFALU64<BPF_DW, BPF_OR, "u64", "or", atomic_load_or_64>; - def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_64>; + def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_i64>; + def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_i64>; + def XFORD : XFALU64<BPF_DW, BPF_OR, "u64", "or", atomic_load_or_i64>; + def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_i64>; } // atomic_load_sub can be represented as a neg followed // by an atomic_load_add. -def : Pat<(atomic_load_sub_32 ADDRri:$addr, GPR32:$val), +def : Pat<(atomic_load_sub_i32 ADDRri:$addr, GPR32:$val), (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>; -def : Pat<(atomic_load_sub_64 ADDRri:$addr, GPR:$val), +def : Pat<(atomic_load_sub_i64 ADDRri:$addr, GPR:$val), (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>; // Atomic Exchange @@ -953,10 +953,10 @@ class XCHG32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode> let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { - def XCHGW32 : XCHG32<BPF_W, "32", atomic_swap_32>; + def XCHGW32 : XCHG32<BPF_W, "32", atomic_swap_i32>; } - def XCHGD : XCHG<BPF_DW, "64", atomic_swap_64>; + def XCHGD : XCHG<BPF_DW, "64", atomic_swap_i64>; } // Compare-And-Exchange @@ -996,11 +996,11 @@ class CMPXCHG32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode> let Predicates = [BPFHasALU32], Defs = [W0], Uses = [W0], DecoderNamespace = "BPFALU32" in { - def CMPXCHGW32 : CMPXCHG32<BPF_W, "32", atomic_cmp_swap_32>; + def CMPXCHGW32 : CMPXCHG32<BPF_W, "32", atomic_cmp_swap_i32>; } let Defs = [R0], Uses = [R0] in { - def CMPXCHGD : CMPXCHG<BPF_DW, "64", atomic_cmp_swap_64>; + def CMPXCHGD : CMPXCHG<BPF_DW, "64", atomic_cmp_swap_i64>; } // bswap16, bswap32, bswap64 diff 
--git a/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/llvm/lib/Target/BPF/BPFRegisterInfo.cpp index 8761e4aa258c..84af6806abb3 100644 --- a/llvm/lib/Target/BPF/BPFRegisterInfo.cpp +++ b/llvm/lib/Target/BPF/BPFRegisterInfo.cpp @@ -47,9 +47,17 @@ BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -static void WarnSize(int Offset, MachineFunction &MF, DebugLoc& DL) -{ +static void WarnSize(int Offset, MachineFunction &MF, DebugLoc& DL, + MachineBasicBlock& MBB) { if (Offset <= -BPFStackSizeOption) { + if (!DL) + /* try harder to get some debug loc */ + for (auto &I : MBB) + if (I.getDebugLoc()) { + DL = I.getDebugLoc(); + break; + } + const Function &F = MF.getFunction(); DiagnosticInfoUnsupported DiagStackSize( F, @@ -73,14 +81,6 @@ bool BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MBB.getParent(); DebugLoc DL = MI.getDebugLoc(); - if (!DL) - /* try harder to get some debug loc */ - for (auto &I : MBB) - if (I.getDebugLoc()) { - DL = I.getDebugLoc(); - break; - } - while (!MI.getOperand(i).isFI()) { ++i; assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); @@ -93,7 +93,7 @@ bool BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (MI.getOpcode() == BPF::MOV_rr) { int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex); - WarnSize(Offset, MF, DL); + WarnSize(Offset, MF, DL, MBB); MI.getOperand(i).ChangeToRegister(FrameReg, false); Register reg = MI.getOperand(i - 1).getReg(); BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg) @@ -108,7 +108,7 @@ bool BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!isInt<32>(Offset)) llvm_unreachable("bug in frame offset"); - WarnSize(Offset, MF, DL); + WarnSize(Offset, MF, DL, MBB); if (MI.getOpcode() == BPF::FI_ri) { // architecture does not really support FI_ri, replace it with diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp 
index 7b73c9f4a1e4..7d91fa8bb824 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -113,8 +113,7 @@ static Expected<bool> parseBPFPreserveStaticOffsetOptions(StringRef Params) { "BPFPreserveStaticOffsetPass"); } -void BPFTargetMachine::registerPassBuilderCallbacks( - PassBuilder &PB, bool PopulateClassToPassNames) { +void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #define GET_PASS_REGISTRY "BPFPassRegistry.def" #include "llvm/Passes/TargetPassRegistry.inc" diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.h b/llvm/lib/Target/BPF/BPFTargetMachine.h index 0a28394463b2..4e6adc722e76 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.h +++ b/llvm/lib/Target/BPF/BPFTargetMachine.h @@ -42,8 +42,7 @@ public: return TLOF.get(); } - void registerPassBuilderCallbacks(PassBuilder &PB, - bool PopulateClassToPassNames) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; }; } diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index 4acdd571f6c9..97bdd4c45a8c 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -218,7 +218,7 @@ public: bool runOnMachineFunction(MachineFunction &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 67e04c212a69..f49ccfe01f72 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -37,6 +37,8 @@ class DXContainerGlobals : public llvm::ModulePass { GlobalVariable *buildSignature(Module &M, Signature &Sig, StringRef Name, StringRef SectionName); void addSignature(Module &M, SmallVector<GlobalValue *> 
&Globals); + void addPipelineStateValidationInfo(Module &M, + SmallVector<GlobalValue *> &Globals); public: static char ID; // Pass identification, replacement for typeid @@ -63,6 +65,7 @@ bool DXContainerGlobals::runOnModule(Module &M) { Globals.push_back(getFeatureFlags(M)); Globals.push_back(computeShaderHash(M)); addSignature(M, Globals); + addPipelineStateValidationInfo(M, Globals); appendToCompilerUsed(M, Globals); return true; } @@ -133,6 +136,34 @@ void DXContainerGlobals::addSignature(Module &M, Globals.emplace_back(buildSignature(M, OutputSig, "dx.osg1", "OSG1")); } +void DXContainerGlobals::addPipelineStateValidationInfo( + Module &M, SmallVector<GlobalValue *> &Globals) { + SmallString<256> Data; + raw_svector_ostream OS(Data); + PSVRuntimeInfo PSV; + Triple TT(M.getTargetTriple()); + PSV.BaseData.MinimumWaveLaneCount = 0; + PSV.BaseData.MaximumWaveLaneCount = std::numeric_limits<uint32_t>::max(); + PSV.BaseData.ShaderStage = + static_cast<uint8_t>(TT.getEnvironment() - Triple::Pixel); + + // Hardcoded values here to unblock loading the shader into D3D. + // + // TODO: Lots more stuff to do here! + // + // See issue https://github.com/llvm/llvm-project/issues/96674. 
+ PSV.BaseData.NumThreadsX = 1; + PSV.BaseData.NumThreadsY = 1; + PSV.BaseData.NumThreadsZ = 1; + PSV.EntryName = "main"; + + PSV.finalize(TT.getEnvironment()); + PSV.write(OS); + Constant *Constant = + ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); + Globals.emplace_back(buildContainerGlobal(M, Constant, "dx.psv0", "PSV0")); +} + char DXContainerGlobals::ID = 0; INITIALIZE_PASS_BEGIN(DXContainerGlobals, "dxil-globals", "DXContainer Global Emitter", false, true) diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 24a0c8524230..adaaa2a6e0d4 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -269,6 +269,25 @@ def Sin : DXILOpMapping<13, unary, int_sin, def Tan : DXILOpMapping<14, unary, int_tan, "Returns tangent(theta) for theta in radians.", [llvm_halforfloat_ty, LLVMMatchType<0>]>; +def ACos : DXILOpMapping<15, unary, int_acos, + "Returns the arccosine of each component of input.", + [llvm_halforfloat_ty, LLVMMatchType<0>]>; +def ASin : DXILOpMapping<16, unary, int_asin, + "Returns the arcsine of each component of input.", + [llvm_halforfloat_ty, LLVMMatchType<0>]>; +def ATan : DXILOpMapping<17, unary, int_atan, + "Returns the arctangent of each component of input.", + [llvm_halforfloat_ty, LLVMMatchType<0>]>; +def HCos : DXILOpMapping<18, unary, int_cosh, + "Returns the hyperbolic cosine of the specified value.", + [llvm_halforfloat_ty, LLVMMatchType<0>]>; +def HSin : DXILOpMapping<19, unary, int_sinh, + "Returns the hyperbolic sine of the specified value.", + [llvm_halforfloat_ty, LLVMMatchType<0>]>; +def HTan : DXILOpMapping<20, unary, int_tanh, + "Returns the hyperbolic tan of the specified value.", + [llvm_halforfloat_ty, LLVMMatchType<0>]>; + def Exp2 : DXILOpMapping<21, unary, int_exp2, "Returns the base 2 exponential, or 2**x, of the specified value." 
"exp2(x) = 2**x.", diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index c853393e4282..e6dbb25b710e 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -102,8 +102,7 @@ DirectXTargetMachine::DirectXTargetMachine(const Target &T, const Triple &TT, DirectXTargetMachine::~DirectXTargetMachine() {} -void DirectXTargetMachine::registerPassBuilderCallbacks( - PassBuilder &PB, bool PopulateClassToPassNames) { +void DirectXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #define GET_PASS_REGISTRY "DirectXPassRegistry.def" #include "llvm/Passes/TargetPassRegistry.inc" } diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.h b/llvm/lib/Target/DirectX/DirectXTargetMachine.h index 428beaf61cd0..d04c375b2736 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.h +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.h @@ -47,8 +47,7 @@ public: } TargetTransformInfo getTargetTransformInfo(const Function &F) const override; - void registerPassBuilderCallbacks(PassBuilder &PB, - bool PopulateClassToPassNames) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; }; } // namespace llvm diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 092cccbcca9c..6e5e2a61bd77 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -748,7 +748,8 @@ bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) { Subsection = HexagonMCExpr::create( MCConstantExpr::create(8192 + Res, getContext()), getContext()); - getStreamer().subSection(Subsection); + getStreamer().switchSection(getStreamer().getCurrentSectionOnly(), + Subsection); return false; } diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 
4c18e076c439..99745941d579 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -219,8 +219,8 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -285,7 +285,7 @@ char HexagonBitSimplify::ID = 0; INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexagon-bit-simplify", "Hexagon bit simplification", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(HexagonBitSimplify, "hexagon-bit-simplify", "Hexagon bit simplification", false, false) @@ -2800,7 +2800,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { auto &HRI = *HST.getRegisterInfo(); auto &HII = *HST.getInstrInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MachineRegisterInfo &MRI = MF.getRegInfo(); bool Changed; diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index f2a02fe9540b..f0933765bbcb 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -218,8 +218,8 @@ namespace { HexagonConstExtenders() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -569,7 +569,7 @@ namespace { INITIALIZE_PASS_BEGIN(HexagonConstExtenders, "hexagon-cext-opt", "Hexagon constant-extender optimization", false, false) 
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(HexagonConstExtenders, "hexagon-cext-opt", "Hexagon constant-extender optimization", false, false) @@ -1973,7 +1973,7 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) { HST = &MF.getSubtarget<HexagonSubtarget>(); HII = HST->getInstrInfo(); HRI = HST->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MRI = &MF.getRegInfo(); AssignmentMap IMap; diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp index 97917270601b..a5c47e67de89 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp @@ -50,8 +50,8 @@ public: AU.addRequired<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index cb820e215899..03f6882e6889 100644 --- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -162,8 +162,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBranchProbabilityInfo>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -1054,7 +1054,7 @@ bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) { TRI = ST.getRegisterInfo(); MFN = &MF; MRI = 
&MF.getRegInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MLI = &getAnalysis<MachineLoopInfo>(); MBPI = EnableHexagonBP ? &getAnalysis<MachineBranchProbabilityInfo>() : nullptr; diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 204f3b6b20c7..8a23b7743e83 100644 --- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -155,8 +155,8 @@ namespace { AU.addRequired<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -254,7 +254,7 @@ namespace llvm { INITIALIZE_PASS_BEGIN(HexagonExpandCondsets, "expand-condsets", "Hexagon Expand Condsets", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(HexagonExpandCondsets, "expand-condsets", @@ -1277,7 +1277,7 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) { HII = static_cast<const HexagonInstrInfo*>(MF.getSubtarget().getInstrInfo()); TRI = MF.getSubtarget().getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); LIS = &getAnalysis<LiveIntervals>(); MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 232651132d6e..f4f84beea734 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -413,9 +413,9 @@ void 
HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF, auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); MachineDominatorTree MDT; - MDT.runOnMachineFunction(MF); + MDT.calculate(MF); MachinePostDominatorTree MPT; - MPT.runOnMachineFunction(MF); + MPT.recalculate(MF); using UnsignedMap = DenseMap<unsigned, unsigned>; using RPOTType = ReversePostOrderTraversal<const MachineFunction *>; diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 1e373f6061bb..a4304b053166 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -515,8 +515,8 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -1497,7 +1497,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { HRI = ST.getRegisterInfo(); MFN = &MF; MRI = &MF.getRegInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); // Clean up before any further processing, so that dead code does not // get used in a newly generated "insert" instruction. 
Have a custom @@ -1607,6 +1607,6 @@ FunctionPass *llvm::createHexagonGenInsert() { INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert", "Hexagon generate \"insert\" instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert", "Hexagon generate \"insert\" instructions", false, false) diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp index afd49631943f..651ccc2db9ba 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp @@ -56,8 +56,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } bool runOnMachineFunction(MachineFunction &Fn) override; @@ -82,7 +82,8 @@ bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) { MRI = &Fn.getRegInfo(); TRI = Fn.getRegInfo().getTargetRegisterInfo(); - MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); + MachineDominatorTree &MDT = + getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); // Loop over all of the basic blocks for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 92e743273611..5bb2d7d80ad5 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -93,8 +93,8 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + 
AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -130,7 +130,7 @@ char HexagonGenPredicate::ID = 0; INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred", "Hexagon generate predicate operations", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred", "Hexagon generate predicate operations", false, false) diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 31e37dcce415..19a024078b10 100644 --- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -118,7 +118,7 @@ namespace { StringRef getPassName() const override { return "Hexagon Hardware Loops"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -368,7 +368,7 @@ namespace { INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops", "Hexagon Hardware Loops", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops", "Hexagon Hardware Loops", false, false) @@ -386,7 +386,7 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis<MachineLoopInfo>(); MRI = &MF.getRegInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>(); TII = HST.getInstrInfo(); TRI = HST.getRegisterInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 
7777ae23e8ae..5a383b23a833 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -770,8 +770,7 @@ bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI, // select +++ ? T : 0 Value *U = *SelI->user_begin(); - if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) && - !match(U, m_Xor(m_Value(R), m_Specific(SelI)))) + if (!match(U, m_c_Xor(m_Specific(SelI), m_Value(R)))) return false; // Matched: xor (select +++ ? 0 : T), R // xor (select +++ ? T : 0), R @@ -814,15 +813,13 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI, CmpInst::Predicate P; bool TrueIfZero; - if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) || - match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) { + if (match(CondV, m_c_ICmp(P, m_Value(C), m_Zero()))) { if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE) return false; // Matched: select C == 0 ? ... : ... // select C != 0 ? ... : ... TrueIfZero = (P == CmpInst::ICMP_EQ); - } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) || - match(CondV, m_ICmp(P, m_One(), m_Value(C)))) { + } else if (match(CondV, m_c_ICmp(P, m_Value(C), m_One()))) { if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE) return false; // Matched: select C == 1 ? ... : ... @@ -832,8 +829,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI, return false; Value *X = nullptr; - if (!match(C, m_And(m_Value(X), m_One())) && - !match(C, m_And(m_One(), m_Value(X)))) + if (!match(C, m_c_And(m_Value(X), m_One()))) return false; // Matched: select (X & 1) == +++ ? ... : ... // select (X & 1) != +++ ? ... : ... @@ -845,8 +841,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI, if (!match(TrueV, m_LShr(m_Value(R), m_One()))) return false; // Matched: select +++ ? (R >> 1) : ... 
- if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) && - !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV)))) + if (!match(FalseV, m_c_Xor(m_Specific(TrueV), m_Value(Q)))) return false; // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q // with commuting ^. @@ -856,8 +851,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI, if (!match(FalseV, m_LShr(m_Value(R), m_One()))) return false; // Matched: select +++ ? ... : (R >> 1) - if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) && - !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV)))) + if (!match(TrueV, m_c_Xor(m_Specific(FalseV), m_Value(Q)))) return false; // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1) // with commuting ^. diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 0e82bf6e5331..e7f5c257b21c 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -70,7 +70,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineDominanceFrontier>(); AU.setPreservesAll(); } @@ -122,7 +122,7 @@ char HexagonOptAddrMode::ID = 0; INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode", false, false) @@ -872,7 +872,7 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) { HII = HST.getInstrInfo(); HRI = HST.getRegisterInfo(); const auto &MDF = getAnalysis<MachineDominanceFrontier>(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); 
DataFlowGraph G(MF, *HII, *HRI, *MDT, MDF); // Need to keep dead phis because we can propagate uses of registers into diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 4131f2a31755..3c17f6800114 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -63,7 +63,7 @@ namespace { HexagonRDFOpt() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineDominanceFrontier>(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); @@ -109,7 +109,7 @@ char HexagonRDFOpt::ID = 0; INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "hexagon-rdf-opt", "Hexagon RDF optimizations", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(HexagonRDFOpt, "hexagon-rdf-opt", "Hexagon RDF optimizations", false, false) @@ -302,7 +302,7 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) { RDFCount++; } - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const auto &MDF = getAnalysis<MachineDominanceFrontier>(); const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index da8ab5c4b21b..5e713842ff67 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -55,10 +55,6 @@ static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched", cl::Hidden, cl::desc("Disable Hexagon MI Scheduling")); -static cl::opt<bool> EnableSubregLiveness( - "hexagon-subreg-liveness", cl::Hidden, 
cl::init(true), - cl::desc("Enable subregister liveness tracking for Hexagon")); - static cl::opt<bool> OverrideLongCalls( "hexagon-long-calls", cl::Hidden, cl::desc("If present, forces/disables the use of long calls")); @@ -726,9 +722,7 @@ unsigned HexagonSubtarget::getL1PrefetchDistance() const { return 32; } -bool HexagonSubtarget::enableSubRegLiveness() const { - return EnableSubregLiveness; -} +bool HexagonSubtarget::enableSubRegLiveness() const { return true; } Intrinsic::ID HexagonSubtarget::getIntrinsicId(unsigned Opc) const { struct Scalar { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index e4886506de19..b362285d4f16 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -312,8 +312,7 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } -void HexagonTargetMachine::registerPassBuilderCallbacks( - PassBuilder &PB, bool PopulateClassToPassNames) { +void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #define GET_PASS_REGISTRY "HexagonPassRegistry.def" #include "llvm/Passes/TargetPassRegistry.inc" diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index 34ff45b6acf3..6e9a78b76650 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -35,8 +35,7 @@ public: ~HexagonTargetMachine() override; const HexagonSubtarget *getSubtargetImpl(const Function &F) const override; - void registerPassBuilderCallbacks(PassBuilder &PB, - bool PopulateClassToPassNames) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; TargetPassConfig *createPassConfig(PassManagerBase &PM) override; TargetTransformInfo getTargetTransformInfo(const Function &F) const override; diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp 
b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 56472d633694..2d5352b08cae 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -97,9 +97,9 @@ namespace { AU.setPreservesCFG(); AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MachineBranchProbabilityInfo>(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -124,7 +124,7 @@ char HexagonPacketizer::ID = 0; INITIALIZE_PASS_BEGIN(HexagonPacketizer, "hexagon-packetizer", "Hexagon Packetizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index f9a0ba3608e6..54efe4bc25ef 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -712,18 +712,20 @@ public: void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override { + SmallVector<MCFragment *> Frags; for (auto *I : Layout.getSectionOrder()) { - auto &Fragments = I->getFragmentList(); - for (auto &J : Fragments) { - switch (J.getKind()) { + Frags.clear(); + for (MCFragment &F : *I) + Frags.push_back(&F); + for (size_t J = 0, E = Frags.size(); J != E; ++J) { + switch (Frags[J]->getKind()) { default: break; case MCFragment::FT_Align: { - auto Size = Asm.computeFragmentSize(Layout, J); - for (auto K = J.getIterator(); - K != Fragments.begin() && Size >= HEXAGON_PACKET_SIZE;) { + 
auto Size = Asm.computeFragmentSize(Layout, *Frags[J]); + for (auto K = J; K != 0 && Size >= HEXAGON_PACKET_SIZE;) { --K; - switch (K->getKind()) { + switch (Frags[K]->getKind()) { default: break; case MCFragment::FT_Align: { @@ -733,7 +735,7 @@ public: } case MCFragment::FT_Relaxable: { MCContext &Context = Asm.getContext(); - auto &RF = cast<MCRelaxableFragment>(*K); + auto &RF = cast<MCRelaxableFragment>(*Frags[K]); auto &Inst = const_cast<MCInst &>(RF.getInst()); while (Size > 0 && HexagonMCInstrInfo::bundleSize(Inst) < MaxPacketSize) { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 9d7e4636abac..f0a18b42f481 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -785,6 +785,7 @@ SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); + SDValue Load; switch (M) { default: @@ -796,33 +797,49 @@ SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, // This is not actually used, but is necessary for successfully matching // the PseudoLA_*_LARGE nodes. SDValue Tmp = DAG.getConstant(0, DL, Ty); - if (IsLocal) + if (IsLocal) { // This generates the pattern (PseudoLA_PCREL_LARGE tmp sym), that // eventually becomes the desired 5-insn code sequence. - return SDValue(DAG.getMachineNode(LoongArch::PseudoLA_PCREL_LARGE, DL, Ty, + Load = SDValue(DAG.getMachineNode(LoongArch::PseudoLA_PCREL_LARGE, DL, Ty, Tmp, Addr), 0); - - // This generates the pattern (PseudoLA_GOT_LARGE tmp sym), that eventually - // becomes the desired 5-insn code sequence. - return SDValue( - DAG.getMachineNode(LoongArch::PseudoLA_GOT_LARGE, DL, Ty, Tmp, Addr), - 0); + } else { + // This generates the pattern (PseudoLA_GOT_LARGE tmp sym), that + // eventually becomes the desired 5-insn code sequence. 
+ Load = SDValue( + DAG.getMachineNode(LoongArch::PseudoLA_GOT_LARGE, DL, Ty, Tmp, Addr), + 0); + } + break; } case CodeModel::Small: case CodeModel::Medium: - if (IsLocal) + if (IsLocal) { // This generates the pattern (PseudoLA_PCREL sym), which expands to // (addi.w/d (pcalau12i %pc_hi20(sym)) %pc_lo12(sym)). - return SDValue( + Load = SDValue( DAG.getMachineNode(LoongArch::PseudoLA_PCREL, DL, Ty, Addr), 0); + } else { + // This generates the pattern (PseudoLA_GOT sym), which expands to (ld.w/d + // (pcalau12i %got_pc_hi20(sym)) %got_pc_lo12(sym)). + Load = + SDValue(DAG.getMachineNode(LoongArch::PseudoLA_GOT, DL, Ty, Addr), 0); + } + } - // This generates the pattern (PseudoLA_GOT sym), which expands to (ld.w/d - // (pcalau12i %got_pc_hi20(sym)) %got_pc_lo12(sym)). - return SDValue(DAG.getMachineNode(LoongArch::PseudoLA_GOT, DL, Ty, Addr), - 0); + if (!IsLocal) { + // Mark the load instruction as invariant to enable hoisting in MachineLICM. + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MemOp = MF.getMachineMemOperand( + MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT(Ty.getSimpleVT()), Align(Ty.getFixedSizeInBits() / 8)); + DAG.setNodeMemRefs(cast<MachineSDNode>(Load.getNode()), {MemOp}); } + + return Load; } SDValue LoongArchTargetLowering::lowerBlockAddress(SDValue Op, @@ -860,7 +877,7 @@ SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op, SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, - unsigned Opc, + unsigned Opc, bool UseGOT, bool Large) const { SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); @@ -873,6 +890,16 @@ SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, SDValue Offset = Large ? 
SDValue(DAG.getMachineNode(Opc, DL, Ty, Tmp, Addr), 0) : SDValue(DAG.getMachineNode(Opc, DL, Ty, Addr), 0); + if (UseGOT) { + // Mark the load instruction as invariant to enable hoisting in MachineLICM. + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MemOp = MF.getMachineMemOperand( + MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT(Ty.getSimpleVT()), Align(Ty.getFixedSizeInBits() / 8)); + DAG.setNodeMemRefs(cast<MachineSDNode>(Offset.getNode()), {MemOp}); + } // Add the thread pointer. return DAG.getNode(ISD::ADD, DL, Ty, Offset, @@ -945,6 +972,10 @@ LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op, GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); assert(N->getOffset() == 0 && "unexpected offset in global node"); + if (DAG.getTarget().useEmulatedTLS()) + report_fatal_error("the emulated TLS is prohibited", + /*GenCrashDiag=*/false); + bool IsDesc = DAG.getTarget().useTLSDESC(); switch (getTargetMachine().getTLSModel(N->getGlobal())) { @@ -972,13 +1003,14 @@ LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op, return getStaticTLSAddr(N, DAG, Large ? LoongArch::PseudoLA_TLS_IE_LARGE : LoongArch::PseudoLA_TLS_IE, - Large); + /*UseGOT=*/true, Large); case TLSModel::LocalExec: // This model is used when static linking as the TLS offsets are resolved // during program linking. // // This node doesn't need an extra argument for the large code model. 
- return getStaticTLSAddr(N, DAG, LoongArch::PseudoLA_TLS_LE); + return getStaticTLSAddr(N, DAG, LoongArch::PseudoLA_TLS_LE, + /*UseGOT=*/false); } return getTLSDescAddr(N, DAG, diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 9328831a17a3..f4c57f80fdbe 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -267,7 +267,7 @@ private: SDValue getAddr(NodeTy *N, SelectionDAG &DAG, CodeModel::Model M, bool IsLocal = true) const; SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, - unsigned Opc, bool Large = false) const; + unsigned Opc, bool UseGOT, bool Large = false) const; SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, unsigned Opc, bool Large = false) const; SDValue getTLSDescAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 6b75634f5b2e..a85b054a85d7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -229,6 +229,21 @@ unsigned LoongArchInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return MI.getDesc().getSize(); } +bool LoongArchInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { + const unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + case LoongArch::ADDI_D: + case LoongArch::ORI: + case LoongArch::XORI: + return (MI.getOperand(1).isReg() && + MI.getOperand(1).getReg() == LoongArch::R0) || + (MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0); + } + return MI.isAsCheapAsAMove(); +} + MachineBasicBlock * LoongArchInstrInfo::getBranchDestBlock(const MachineInstr &MI) const { assert(MI.getDesc().isBranch() && "Unexpected opcode!"); diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index 
3b80f55bc84f..eb19051e380c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -52,6 +52,8 @@ public: unsigned getInstSizeInBytes(const MachineInstr &MI) const override; + bool isAsCheapAsAMove(const MachineInstr &MI) const override; + MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override; bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index f72f46e39e2a..67c3dfd3b259 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -751,7 +751,8 @@ def XOR : ALU_3R<0x00158000>; def ANDN : ALU_3R<0x00168000>; def ORN : ALU_3R<0x00160000>; def ANDI : ALU_2RI12<0x03400000, uimm12>; -let isReMaterializable = 1 in { +// See LoongArchInstrInfo::isAsCheapAsAMove for more details. +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def ORI : ALU_2RI12<0x03800000, uimm12_ori>; def XORI : ALU_2RI12<0x03c00000, uimm12>; } @@ -858,7 +859,8 @@ def ADD_D : ALU_3R<0x00108000>; def SUB_D : ALU_3R<0x00118000>; // ADDI_D isn't always rematerializable, but isReMaterializable will be used as // a hint which is verified in isReallyTriviallyReMaterializable. -let isReMaterializable = 1 in { +// See LoongArchInstrInfo::isAsCheapAsAMove for more details. 
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def ADDI_D : ALU_2RI12<0x02c00000, simm12_addlike>; } def ADDU16I_D : ALU_2RI16<0x10000000, simm16>; @@ -1580,13 +1582,26 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst), "la.abs", "$dst, $src">; def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.pcrel", "$dst, $src">; -let Defs = [R20], Size = 20 in +def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.ld", "$dst, $src">; +def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.gd", "$dst, $src">; +let Defs = [R20], Size = 20 in { def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.pcrel", "$dst, $tmp, $src">, Requires<[IsLA64]>; def PseudoLA_TLS_LE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.tls.le", "$dst, $src">; +def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst), + (ins GPR:$tmp, bare_symbol:$src), [], + "la.tls.ld", "$dst, $tmp, $src">, + Requires<[IsLA64]>; +def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst), + (ins GPR:$tmp, bare_symbol:$src), [], + "la.tls.gd", "$dst, $tmp, $src">, + Requires<[IsLA64]>; +} // Defs = [R20], Size = 20 } let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, isAsmParserOnly = 1 in { @@ -1594,10 +1609,6 @@ def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.got", "$dst, $src">; def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.tls.ie", "$dst, $src">; -def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], - "la.tls.ld", "$dst, $src">; -def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], - "la.tls.gd", "$dst, $src">; let Defs = [R20], Size = 20 in { def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], @@ -1607,14 +1618,6 @@ def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.tls.ie", "$dst, $tmp, 
$src">, Requires<[IsLA64]>; -def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst), - (ins GPR:$tmp, bare_symbol:$src), [], - "la.tls.ld", "$dst, $tmp, $src">, - Requires<[IsLA64]>; -def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst), - (ins GPR:$tmp, bare_symbol:$src), [], - "la.tls.gd", "$dst, $tmp, $src">, - Requires<[IsLA64]>; } // Defs = [R20], Size = 20 } @@ -2041,60 +2044,60 @@ multiclass ternary_atomic_op_failure_ord { }]>; } -defm atomic_cmp_swap_32 : ternary_atomic_op_failure_ord; -defm atomic_cmp_swap_64 : ternary_atomic_op_failure_ord; +defm atomic_cmp_swap_i32 : ternary_atomic_op_failure_ord; +defm atomic_cmp_swap_i64 : ternary_atomic_op_failure_ord; let Predicates = [IsLA64] in { def : AtomicPat<int_loongarch_masked_atomicrmw_xchg_i64, PseudoMaskedAtomicSwap32>; -def : Pat<(atomic_swap_32 GPR:$addr, GPR:$incr), +def : Pat<(atomic_swap_i32 GPR:$addr, GPR:$incr), (AMSWAP__DB_W GPR:$incr, GPR:$addr)>; -def : Pat<(atomic_swap_64 GPR:$addr, GPR:$incr), +def : Pat<(atomic_swap_i64 GPR:$addr, GPR:$incr), (AMSWAP__DB_D GPR:$incr, GPR:$addr)>; -def : Pat<(atomic_load_add_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_add_i64 GPR:$rj, GPR:$rk), (AMADD__DB_D GPR:$rk, GPR:$rj)>; def : AtomicPat<int_loongarch_masked_atomicrmw_add_i64, PseudoMaskedAtomicLoadAdd32>; -def : Pat<(atomic_load_sub_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_sub_i32 GPR:$rj, GPR:$rk), (AMADD__DB_W (SUB_W R0, GPR:$rk), GPR:$rj)>; -def : Pat<(atomic_load_sub_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_sub_i64 GPR:$rj, GPR:$rk), (AMADD__DB_D (SUB_D R0, GPR:$rk), GPR:$rj)>; def : AtomicPat<int_loongarch_masked_atomicrmw_sub_i64, PseudoMaskedAtomicLoadSub32>; -defm : PseudoBinPat<"atomic_load_nand_64", PseudoAtomicLoadNand64>; +defm : PseudoBinPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64>; def : AtomicPat<int_loongarch_masked_atomicrmw_nand_i64, PseudoMaskedAtomicLoadNand32>; -def : Pat<(atomic_load_add_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_add_i32 GPR:$rj, GPR:$rk), 
(AMADD__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_and_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_and_i32 GPR:$rj, GPR:$rk), (AMAND__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_and_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_and_i64 GPR:$rj, GPR:$rk), (AMAND__DB_D GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_or_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_or_i32 GPR:$rj, GPR:$rk), (AMOR__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_or_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_or_i64 GPR:$rj, GPR:$rk), (AMOR__DB_D GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_xor_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_xor_i32 GPR:$rj, GPR:$rk), (AMXOR__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_xor_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_xor_i64 GPR:$rj, GPR:$rk), (AMXOR__DB_D GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_umin_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_umin_i32 GPR:$rj, GPR:$rk), (AMMIN__DB_WU GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_umin_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_umin_i64 GPR:$rj, GPR:$rk), (AMMIN__DB_DU GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_umax_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_umax_i32 GPR:$rj, GPR:$rk), (AMMAX__DB_WU GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_umax_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_umax_i64 GPR:$rj, GPR:$rk), (AMMAX__DB_DU GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_min_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_min_i32 GPR:$rj, GPR:$rk), (AMMIN__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_min_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_min_i64 GPR:$rj, GPR:$rk), (AMMIN__DB_D GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_max_32 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_max_i32 GPR:$rj, GPR:$rk), (AMMAX__DB_W GPR:$rk, GPR:$rj)>; -def : Pat<(atomic_load_max_64 GPR:$rj, GPR:$rk), +def : Pat<(atomic_load_max_i64 GPR:$rj, GPR:$rk), (AMMAX__DB_D GPR:$rk, GPR:$rj)>; def : AtomicPat<int_loongarch_masked_atomicrmw_umax_i64, @@ -2118,8 +2121,8 @@ 
multiclass PseudoCmpXchgPat<string Op, Pseudo CmpXchgInst, (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>; } -defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>; -defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64, i64>; +defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32>; +defm : PseudoCmpXchgPat<"atomic_cmp_swap_i64", PseudoCmpXchg64, i64>; def : Pat<(int_loongarch_masked_cmpxchg_i64 GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$fail_order), (PseudoMaskedCmpXchg32 @@ -2131,23 +2134,23 @@ def : PseudoMaskedAMMinMaxPat<int_loongarch_masked_atomicrmw_min_i64, PseudoMaskedAtomicLoadMin32>; } // Predicates = [IsLA64] -defm : PseudoBinPat<"atomic_load_nand_32", PseudoAtomicLoadNand32>; +defm : PseudoBinPat<"atomic_load_nand_i32", PseudoAtomicLoadNand32>; let Predicates = [IsLA32] in { def : AtomicPat<int_loongarch_masked_atomicrmw_xchg_i32, PseudoMaskedAtomicSwap32>; -defm : PseudoBinPat<"atomic_swap_32", PseudoAtomicSwap32>; +defm : PseudoBinPat<"atomic_swap_i32", PseudoAtomicSwap32>; def : AtomicPat<int_loongarch_masked_atomicrmw_add_i32, PseudoMaskedAtomicLoadAdd32>; def : AtomicPat<int_loongarch_masked_atomicrmw_sub_i32, PseudoMaskedAtomicLoadSub32>; def : AtomicPat<int_loongarch_masked_atomicrmw_nand_i32, PseudoMaskedAtomicLoadNand32>; -defm : PseudoBinPat<"atomic_load_add_32", PseudoAtomicLoadAdd32>; -defm : PseudoBinPat<"atomic_load_sub_32", PseudoAtomicLoadSub32>; -defm : PseudoBinPat<"atomic_load_and_32", PseudoAtomicLoadAnd32>; -defm : PseudoBinPat<"atomic_load_or_32", PseudoAtomicLoadOr32>; -defm : PseudoBinPat<"atomic_load_xor_32", PseudoAtomicLoadXor32>; +defm : PseudoBinPat<"atomic_load_add_i32", PseudoAtomicLoadAdd32>; +defm : PseudoBinPat<"atomic_load_sub_i32", PseudoAtomicLoadSub32>; +defm : PseudoBinPat<"atomic_load_and_i32", PseudoAtomicLoadAnd32>; +defm : PseudoBinPat<"atomic_load_or_i32", PseudoAtomicLoadOr32>; +defm : PseudoBinPat<"atomic_load_xor_i32", PseudoAtomicLoadXor32>; } // Predicates = [IsLA32] 
/// Intrinsics diff --git a/llvm/lib/Target/M68k/M68kInstrAtomics.td b/llvm/lib/Target/M68k/M68kInstrAtomics.td index 40c6593e2cfa..84a662533542 100644 --- a/llvm/lib/Target/M68k/M68kInstrAtomics.td +++ b/llvm/lib/Target/M68k/M68kInstrAtomics.td @@ -35,7 +35,7 @@ def CAS32 : MxCASOp<0x3, MxType32d>; foreach size = [8, 16, 32] in { - def : Pat<(!cast<SDPatternOperator>("atomic_cmp_swap_"#size) MxCP_ARI:$ptr, + def : Pat<(!cast<SDPatternOperator>("atomic_cmp_swap_i"#size) MxCP_ARI:$ptr, !cast<MxRegOp>("MxDRD"#size):$cmp, !cast<MxRegOp>("MxDRD"#size):$new), (!cast<MxInst>("CAS"#size) !cast<MxRegOp>("MxDRD"#size):$cmp, diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index e907e8d8a700..f861268c0015 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -90,8 +90,7 @@ void MipsELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { Labels.push_back(Symbol); } -void MipsELFStreamer::switchSection(MCSection *Section, - const MCExpr *Subsection) { +void MipsELFStreamer::switchSection(MCSection *Section, uint32_t Subsection) { MCELFStreamer::switchSection(Section, Subsection); Labels.clear(); } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index 051806d2cfe8..1e8042e88c9e 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -50,8 +50,7 @@ public: /// Overriding this function allows us to dismiss all labels that are /// candidates for marking as microMIPS when .section directive is processed. 
- void switchSection(MCSection *Section, - const MCExpr *Subsection = nullptr) override; + void switchSection(MCSection *Section, uint32_t Subsection = 0) override; /// Overriding these functions allows us to dismiss all labels that are /// candidates for marking as microMIPS when .word/.long/.4byte etc diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp index f1aa90d24023..6b013de27477 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -20,7 +20,6 @@ using namespace llvm; void MipsRegInfoRecord::EmitMipsOptionRecord() { - MCAssembler &MCA = Streamer->getAssembler(); MipsTargetStreamer *MTS = static_cast<MipsTargetStreamer *>(Streamer->getTargetStreamer()); @@ -36,7 +35,6 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { MCSectionELF *Sec = Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS, ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1); - MCA.registerSection(*Sec); Sec->setAlignment(Align(8)); Streamer->switchSection(Sec); @@ -54,7 +52,6 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { } else { MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC, 24); - MCA.registerSection(*Sec); Sec->setAlignment(MTS->getABI().IsN32() ? 
Align(8) : Align(4)); Streamer->switchSection(Sec); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index bdf299ae8ac1..2f39d091d86d 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -890,14 +890,15 @@ void MipsTargetELFStreamer::emitLabel(MCSymbol *S) { void MipsTargetELFStreamer::finish() { MCAssembler &MCA = getStreamer().getAssembler(); const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo(); + MCELFStreamer &S = getStreamer(); // .bss, .text and .data are always at least 16-byte aligned. MCSection &TextSection = *OFI.getTextSection(); - MCA.registerSection(TextSection); + S.changeSection(&TextSection); MCSection &DataSection = *OFI.getDataSection(); - MCA.registerSection(DataSection); + S.changeSection(&DataSection); MCSection &BSSSection = *OFI.getBSSSection(); - MCA.registerSection(BSSSection); + S.changeSection(&BSSSection); TextSection.ensureMinAlignment(Align(16)); DataSection.ensureMinAlignment(Align(16)); @@ -908,16 +909,15 @@ void MipsTargetELFStreamer::finish() { // verifying the output of IAS against the output of other assemblers but // it's not necessary to produce a correct object and increases section // size. 
- MCStreamer &OS = getStreamer(); - for (MCSection &S : MCA) { - MCSectionELF &Section = static_cast<MCSectionELF &>(S); + for (MCSection &Sec : MCA) { + MCSectionELF &Section = static_cast<MCSectionELF &>(Sec); Align Alignment = Section.getAlign(); - OS.switchSection(&Section); + S.switchSection(&Section); if (Section.useCodeAlign()) - OS.emitCodeAlignment(Alignment, &STI, Alignment.value()); + S.emitCodeAlignment(Alignment, &STI, Alignment.value()); else - OS.emitValueToAlignment(Alignment, 0, 1, Alignment.value()); + S.emitValueToAlignment(Alignment, 0, 1, Alignment.value()); } } @@ -1015,19 +1015,15 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { MCContext &Context = MCA.getContext(); MCStreamer &OS = getStreamer(); + OS.pushSection(); MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS, 0); + OS.switchSection(Sec); + Sec->setAlignment(Align(4)); MCSymbol *Sym = Context.getOrCreateSymbol(Name); const MCSymbolRefExpr *ExprRef = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context); - MCA.registerSection(*Sec); - Sec->setAlignment(Align(4)); - - OS.pushSection(); - - OS.switchSection(Sec); - OS.emitValueImpl(ExprRef, 4); OS.emitIntValue(GPRInfoSet ? 
GPRBitMask : 0, 4); // reg_mask @@ -1306,9 +1302,8 @@ void MipsTargetELFStreamer::emitMipsAbiFlags() { MCStreamer &OS = getStreamer(); MCSectionELF *Sec = Context.getELFSection( ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24); - MCA.registerSection(*Sec); - Sec->setAlignment(Align(8)); OS.switchSection(Sec); + Sec->setAlignment(Align(8)); OS << ABIFlagsSection; } diff --git a/llvm/lib/Target/Mips/Mips64InstrInfo.td b/llvm/lib/Target/Mips/Mips64InstrInfo.td index c0e7eef8dd9d..f6ac3091a3ba 100644 --- a/llvm/lib/Target/Mips/Mips64InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips64InstrInfo.td @@ -75,18 +75,18 @@ def assertzext_lt_i32 : PatFrag<(ops node:$src), (assertzext node:$src), [{ // Instructions specific format //===----------------------------------------------------------------------===// let usesCustomInserter = 1 in { - def ATOMIC_LOAD_ADD_I64 : Atomic2Ops<atomic_load_add_64, GPR64>; - def ATOMIC_LOAD_SUB_I64 : Atomic2Ops<atomic_load_sub_64, GPR64>; - def ATOMIC_LOAD_AND_I64 : Atomic2Ops<atomic_load_and_64, GPR64>; - def ATOMIC_LOAD_OR_I64 : Atomic2Ops<atomic_load_or_64, GPR64>; - def ATOMIC_LOAD_XOR_I64 : Atomic2Ops<atomic_load_xor_64, GPR64>; - def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_64, GPR64>; - def ATOMIC_SWAP_I64 : Atomic2Ops<atomic_swap_64, GPR64>; - def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>; - def ATOMIC_LOAD_MIN_I64 : Atomic2Ops<atomic_load_min_64, GPR64>; - def ATOMIC_LOAD_MAX_I64 : Atomic2Ops<atomic_load_max_64, GPR64>; - def ATOMIC_LOAD_UMIN_I64 : Atomic2Ops<atomic_load_umin_64, GPR64>; - def ATOMIC_LOAD_UMAX_I64 : Atomic2Ops<atomic_load_umax_64, GPR64>; + def ATOMIC_LOAD_ADD_I64 : Atomic2Ops<atomic_load_add_i64, GPR64>; + def ATOMIC_LOAD_SUB_I64 : Atomic2Ops<atomic_load_sub_i64, GPR64>; + def ATOMIC_LOAD_AND_I64 : Atomic2Ops<atomic_load_and_i64, GPR64>; + def ATOMIC_LOAD_OR_I64 : Atomic2Ops<atomic_load_or_i64, GPR64>; + def ATOMIC_LOAD_XOR_I64 : Atomic2Ops<atomic_load_xor_i64, GPR64>; + def 
ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_i64, GPR64>; + def ATOMIC_SWAP_I64 : Atomic2Ops<atomic_swap_i64, GPR64>; + def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_i64, GPR64>; + def ATOMIC_LOAD_MIN_I64 : Atomic2Ops<atomic_load_min_i64, GPR64>; + def ATOMIC_LOAD_MAX_I64 : Atomic2Ops<atomic_load_max_i64, GPR64>; + def ATOMIC_LOAD_UMIN_I64 : Atomic2Ops<atomic_load_umin_i64, GPR64>; + def ATOMIC_LOAD_UMAX_I64 : Atomic2Ops<atomic_load_umax_i64, GPR64>; } def ATOMIC_LOAD_ADD_I64_POSTRA : Atomic2OpsPostRA<GPR64>; diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index dda33f9a1808..62dfa5f71106 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -1014,7 +1014,7 @@ void MipsAsmPrinter::EmitFPCallStub( MCSectionELF *M = OutContext.getELFSection( ".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR); - OutStreamer->switchSection(M, nullptr); + OutStreamer->switchSection(M); // // .align 2 // diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index 23e04c442bf6..85e3e78d2a4d 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -1904,45 +1904,45 @@ def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), } let usesCustomInserter = 1 in { - def ATOMIC_LOAD_ADD_I8 : Atomic2Ops<atomic_load_add_8, GPR32>; - def ATOMIC_LOAD_ADD_I16 : Atomic2Ops<atomic_load_add_16, GPR32>; - def ATOMIC_LOAD_ADD_I32 : Atomic2Ops<atomic_load_add_32, GPR32>; - def ATOMIC_LOAD_SUB_I8 : Atomic2Ops<atomic_load_sub_8, GPR32>; - def ATOMIC_LOAD_SUB_I16 : Atomic2Ops<atomic_load_sub_16, GPR32>; - def ATOMIC_LOAD_SUB_I32 : Atomic2Ops<atomic_load_sub_32, GPR32>; - def ATOMIC_LOAD_AND_I8 : Atomic2Ops<atomic_load_and_8, GPR32>; - def ATOMIC_LOAD_AND_I16 : Atomic2Ops<atomic_load_and_16, GPR32>; - def ATOMIC_LOAD_AND_I32 : Atomic2Ops<atomic_load_and_32, 
GPR32>; - def ATOMIC_LOAD_OR_I8 : Atomic2Ops<atomic_load_or_8, GPR32>; - def ATOMIC_LOAD_OR_I16 : Atomic2Ops<atomic_load_or_16, GPR32>; - def ATOMIC_LOAD_OR_I32 : Atomic2Ops<atomic_load_or_32, GPR32>; - def ATOMIC_LOAD_XOR_I8 : Atomic2Ops<atomic_load_xor_8, GPR32>; - def ATOMIC_LOAD_XOR_I16 : Atomic2Ops<atomic_load_xor_16, GPR32>; - def ATOMIC_LOAD_XOR_I32 : Atomic2Ops<atomic_load_xor_32, GPR32>; - def ATOMIC_LOAD_NAND_I8 : Atomic2Ops<atomic_load_nand_8, GPR32>; - def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, GPR32>; - def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, GPR32>; - - def ATOMIC_SWAP_I8 : Atomic2Ops<atomic_swap_8, GPR32>; - def ATOMIC_SWAP_I16 : Atomic2Ops<atomic_swap_16, GPR32>; - def ATOMIC_SWAP_I32 : Atomic2Ops<atomic_swap_32, GPR32>; - - def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>; - def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>; - def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>; - - def ATOMIC_LOAD_MIN_I8 : Atomic2Ops<atomic_load_min_8, GPR32>; - def ATOMIC_LOAD_MIN_I16 : Atomic2Ops<atomic_load_min_16, GPR32>; - def ATOMIC_LOAD_MIN_I32 : Atomic2Ops<atomic_load_min_32, GPR32>; - def ATOMIC_LOAD_MAX_I8 : Atomic2Ops<atomic_load_max_8, GPR32>; - def ATOMIC_LOAD_MAX_I16 : Atomic2Ops<atomic_load_max_16, GPR32>; - def ATOMIC_LOAD_MAX_I32 : Atomic2Ops<atomic_load_max_32, GPR32>; - def ATOMIC_LOAD_UMIN_I8 : Atomic2Ops<atomic_load_umin_8, GPR32>; - def ATOMIC_LOAD_UMIN_I16 : Atomic2Ops<atomic_load_umin_16, GPR32>; - def ATOMIC_LOAD_UMIN_I32 : Atomic2Ops<atomic_load_umin_32, GPR32>; - def ATOMIC_LOAD_UMAX_I8 : Atomic2Ops<atomic_load_umax_8, GPR32>; - def ATOMIC_LOAD_UMAX_I16 : Atomic2Ops<atomic_load_umax_16, GPR32>; - def ATOMIC_LOAD_UMAX_I32 : Atomic2Ops<atomic_load_umax_32, GPR32>; + def ATOMIC_LOAD_ADD_I8 : Atomic2Ops<atomic_load_add_i8, GPR32>; + def ATOMIC_LOAD_ADD_I16 : Atomic2Ops<atomic_load_add_i16, GPR32>; + def ATOMIC_LOAD_ADD_I32 : Atomic2Ops<atomic_load_add_i32, 
GPR32>; + def ATOMIC_LOAD_SUB_I8 : Atomic2Ops<atomic_load_sub_i8, GPR32>; + def ATOMIC_LOAD_SUB_I16 : Atomic2Ops<atomic_load_sub_i16, GPR32>; + def ATOMIC_LOAD_SUB_I32 : Atomic2Ops<atomic_load_sub_i32, GPR32>; + def ATOMIC_LOAD_AND_I8 : Atomic2Ops<atomic_load_and_i8, GPR32>; + def ATOMIC_LOAD_AND_I16 : Atomic2Ops<atomic_load_and_i16, GPR32>; + def ATOMIC_LOAD_AND_I32 : Atomic2Ops<atomic_load_and_i32, GPR32>; + def ATOMIC_LOAD_OR_I8 : Atomic2Ops<atomic_load_or_i8, GPR32>; + def ATOMIC_LOAD_OR_I16 : Atomic2Ops<atomic_load_or_i16, GPR32>; + def ATOMIC_LOAD_OR_I32 : Atomic2Ops<atomic_load_or_i32, GPR32>; + def ATOMIC_LOAD_XOR_I8 : Atomic2Ops<atomic_load_xor_i8, GPR32>; + def ATOMIC_LOAD_XOR_I16 : Atomic2Ops<atomic_load_xor_i16, GPR32>; + def ATOMIC_LOAD_XOR_I32 : Atomic2Ops<atomic_load_xor_i32, GPR32>; + def ATOMIC_LOAD_NAND_I8 : Atomic2Ops<atomic_load_nand_i8, GPR32>; + def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_i16, GPR32>; + def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_i32, GPR32>; + + def ATOMIC_SWAP_I8 : Atomic2Ops<atomic_swap_i8, GPR32>; + def ATOMIC_SWAP_I16 : Atomic2Ops<atomic_swap_i16, GPR32>; + def ATOMIC_SWAP_I32 : Atomic2Ops<atomic_swap_i32, GPR32>; + + def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_i8, GPR32>; + def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_i16, GPR32>; + def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_i32, GPR32>; + + def ATOMIC_LOAD_MIN_I8 : Atomic2Ops<atomic_load_min_i8, GPR32>; + def ATOMIC_LOAD_MIN_I16 : Atomic2Ops<atomic_load_min_i16, GPR32>; + def ATOMIC_LOAD_MIN_I32 : Atomic2Ops<atomic_load_min_i32, GPR32>; + def ATOMIC_LOAD_MAX_I8 : Atomic2Ops<atomic_load_max_i8, GPR32>; + def ATOMIC_LOAD_MAX_I16 : Atomic2Ops<atomic_load_max_i16, GPR32>; + def ATOMIC_LOAD_MAX_I32 : Atomic2Ops<atomic_load_max_i32, GPR32>; + def ATOMIC_LOAD_UMIN_I8 : Atomic2Ops<atomic_load_umin_i8, GPR32>; + def ATOMIC_LOAD_UMIN_I16 : Atomic2Ops<atomic_load_umin_i16, GPR32>; + def ATOMIC_LOAD_UMIN_I32 : 
Atomic2Ops<atomic_load_umin_i32, GPR32>; + def ATOMIC_LOAD_UMAX_I8 : Atomic2Ops<atomic_load_umax_i8, GPR32>; + def ATOMIC_LOAD_UMAX_I16 : Atomic2Ops<atomic_load_umax_i16, GPR32>; + def ATOMIC_LOAD_UMAX_I32 : Atomic2Ops<atomic_load_umax_i32, GPR32>; } def ATOMIC_LOAD_ADD_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>; diff --git a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp index 902c7ceb869a..7ceb97642bba 100644 --- a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -83,7 +83,7 @@ public: bool runOnMachineFunction(MachineFunction &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -197,7 +197,8 @@ bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) { return false; // Do a pre-order traversal of the dominator tree. - MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *MDT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); bool Changed = false; SmallVector<MBBInfo, 8> WorkList(1, MBBInfo(MDT->getRootNode())); diff --git a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp index 0578655f0443..bd8a065011c9 100644 --- a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp @@ -110,8 +110,8 @@ void MipsPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); if (!IsOptNone) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -139,7 +139,8 @@ bool 
MipsPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + IsOptNone ? nullptr + : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); MipsPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr, diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp index 1aff608cfffb..fc207b1a8871 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp @@ -46,8 +46,7 @@ static bool isDwarfSection(const MCObjectFileInfo *FI, const MCSection *Section) { // FIXME: the checks for the DWARF sections are very fragile and should be // fixed up in a followup patch. - if (!Section || Section->getKind().isText() || - Section->getKind().isWriteable()) + if (!Section || Section->isText()) return false; return Section == FI->getDwarfAbbrevSection() || Section == FI->getDwarfInfoSection() || @@ -84,8 +83,7 @@ static bool isDwarfSection(const MCObjectFileInfo *FI, } void NVPTXTargetStreamer::changeSection(const MCSection *CurSection, - MCSection *Section, - const MCExpr *SubSection, + MCSection *Section, uint32_t SubSection, raw_ostream &OS) { assert(!SubSection && "SubSection is not null!"); const MCObjectFileInfo *FI = getStreamer().getContext().getObjectFileInfo(); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h index b0d8978ee685..ca0d84ee2079 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h @@ -43,7 +43,7 @@ public: /// functions. 
void emitDwarfFileDirective(StringRef Directive) override; void changeSection(const MCSection *CurSection, MCSection *Section, - const MCExpr *SubSection, raw_ostream &OS) override; + uint32_t SubSection, raw_ostream &OS) override; /// Emit the bytes in \p Data into the output. /// /// This is used to emit bytes in \p Data as sequence of .byte directives. diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 82770f866085..ca077d41d36b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1845,6 +1845,10 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, auto AddIntToBuffer = [AggBuffer, Bytes](const APInt &Val) { size_t NumBytes = (Val.getBitWidth() + 7) / 8; SmallVector<unsigned char, 16> Buf(NumBytes); + // `extractBitsAsZExtValue` does not allow the extraction of bits beyond the + // input's bit width, and i1 arrays may not have a length that is a multuple + // of 8. We handle the last byte separately, so we never request out of + // bounds bits. for (unsigned I = 0; I < NumBytes - 1; ++I) { Buf[I] = Val.extractBitsAsZExtValue(8, I * 8); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index f4ef7c9914f1..8cb83d8322b8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3232,9 +3232,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (NumElts != 1) return std::nullopt; Align PartAlign = - (Offsets[parti] == 0 && PAL.getParamAlignment(i)) - ? PAL.getParamAlignment(i).value() - : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); + DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); return commonAlignment(PartAlign, Offsets[parti]); }(); SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, @@ -5038,7 +5036,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( /// ensures that alignment is 16 or greater. 
Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( const Function *F, Type *ArgTy, const DataLayout &DL) const { - const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); + // Capping the alignment to 128 bytes as that is the maximum alignment + // supported by PTX. + const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy)); // If a function has linkage different from internal or private, we // must use default ABI alignment as external users rely on it. Same @@ -5048,10 +5048,10 @@ Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( /*IgnoreCallbackUses=*/false, /*IgnoreAssumeLikeCalls=*/true, /*IgnoreLLVMUsed=*/true)) - return Align(ABITypeAlign); + return ABITypeAlign; assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); - return Align(std::max(uint64_t(16), ABITypeAlign)); + return std::max(Align(16), ABITypeAlign); } /// Helper for computing alignment of a device function byval parameter. @@ -5215,103 +5215,131 @@ bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); } +static bool isConstZero(const SDValue &Operand) { + const auto *Const = dyn_cast<ConstantSDNode>(Operand); + return Const && Const->getZExtValue() == 0; +} + /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with /// operands N0 and N1. This is a helper for PerformADDCombine that is /// called with the default operands, and if that fails, with commuted /// operands. 
-static SDValue PerformADDCombineWithOperands( - SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, - const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) { - SelectionDAG &DAG = DCI.DAG; - // Skip non-integer, non-scalar case - EVT VT=N0.getValueType(); - if (VT.isVector()) +static SDValue +PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N0.getValueType(); + + // Since integer multiply-add costs the same as integer multiply + // but is more costly than integer add, do the fusion only when + // the mul is only used in the add. + // TODO: this may not be true for later architectures, consider relaxing this + if (!N0.getNode()->hasOneUse()) return SDValue(); // fold (add (mul a, b), c) -> (mad a, b, c) // - if (N0.getOpcode() == ISD::MUL) { - assert (VT.isInteger()); - // For integer: - // Since integer multiply-add costs the same as integer multiply - // but is more costly than integer add, do the fusion only when - // the mul is only used in the add. - if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 || - !N0.getNode()->hasOneUse()) + if (N0.getOpcode() == ISD::MUL) + return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0), + N0.getOperand(1), N1); + + // fold (add (select cond, 0, (mul a, b)), c) + // -> (select cond, c, (mad a, b, c)) + // + if (N0.getOpcode() == ISD::SELECT) { + unsigned ZeroOpNum; + if (isConstZero(N0->getOperand(1))) + ZeroOpNum = 1; + else if (isConstZero(N0->getOperand(2))) + ZeroOpNum = 2; + else + return SDValue(); + + SDValue M = N0->getOperand((ZeroOpNum == 1) ? 
2 : 1); + if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse()) return SDValue(); - // Do the folding - return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, - N0.getOperand(0), N0.getOperand(1), N1); + SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, + M->getOperand(0), M->getOperand(1), N1); + return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0), + ((ZeroOpNum == 1) ? N1 : MAD), + ((ZeroOpNum == 1) ? MAD : N1)); } - else if (N0.getOpcode() == ISD::FMUL) { - if (VT == MVT::f32 || VT == MVT::f64) { - const auto *TLI = static_cast<const NVPTXTargetLowering *>( - &DAG.getTargetLoweringInfo()); - if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel)) - return SDValue(); - // For floating point: - // Do the fusion only when the mul has less than 5 uses and all - // are add. - // The heuristic is that if a use is not an add, then that use - // cannot be fused into fma, therefore mul is still needed anyway. - // If there are more than 4 uses, even if they are all add, fusing - // them will increase register pressue. - // - int numUses = 0; - int nonAddCount = 0; - for (const SDNode *User : N0.getNode()->uses()) { - numUses++; - if (User->getOpcode() != ISD::FADD) - ++nonAddCount; - } + return SDValue(); +} + +static SDValue +PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + EVT VT = N0.getValueType(); + if (N0.getOpcode() == ISD::FMUL) { + const auto *TLI = static_cast<const NVPTXTargetLowering *>( + &DCI.DAG.getTargetLoweringInfo()); + if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel)) + return SDValue(); + + // For floating point: + // Do the fusion only when the mul has less than 5 uses and all + // are add. + // The heuristic is that if a use is not an add, then that use + // cannot be fused into fma, therefore mul is still needed anyway. + // If there are more than 4 uses, even if they are all add, fusing + // them will increase register pressue. 
+ // + int numUses = 0; + int nonAddCount = 0; + for (const SDNode *User : N0.getNode()->uses()) { + numUses++; + if (User->getOpcode() != ISD::FADD) + ++nonAddCount; if (numUses >= 5) return SDValue(); - if (nonAddCount) { - int orderNo = N->getIROrder(); - int orderNo2 = N0.getNode()->getIROrder(); - // simple heuristics here for considering potential register - // pressure, the logics here is that the differnce are used - // to measure the distance between def and use, the longer distance - // more likely cause register pressure. - if (orderNo - orderNo2 < 500) - return SDValue(); - - // Now, check if at least one of the FMUL's operands is live beyond the node N, - // which guarantees that the FMA will not increase register pressure at node N. - bool opIsLive = false; - const SDNode *left = N0.getOperand(0).getNode(); - const SDNode *right = N0.getOperand(1).getNode(); - - if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) - opIsLive = true; - - if (!opIsLive) - for (const SDNode *User : left->uses()) { - int orderNo3 = User->getIROrder(); - if (orderNo3 > orderNo) { - opIsLive = true; - break; - } - } + } + if (nonAddCount) { + int orderNo = N->getIROrder(); + int orderNo2 = N0.getNode()->getIROrder(); + // simple heuristics here for considering potential register + // pressure, the logics here is that the differnce are used + // to measure the distance between def and use, the longer distance + // more likely cause register pressure. + if (orderNo - orderNo2 < 500) + return SDValue(); - if (!opIsLive) - for (const SDNode *User : right->uses()) { - int orderNo3 = User->getIROrder(); - if (orderNo3 > orderNo) { - opIsLive = true; - break; - } + // Now, check if at least one of the FMUL's operands is live beyond the + // node N, which guarantees that the FMA will not increase register + // pressure at node N. 
+ bool opIsLive = false; + const SDNode *left = N0.getOperand(0).getNode(); + const SDNode *right = N0.getOperand(1).getNode(); + + if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) + opIsLive = true; + + if (!opIsLive) + for (const SDNode *User : left->uses()) { + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; } + } - if (!opIsLive) - return SDValue(); - } + if (!opIsLive) + for (const SDNode *User : right->uses()) { + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } - return DAG.getNode(ISD::FMA, SDLoc(N), VT, - N0.getOperand(0), N0.getOperand(1), N1); + if (!opIsLive) + return SDValue(); } + + return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0), + N0.getOperand(1), N1); } return SDValue(); @@ -5332,18 +5360,44 @@ static SDValue PerformStoreRetvalCombine(SDNode *N) { /// static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const NVPTXSubtarget &Subtarget, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Skip non-integer, non-scalar case + EVT VT = N0.getValueType(); + if (VT.isVector() || VT != MVT::i32) + return SDValue(); + + // First try with the default operand order. + if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI)) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI); +} + +/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD. +/// +static SDValue PerformFADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64)) + return SDValue(); + // First try with the default operand order. 
- if (SDValue Result = - PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) + if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel)) return Result; // If that didn't work, try again with the operands commuted. - return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); + return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel); } static SDValue PerformANDCombine(SDNode *N, @@ -5876,8 +5930,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::ADD: + return PerformADDCombine(N, DCI, OptLevel); case ISD::FADD: - return PerformADDCombine(N, DCI, STI, OptLevel); + return PerformFADDCombine(N, DCI, OptLevel); case ISD::MUL: return PerformMULCombine(N, DCI, OptLevel); case ISD::SHL: diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 440af085cb8e..c81dfa68e4bd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -265,7 +265,7 @@ multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntO // activemask.b32 def ACTIVEMASK : NVPTXInst<(outs Int32Regs:$dest), (ins), - "activemask.b32 \t$dest;", + "activemask.b32 \t$dest;", [(set Int32Regs:$dest, (int_nvvm_activemask))]>, Requires<[hasPTX<62>, hasSM<30>]>; @@ -1618,18 +1618,18 @@ multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr, s // atom_add -def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_add_32 node:$a, node:$b)>; -def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_add_32 node:$a, node:$b)>; -def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_add_32 node:$a, node:$b)>; -def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_add_64 node:$a, node:$b)>; -def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - 
(atomic_load_add_64 node:$a, node:$b)>; -def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_add_64 node:$a, node:$b)>; +def atomic_load_add_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_add_i32 node:$a, node:$b)>; +def atomic_load_add_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_add_i32 node:$a, node:$b)>; +def atomic_load_add_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_add_i32 node:$a, node:$b)>; +def atomic_load_add_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_add_i64 node:$a, node:$b)>; +def atomic_load_add_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_add_i64 node:$a, node:$b)>; +def atomic_load_add_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_add_i64 node:$a, node:$b)>; def atomic_load_add_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), (atomic_load_fadd node:$a, node:$b)>; def atomic_load_add_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), @@ -1638,22 +1638,22 @@ def atomic_load_add_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_fadd node:$a, node:$b)>; defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", ".add", - atomic_load_add_32_g, i32imm, imm>; + atomic_load_add_i32_g, i32imm, imm>; defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", ".add", - atomic_load_add_32_s, i32imm, imm>; + atomic_load_add_i32_s, i32imm, imm>; defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".add", - atomic_load_add_32_gen, i32imm, imm>; + atomic_load_add_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", - ".add", atomic_load_add_32_gen, i32imm, imm>; + ".add", atomic_load_add_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", ".add", - atomic_load_add_64_g, i64imm, imm>; + atomic_load_add_i64_g, i64imm, imm>; defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<i64, 
Int64Regs, ".shared", ".u64", ".add", - atomic_load_add_64_s, i64imm, imm>; + atomic_load_add_i64_s, i64imm, imm>; defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".add", - atomic_load_add_64_gen, i64imm, imm>; + atomic_load_add_i64_gen, i64imm, imm>; defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", - ".add", atomic_load_add_64_gen, i64imm, imm>; + ".add", atomic_load_add_i64_gen, i64imm, imm>; defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2<f16, Int16Regs, ".global", ".f16", ".add.noftz", atomic_load_add_g, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>; @@ -1685,187 +1685,187 @@ defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<f64, Float64Regs, "", ".f64", ".add", // atom_sub -def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_sub_32 node:$a, node:$b)>; -def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_sub_32 node:$a, node:$b)>; -def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_sub_32 node:$a, node:$b)>; -def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_sub_64 node:$a, node:$b)>; -def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_sub_64 node:$a, node:$b)>; -def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_sub_64 node:$a, node:$b)>; +def atomic_load_sub_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_sub_i32 node:$a, node:$b)>; +def atomic_load_sub_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_sub_i32 node:$a, node:$b)>; +def atomic_load_sub_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_sub_i32 node:$a, node:$b)>; +def atomic_load_sub_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_sub_i64 node:$a, node:$b)>; +def atomic_load_sub_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_sub_i64 node:$a, node:$b)>; +def atomic_load_sub_i64_gen: 
ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_sub_i64 node:$a, node:$b)>; defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<i32, Int32Regs, ".global", "32", ".add", - atomic_load_sub_32_g>; + atomic_load_sub_i32_g>; defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<i64, Int64Regs, ".global", "64", ".add", - atomic_load_sub_64_g>; + atomic_load_sub_i64_g>; defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<i32, Int32Regs, "", "32", ".add", - atomic_load_sub_32_gen>; + atomic_load_sub_i32_gen>; defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<i32, Int32Regs, ".global", "32", - ".add", atomic_load_sub_32_gen>; + ".add", atomic_load_sub_i32_gen>; defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<i32, Int32Regs, ".shared", "32", ".add", - atomic_load_sub_32_s>; + atomic_load_sub_i32_s>; defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<i64, Int64Regs, ".shared", "64", ".add", - atomic_load_sub_64_s>; + atomic_load_sub_i64_s>; defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<i64, Int64Regs, "", "64", ".add", - atomic_load_sub_64_gen>; + atomic_load_sub_i64_gen>; defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<i64, Int64Regs, ".global", "64", - ".add", atomic_load_sub_64_gen>; + ".add", atomic_load_sub_i64_gen>; // atom_swap -def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_swap_32 node:$a, node:$b)>; -def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_swap_32 node:$a, node:$b)>; -def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_swap_32 node:$a, node:$b)>; -def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_swap_64 node:$a, node:$b)>; -def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_swap_64 node:$a, node:$b)>; -def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_swap_64 node:$a, node:$b)>; +def atomic_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_swap_i32 node:$a, node:$b)>; +def atomic_swap_i32_s: 
ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_swap_i32 node:$a, node:$b)>; +def atomic_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_swap_i32 node:$a, node:$b)>; +def atomic_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_swap_i64 node:$a, node:$b)>; +def atomic_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_swap_i64 node:$a, node:$b)>; +def atomic_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_swap_i64 node:$a, node:$b)>; defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".exch", - atomic_swap_32_g, i32imm, imm>; + atomic_swap_i32_g, i32imm, imm>; defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".exch", - atomic_swap_32_s, i32imm, imm>; + atomic_swap_i32_s, i32imm, imm>; defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".exch", - atomic_swap_32_gen, i32imm, imm>; + atomic_swap_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", - ".exch", atomic_swap_32_gen, i32imm, imm>; + ".exch", atomic_swap_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".exch", - atomic_swap_64_g, i64imm, imm>; + atomic_swap_i64_g, i64imm, imm>; defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".exch", - atomic_swap_64_s, i64imm, imm>; + atomic_swap_i64_s, i64imm, imm>; defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".exch", - atomic_swap_64_gen, i64imm, imm>; + atomic_swap_i64_gen, i64imm, imm>; defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", - ".exch", atomic_swap_64_gen, i64imm, imm>; + ".exch", atomic_swap_i64_gen, i64imm, imm>; // atom_max -def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) - , (atomic_load_max_32 node:$a, node:$b)>; -def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_max_32 
node:$a, node:$b)>; -def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_max_32 node:$a, node:$b)>; -def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) - , (atomic_load_max_64 node:$a, node:$b)>; -def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_max_64 node:$a, node:$b)>; -def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_max_64 node:$a, node:$b)>; -def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umax_32 node:$a, node:$b)>; -def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umax_32 node:$a, node:$b)>; -def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umax_32 node:$a, node:$b)>; -def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umax_64 node:$a, node:$b)>; -def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umax_64 node:$a, node:$b)>; -def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_max_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) + , (atomic_load_max_i32 node:$a, node:$b)>; +def atomic_load_max_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_max_i32 node:$a, node:$b)>; +def atomic_load_max_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_max_i32 node:$a, node:$b)>; +def atomic_load_max_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) + , (atomic_load_max_i64 node:$a, node:$b)>; +def atomic_load_max_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_max_i64 node:$a, node:$b)>; +def atomic_load_max_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_max_i64 node:$a, node:$b)>; +def atomic_load_umax_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umax_i32 node:$a, node:$b)>; +def atomic_load_umax_i32_s: 
ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umax_i32 node:$a, node:$b)>; +def atomic_load_umax_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umax_i32 node:$a, node:$b)>; +def atomic_load_umax_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umax_i64 node:$a, node:$b)>; +def atomic_load_umax_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umax_i64 node:$a, node:$b)>; +def atomic_load_umax_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umax_i64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".s32", - ".max", atomic_load_max_32_g, i32imm, imm>; + ".max", atomic_load_max_i32_g, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".s32", - ".max", atomic_load_max_32_s, i32imm, imm>; + ".max", atomic_load_max_i32_s, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".s32", ".max", - atomic_load_max_32_gen, i32imm, imm>; + atomic_load_max_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", - ".s32", ".max", atomic_load_max_32_gen, i32imm, imm>; + ".s32", ".max", atomic_load_max_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".s64", - ".max", atomic_load_max_64_g, i64imm, imm, [hasSM<32>]>; + ".max", atomic_load_max_i64_g, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".s64", - ".max", atomic_load_max_64_s, i64imm, imm, [hasSM<32>]>; + ".max", atomic_load_max_i64_s, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".s64", ".max", - atomic_load_max_64_gen, i64imm, imm, [hasSM<32>]>; + atomic_load_max_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", - ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, [hasSM<32>]>; + 
".s64", ".max", atomic_load_max_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", - ".max", atomic_load_umax_32_g, i32imm, imm>; + ".max", atomic_load_umax_i32_g, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", - ".max", atomic_load_umax_32_s, i32imm, imm>; + ".max", atomic_load_umax_i32_s, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".max", - atomic_load_umax_32_gen, i32imm, imm>; + atomic_load_umax_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", - ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm>; + ".u32", ".max", atomic_load_umax_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", - ".max", atomic_load_umax_64_g, i64imm, imm, [hasSM<32>]>; + ".max", atomic_load_umax_i64_g, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64", - ".max", atomic_load_umax_64_s, i64imm, imm, [hasSM<32>]>; + ".max", atomic_load_umax_i64_s, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".max", - atomic_load_umax_64_gen, i64imm, imm, [hasSM<32>]>; + atomic_load_umax_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", - ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, [hasSM<32>]>; + ".u64", ".max", atomic_load_umax_i64_gen, i64imm, imm, [hasSM<32>]>; // atom_min -def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_min_32 node:$a, node:$b)>; -def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_min_32 node:$a, node:$b)>; -def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_min_32 node:$a, node:$b)>; -def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops 
node:$a, node:$b), - (atomic_load_min_64 node:$a, node:$b)>; -def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_min_64 node:$a, node:$b)>; -def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_min_64 node:$a, node:$b)>; -def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umin_32 node:$a, node:$b)>; -def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umin_32 node:$a, node:$b)>; -def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umin_32 node:$a, node:$b)>; -def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umin_64 node:$a, node:$b)>; -def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umin_64 node:$a, node:$b)>; -def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_min_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_min_i32 node:$a, node:$b)>; +def atomic_load_min_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_min_i32 node:$a, node:$b)>; +def atomic_load_min_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_min_i32 node:$a, node:$b)>; +def atomic_load_min_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_min_i64 node:$a, node:$b)>; +def atomic_load_min_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_min_i64 node:$a, node:$b)>; +def atomic_load_min_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_min_i64 node:$a, node:$b)>; +def atomic_load_umin_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umin_i32 node:$a, node:$b)>; +def atomic_load_umin_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umin_i32 node:$a, node:$b)>; +def atomic_load_umin_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umin_i32 node:$a, 
node:$b)>; +def atomic_load_umin_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umin_i64 node:$a, node:$b)>; +def atomic_load_umin_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umin_i64 node:$a, node:$b)>; +def atomic_load_umin_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umin_i64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".s32", - ".min", atomic_load_min_32_g, i32imm, imm>; + ".min", atomic_load_min_i32_g, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".s32", - ".min", atomic_load_min_32_s, i32imm, imm>; + ".min", atomic_load_min_i32_s, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".s32", ".min", - atomic_load_min_32_gen, i32imm, imm>; + atomic_load_min_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", - ".s32", ".min", atomic_load_min_32_gen, i32imm, imm>; + ".s32", ".min", atomic_load_min_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".s64", - ".min", atomic_load_min_64_g, i64imm, imm, [hasSM<32>]>; + ".min", atomic_load_min_i64_g, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".s64", - ".min", atomic_load_min_64_s, i64imm, imm, [hasSM<32>]>; + ".min", atomic_load_min_i64_s, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".s64", ".min", - atomic_load_min_64_gen, i64imm, imm, [hasSM<32>]>; + atomic_load_min_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", - ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, [hasSM<32>]>; + ".s64", ".min", atomic_load_min_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", - ".min", atomic_load_umin_32_g, i32imm, imm>; + 
".min", atomic_load_umin_i32_g, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", - ".min", atomic_load_umin_32_s, i32imm, imm>; + ".min", atomic_load_umin_i32_s, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".min", - atomic_load_umin_32_gen, i32imm, imm>; + atomic_load_umin_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", - ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm>; + ".u32", ".min", atomic_load_umin_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", - ".min", atomic_load_umin_64_g, i64imm, imm, [hasSM<32>]>; + ".min", atomic_load_umin_i64_g, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64", - ".min", atomic_load_umin_64_s, i64imm, imm, [hasSM<32>]>; + ".min", atomic_load_umin_i64_s, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".min", - atomic_load_umin_64_gen, i64imm, imm, [hasSM<32>]>; + atomic_load_umin_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", - ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, [hasSM<32>]>; + ".u64", ".min", atomic_load_umin_i64_gen, i64imm, imm, [hasSM<32>]>; // atom_inc atom_dec @@ -1901,131 +1901,131 @@ defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32 // atom_and -def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_and_32 node:$a, node:$b)>; -def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_and_32 node:$a, node:$b)>; -def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_and_32 node:$a, node:$b)>; -def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_and_64 node:$a, node:$b)>; -def 
atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_and_64 node:$a, node:$b)>; -def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_and_64 node:$a, node:$b)>; +def atomic_load_and_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_and_i32 node:$a, node:$b)>; +def atomic_load_and_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_and_i32 node:$a, node:$b)>; +def atomic_load_and_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_and_i32 node:$a, node:$b)>; +def atomic_load_and_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_and_i64 node:$a, node:$b)>; +def atomic_load_and_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_and_i64 node:$a, node:$b)>; +def atomic_load_and_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_and_i64 node:$a, node:$b)>; defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".and", - atomic_load_and_32_g, i32imm, imm>; + atomic_load_and_i32_g, i32imm, imm>; defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".and", - atomic_load_and_32_s, i32imm, imm>; + atomic_load_and_i32_s, i32imm, imm>; defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".and", - atomic_load_and_32_gen, i32imm, imm>; + atomic_load_and_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", - ".and", atomic_load_and_32_gen, i32imm, imm>; + ".and", atomic_load_and_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".and", - atomic_load_and_64_g, i64imm, imm, [hasSM<32>]>; + atomic_load_and_i64_g, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".and", - atomic_load_and_64_s, i64imm, imm, [hasSM<32>]>; + atomic_load_and_i64_s, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", 
".b64", ".and", - atomic_load_and_64_gen, i64imm, imm, [hasSM<32>]>; + atomic_load_and_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", - ".and", atomic_load_and_64_gen, i64imm, imm, [hasSM<32>]>; + ".and", atomic_load_and_i64_gen, i64imm, imm, [hasSM<32>]>; // atom_or -def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_or_32 node:$a, node:$b)>; -def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_or_32 node:$a, node:$b)>; -def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_or_32 node:$a, node:$b)>; -def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_or_64 node:$a, node:$b)>; -def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_or_64 node:$a, node:$b)>; -def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_or_i32 node:$a, node:$b)>; +def atomic_load_or_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_or_i32 node:$a, node:$b)>; +def atomic_load_or_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_or_i32 node:$a, node:$b)>; +def atomic_load_or_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_or_i64 node:$a, node:$b)>; +def atomic_load_or_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_or_i64 node:$a, node:$b)>; +def atomic_load_or_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_or_i64 node:$a, node:$b)>; defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".or", - atomic_load_or_32_g, i32imm, imm>; + atomic_load_or_i32_g, i32imm, imm>; defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".or", - atomic_load_or_32_gen, i32imm, imm>; + atomic_load_or_i32_gen, i32imm, imm>; defm 
INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", - ".or", atomic_load_or_32_gen, i32imm, imm>; + ".or", atomic_load_or_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".or", - atomic_load_or_32_s, i32imm, imm>; + atomic_load_or_i32_s, i32imm, imm>; defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".or", - atomic_load_or_64_g, i64imm, imm, [hasSM<32>]>; + atomic_load_or_i64_g, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".or", - atomic_load_or_64_gen, i64imm, imm, [hasSM<32>]>; + atomic_load_or_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", - ".or", atomic_load_or_64_gen, i64imm, imm, [hasSM<32>]>; + ".or", atomic_load_or_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".or", - atomic_load_or_64_s, i64imm, imm, [hasSM<32>]>; + atomic_load_or_i64_s, i64imm, imm, [hasSM<32>]>; // atom_xor -def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_xor_32 node:$a, node:$b)>; -def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_xor_32 node:$a, node:$b)>; -def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_xor_32 node:$a, node:$b)>; -def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_xor_64 node:$a, node:$b)>; -def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_xor_64 node:$a, node:$b)>; -def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_xor_i32 node:$a, node:$b)>; +def atomic_load_xor_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_xor_i32 node:$a, node:$b)>; +def 
atomic_load_xor_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_xor_i32 node:$a, node:$b)>; +def atomic_load_xor_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_xor_i64 node:$a, node:$b)>; +def atomic_load_xor_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_xor_i64 node:$a, node:$b)>; +def atomic_load_xor_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_xor_i64 node:$a, node:$b)>; defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".xor", - atomic_load_xor_32_g, i32imm, imm>; + atomic_load_xor_i32_g, i32imm, imm>; defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".xor", - atomic_load_xor_32_s, i32imm, imm>; + atomic_load_xor_i32_s, i32imm, imm>; defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".xor", - atomic_load_xor_32_gen, i32imm, imm>; + atomic_load_xor_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", - ".xor", atomic_load_xor_32_gen, i32imm, imm>; + ".xor", atomic_load_xor_i32_gen, i32imm, imm>; defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".xor", - atomic_load_xor_64_g, i64imm, imm, [hasSM<32>]>; + atomic_load_xor_i64_g, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".xor", - atomic_load_xor_64_s, i64imm, imm, [hasSM<32>]>; + atomic_load_xor_i64_s, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor", - atomic_load_xor_64_gen, i64imm, imm, [hasSM<32>]>; + atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>; defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", - ".xor", atomic_load_xor_64_gen, i64imm, imm, [hasSM<32>]>; + ".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>; // atom_cas -def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_32 node:$a, 
node:$b, node:$c)>; -def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; -def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), - (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>; defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas", - atomic_cmp_swap_32_g, i32imm>; + atomic_cmp_swap_i32_g, i32imm>; defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas", - atomic_cmp_swap_32_s, i32imm>; + atomic_cmp_swap_i32_s, i32imm>; defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas", - atomic_cmp_swap_32_gen, i32imm>; + atomic_cmp_swap_i32_gen, i32imm>; defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", - 
".cas", atomic_cmp_swap_32_gen, i32imm>; + ".cas", atomic_cmp_swap_i32_gen, i32imm>; defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas", - atomic_cmp_swap_64_g, i64imm>; + atomic_cmp_swap_i64_g, i64imm>; defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas", - atomic_cmp_swap_64_s, i64imm>; + atomic_cmp_swap_i64_s, i64imm>; defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas", - atomic_cmp_swap_64_gen, i64imm>; + atomic_cmp_swap_i64_gen, i64imm>; defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", - ".cas", atomic_cmp_swap_64_gen, i64imm>; + ".cas", atomic_cmp_swap_i64_gen, i64imm>; // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} @@ -2475,6 +2475,7 @@ defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen, useShortPtrLocal> defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen, useShortPtrShared>; defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen, False>; defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen, useShortPtrConst>; +defm cvta_param : NG_TO_G<"param", int_nvvm_ptr_param_to_gen, False>; defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local, useShortPtrLocal>; defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared, useShortPtrShared>; @@ -6724,6 +6725,7 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB, # FragC.regstring # ";"; } +let isConvergent = true in { defset list<WMMA_INSTR> WMMAs = { foreach layout_a = ["row", "col"] in { foreach layout_b = ["row", "col"] in { @@ -6745,6 +6747,7 @@ defset list<WMMA_INSTR> WMMAs = { } // layout_b } // layout_a } // defset +} // MMA class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB, @@ -6774,6 +6777,7 @@ class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB, # FragC.regstring # ";"; } +let isConvergent = true in { defset list<WMMA_INSTR> MMAs = { foreach layout_a = ["row", "col"] in { foreach 
layout_b = ["row", "col"] in { @@ -6793,6 +6797,7 @@ defset list<WMMA_INSTR> MMAs = { } // layout_b } // layout_a } // defset +} // // ldmatrix.sync.aligned.m8n8[|.trans][|.shared].b16 diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index cde02c25c483..e63c7a61c6f2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -95,7 +95,9 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" @@ -336,8 +338,9 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, while (!ValuesToCheck.empty()) { Value *V = ValuesToCheck.pop_back_val(); if (!IsALoadChainInstr(V)) { - LLVM_DEBUG(dbgs() << "Need a copy of " << *Arg << " because of " << *V - << "\n"); + LLVM_DEBUG(dbgs() << "Need a " + << (isParamGridConstant(*Arg) ? "cast " : "copy ") + << "of " << *Arg << " because of " << *V << "\n"); (void)Arg; return false; } @@ -366,27 +369,59 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, return; } - // Otherwise we have to create a temporary copy. const DataLayout &DL = Func->getParent()->getDataLayout(); unsigned AS = DL.getAllocaAddrSpace(); - AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst); - // Set the alignment to alignment of the byval parameter. This is because, - // later load/stores assume that alignment, and we are going to replace - // the use of the byval parameter with this alloca instruction. 
- AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo()) - .value_or(DL.getPrefTypeAlign(StructType))); - Arg->replaceAllUsesWith(AllocA); - - Value *ArgInParam = new AddrSpaceCastInst( - Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(), - FirstInst); - // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX - // addrspacecast preserves alignment. Since params are constant, this load is - // definitely not volatile. - LoadInst *LI = - new LoadInst(StructType, ArgInParam, Arg->getName(), - /*isVolatile=*/false, AllocA->getAlign(), FirstInst); - new StoreInst(LI, AllocA, FirstInst); + if (isParamGridConstant(*Arg)) { + // Writes to a grid constant are undefined behaviour. We do not need a + // temporary copy. When a pointer might have escaped, conservatively replace + // all of its uses (which might include a device function call) with a cast + // to the generic address space. + // TODO: only cast byval grid constant parameters at use points that need + // generic address (e.g., merging parameter pointers with other address + // space, or escaping to call-sites, inline-asm, memory), and use the + // parameter address space for normal loads. + IRBuilder<> IRB(&Func->getEntryBlock().front()); + + // Cast argument to param address space + auto *CastToParam = + cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast( + Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param")); + + // Cast param address to generic address space. We do not use an + // addrspacecast to generic here, because, LLVM considers `Arg` to be in the + // generic address space, and a `generic -> param` cast followed by a `param + // -> generic` cast will be folded away. The `param -> generic` intrinsic + // will be correctly lowered to `cvta.param`. 
+ Value *CvtToGenCall = IRB.CreateIntrinsic( + IRB.getPtrTy(ADDRESS_SPACE_GENERIC), Intrinsic::nvvm_ptr_param_to_gen, + CastToParam, nullptr, CastToParam->getName() + ".gen"); + + Arg->replaceAllUsesWith(CvtToGenCall); + + // Do not replace Arg in the cast to param space + CastToParam->setOperand(0, Arg); + } else { + // Otherwise we have to create a temporary copy. + AllocaInst *AllocA = + new AllocaInst(StructType, AS, Arg->getName(), FirstInst); + // Set the alignment to alignment of the byval parameter. This is because, + // later load/stores assume that alignment, and we are going to replace + // the use of the byval parameter with this alloca instruction. + AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo()) + .value_or(DL.getPrefTypeAlign(StructType))); + Arg->replaceAllUsesWith(AllocA); + + Value *ArgInParam = new AddrSpaceCastInst( + Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM), + Arg->getName(), FirstInst); + // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX + // addrspacecast preserves alignment. Since params are constant, this load + // is definitely not volatile. 
+ LoadInst *LI = + new LoadInst(StructType, ArgInParam, Arg->getName(), + /*isVolatile=*/false, AllocA->getAlign(), FirstInst); + new StoreInst(LI, AllocA, FirstInst); + } } void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index b60a1d747af7..152f200b9d0f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -224,8 +224,7 @@ void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { AAM.registerFunctionAnalysis<NVPTXAA>(); } -void NVPTXTargetMachine::registerPassBuilderCallbacks( - PassBuilder &PB, bool PopulateClassToPassNames) { +void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #define GET_PASS_REGISTRY "NVPTXPassRegistry.def" #include "llvm/Passes/TargetPassRegistry.inc" diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h index 870ea20c26f3..2b88da67a50f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -66,8 +66,7 @@ public: void registerDefaultAliasAnalyses(AAManager &AAM) override; - void registerPassBuilderCallbacks(PassBuilder &PB, - bool PopulateClassToPassNames) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; TargetTransformInfo getTargetTransformInfo(const Function &F) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 3a536db1c972..e4b2ec868519 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -52,29 +52,46 @@ void clearAnnotationCache(const Module *Mod) { AC.Cache.erase(Mod); } -static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { +static void readIntVecFromMDNode(const MDNode *MetadataNode, + std::vector<unsigned> &Vec) { + for (unsigned i = 0, e = 
MetadataNode->getNumOperands(); i != e; ++i) { + ConstantInt *Val = + mdconst::extract<ConstantInt>(MetadataNode->getOperand(i)); + Vec.push_back(Val->getZExtValue()); + } +} + +static void cacheAnnotationFromMD(const MDNode *MetadataNode, + key_val_pair_t &retval) { auto &AC = getAnnotationCache(); std::lock_guard<sys::Mutex> Guard(AC.Lock); - assert(md && "Invalid mdnode for annotation"); - assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands"); + assert(MetadataNode && "Invalid mdnode for annotation"); + assert((MetadataNode->getNumOperands() % 2) == 1 && + "Invalid number of operands"); // start index = 1, to skip the global variable key // increment = 2, to skip the value for each property-value pairs - for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) { + for (unsigned i = 1, e = MetadataNode->getNumOperands(); i != e; i += 2) { // property - const MDString *prop = dyn_cast<MDString>(md->getOperand(i)); + const MDString *prop = dyn_cast<MDString>(MetadataNode->getOperand(i)); assert(prop && "Annotation property not a string"); + std::string Key = prop->getString().str(); // value - ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(md->getOperand(i + 1)); - assert(Val && "Value operand not a constant int"); - - std::string keyname = prop->getString().str(); - if (retval.find(keyname) != retval.end()) - retval[keyname].push_back(Val->getZExtValue()); - else { - std::vector<unsigned> tmp; - tmp.push_back(Val->getZExtValue()); - retval[keyname] = tmp; + if (ConstantInt *Val = mdconst::dyn_extract<ConstantInt>( + MetadataNode->getOperand(i + 1))) { + retval[Key].push_back(Val->getZExtValue()); + } else if (MDNode *VecMd = + dyn_cast<MDNode>(MetadataNode->getOperand(i + 1))) { + // note: only "grid_constant" annotations support vector MDNodes. + // assert: there can only exist one unique key value pair of + // the form (string key, MDNode node). Operands of such a node + // shall always be unsigned ints. 
+ if (retval.find(Key) == retval.end()) { + readIntVecFromMDNode(VecMd, retval[Key]); + continue; + } + } else { + llvm_unreachable("Value operand not a constant int or an mdnode"); } } } @@ -153,9 +170,9 @@ bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, bool isTexture(const Value &val) { if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) { - unsigned annot; - if (findOneNVVMAnnotation(gv, "texture", annot)) { - assert((annot == 1) && "Unexpected annotation on a texture symbol"); + unsigned Annot; + if (findOneNVVMAnnotation(gv, "texture", Annot)) { + assert((Annot == 1) && "Unexpected annotation on a texture symbol"); return true; } } @@ -164,70 +181,67 @@ bool isTexture(const Value &val) { bool isSurface(const Value &val) { if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) { - unsigned annot; - if (findOneNVVMAnnotation(gv, "surface", annot)) { - assert((annot == 1) && "Unexpected annotation on a surface symbol"); + unsigned Annot; + if (findOneNVVMAnnotation(gv, "surface", Annot)) { + assert((Annot == 1) && "Unexpected annotation on a surface symbol"); return true; } } return false; } -bool isSampler(const Value &val) { - const char *AnnotationName = "sampler"; - - if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) { - unsigned annot; - if (findOneNVVMAnnotation(gv, AnnotationName, annot)) { - assert((annot == 1) && "Unexpected annotation on a sampler symbol"); - return true; - } - } - if (const Argument *arg = dyn_cast<Argument>(&val)) { - const Function *func = arg->getParent(); - std::vector<unsigned> annot; - if (findAllNVVMAnnotation(func, AnnotationName, annot)) { - if (is_contained(annot, arg->getArgNo())) +static bool argHasNVVMAnnotation(const Value &Val, + const std::string &Annotation, + const bool StartArgIndexAtOne = false) { + if (const Argument *Arg = dyn_cast<Argument>(&Val)) { + const Function *Func = Arg->getParent(); + std::vector<unsigned> Annot; + if (findAllNVVMAnnotation(Func, Annotation, 
Annot)) { + const unsigned BaseOffset = StartArgIndexAtOne ? 1 : 0; + if (is_contained(Annot, BaseOffset + Arg->getArgNo())) { return true; + } } } return false; } -bool isImageReadOnly(const Value &val) { - if (const Argument *arg = dyn_cast<Argument>(&val)) { - const Function *func = arg->getParent(); - std::vector<unsigned> annot; - if (findAllNVVMAnnotation(func, "rdoimage", annot)) { - if (is_contained(annot, arg->getArgNo())) - return true; +bool isParamGridConstant(const Value &V) { + if (const Argument *Arg = dyn_cast<Argument>(&V)) { + // "grid_constant" counts argument indices starting from 1 + if (Arg->hasByValAttr() && + argHasNVVMAnnotation(*Arg, "grid_constant", /*StartArgIndexAtOne*/true)) { + assert(isKernelFunction(*Arg->getParent()) && + "only kernel arguments can be grid_constant"); + return true; } } return false; } -bool isImageWriteOnly(const Value &val) { - if (const Argument *arg = dyn_cast<Argument>(&val)) { - const Function *func = arg->getParent(); - std::vector<unsigned> annot; - if (findAllNVVMAnnotation(func, "wroimage", annot)) { - if (is_contained(annot, arg->getArgNo())) - return true; +bool isSampler(const Value &val) { + const char *AnnotationName = "sampler"; + + if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) { + unsigned Annot; + if (findOneNVVMAnnotation(gv, AnnotationName, Annot)) { + assert((Annot == 1) && "Unexpected annotation on a sampler symbol"); + return true; } } - return false; + return argHasNVVMAnnotation(val, AnnotationName); +} + +bool isImageReadOnly(const Value &val) { + return argHasNVVMAnnotation(val, "rdoimage"); +} + +bool isImageWriteOnly(const Value &val) { + return argHasNVVMAnnotation(val, "wroimage"); } bool isImageReadWrite(const Value &val) { - if (const Argument *arg = dyn_cast<Argument>(&val)) { - const Function *func = arg->getParent(); - std::vector<unsigned> annot; - if (findAllNVVMAnnotation(func, "rdwrimage", annot)) { - if (is_contained(annot, arg->getArgNo())) - return true; - } - } 
- return false; + return argHasNVVMAnnotation(val, "rdwrimage"); } bool isImage(const Value &val) { @@ -236,9 +250,9 @@ bool isImage(const Value &val) { bool isManaged(const Value &val) { if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) { - unsigned annot; - if (findOneNVVMAnnotation(gv, "managed", annot)) { - assert((annot == 1) && "Unexpected annotation on a managed symbol"); + unsigned Annot; + if (findOneNVVMAnnotation(gv, "managed", Annot)) { + assert((Annot == 1) && "Unexpected annotation on a managed symbol"); return true; } } @@ -323,8 +337,7 @@ bool getMaxNReg(const Function &F, unsigned &x) { bool isKernelFunction(const Function &F) { unsigned x = 0; - bool retval = findOneNVVMAnnotation(&F, "kernel", x); - if (!retval) { + if (!findOneNVVMAnnotation(&F, "kernel", x)) { // There is no NVVM metadata, check the calling convention return F.getCallingConv() == CallingConv::PTX_Kernel; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index e020bc0f02e9..c15ff6cae1f2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -62,6 +62,7 @@ bool getMaxClusterRank(const Function &, unsigned &); bool getMinCTASm(const Function &, unsigned &); bool getMaxNReg(const Function &, unsigned &); bool isKernelFunction(const Function &); +bool isParamGridConstant(const Value &); MaybeAlign getAlign(const Function &, unsigned); MaybeAlign getAlign(const CallInst &, unsigned); diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 29c95e4226bf..4024953bb51d 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -209,7 +209,7 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { // Removing via isInstructionTriviallyDead may add duplicates to the ToRemove // array. Filter out the duplicates before starting to erase from parent. 
std::sort(ToRemove.begin(), ToRemove.end()); - auto NewLastIter = std::unique(ToRemove.begin(), ToRemove.end()); + auto NewLastIter = llvm::unique(ToRemove); ToRemove.erase(NewLastIter, ToRemove.end()); for (Instruction *I : ToRemove) diff --git a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp index 799890928577..53b2d2aa8624 100644 --- a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp +++ b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -165,8 +165,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -195,8 +195,8 @@ FunctionPass *llvm::createPPCBranchCoalescingPass() { INITIALIZE_PASS_BEGIN(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_END(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing", false, false) @@ -214,8 +214,8 @@ void PPCBranchCoalescing::CoalescingCandidateInfo::clear() { } void PPCBranchCoalescing::initialize(MachineFunction &MF) { - MDT = &getAnalysis<MachineDominatorTree>(); - MPDT = &getAnalysis<MachinePostDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); + MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); TII = MF.getSubtarget().getInstrInfo(); MRI = &MF.getRegInfo(); } diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp index 1f9947f6f327..c4190bb9a1c4 100644 --- a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp +++ 
b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp @@ -55,7 +55,7 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -70,7 +70,7 @@ namespace { INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) @@ -160,7 +160,7 @@ queue_preds: } bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before // any other instructions that might clobber the ctr register. diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 201b2d162372..277d708013c7 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -384,7 +384,10 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const { } void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { - bool is31 = needsFP(MF); + // When there is dynamic alloca in this function, we can not use the frame + // pointer X31/R31 for the frameaddress lowering. In this case, only X1/R1 + // always points to the backchain. + bool is31 = needsFP(MF) && !MF.getFrameInfo().hasVarSizedObjects(); unsigned FPReg = is31 ? PPC::R31 : PPC::R1; unsigned FP8Reg = is31 ? 
PPC::X31 : PPC::X1; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 12e33ddb8eb5..9e56b8522fa6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -5608,7 +5608,7 @@ static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee, const SDLoc &dl) { SDValue MTCTROps[] = {Chain, Callee, Glue}; EVT ReturnTypes[] = {MVT::Other, MVT::Glue}; - Chain = DAG.getNode(PPCISD::MTCTR, dl, ArrayRef(ReturnTypes, 2), + Chain = DAG.getNode(PPCISD::MTCTR, dl, ReturnTypes, ArrayRef(MTCTROps, Glue.getNode() ? 3 : 2)); // The glue is the second value produced. Glue = Chain.getValue(1); @@ -10937,10 +10937,10 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::ppc_mma_disassemble_acc: { if (Subtarget.isISAFuture()) { EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; - SDValue WideVec = SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, - ArrayRef(ReturnTypes, 2), - Op.getOperand(1)), - 0); + SDValue WideVec = + SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes, + Op.getOperand(1)), + 0); SmallVector<SDValue, 4> RetOps; SDValue Value = SDValue(WideVec.getNode(), 0); SDValue Value2 = SDValue(WideVec.getNode(), 1); @@ -11609,7 +11609,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, if (Subtarget.isISAFuture()) { EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; MachineSDNode *ExtNode = DAG.getMachineNode( - PPC::DMXXEXTFDMR512, dl, ArrayRef(ReturnTypes, 2), Op.getOperand(1)); + PPC::DMXXEXTFDMR512, dl, ReturnTypes, Op.getOperand(1)); Value = SDValue(ExtNode, 0); Value2 = SDValue(ExtNode, 1); diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index eda5eb975e70..8f5afbae01de 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -292,42 +292,42 @@ def : Pat<(PPCcall_nop_rm (i64 mcsym:$dst)), let Defs = [CR0] in { def 
ATOMIC_LOAD_ADD_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64", - [(set i64:$dst, (atomic_load_add_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_add_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_SUB_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64", - [(set i64:$dst, (atomic_load_sub_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_sub_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_OR_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64", - [(set i64:$dst, (atomic_load_or_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_or_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_XOR_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64", - [(set i64:$dst, (atomic_load_xor_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_xor_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_AND_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64", - [(set i64:$dst, (atomic_load_and_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_and_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_NAND_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64", - [(set i64:$dst, (atomic_load_nand_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_nand_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_MIN_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64", - [(set i64:$dst, (atomic_load_min_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_min_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_MAX_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64", - [(set i64:$dst, 
(atomic_load_max_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_max_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_UMIN_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64", - [(set i64:$dst, (atomic_load_umin_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_umin_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_LOAD_UMAX_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64", - [(set i64:$dst, (atomic_load_umax_64 ForceXForm:$ptr, i64:$incr))]>; + [(set i64:$dst, (atomic_load_umax_i64 ForceXForm:$ptr, i64:$incr))]>; def ATOMIC_CMP_SWAP_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64", - [(set i64:$dst, (atomic_cmp_swap_64 ForceXForm:$ptr, i64:$old, i64:$new))]>; + [(set i64:$dst, (atomic_cmp_swap_i64 ForceXForm:$ptr, i64:$old, i64:$new))]>; def ATOMIC_SWAP_I64 : PPCCustomInserterPseudo< (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64", - [(set i64:$dst, (atomic_swap_64 ForceXForm:$ptr, i64:$new))]>; + [(set i64:$dst, (atomic_swap_i64 ForceXForm:$ptr, i64:$new))]>; } // Instructions to support atomic operations @@ -1036,7 +1036,7 @@ defm DIVDE : XOForm_1rcr<31, 425, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), let Predicates = [IsISA3_0] in { def MADDHD : VAForm_1a<48, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), "maddhd $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; -def MADDHDU : VAForm_1a<49, +def MADDHDU : VAForm_1a<49, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), "maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; def MADDLD : VAForm_1a<51, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB, gprc:$RC), @@ -1044,7 +1044,7 @@ def MADDLD : VAForm_1a<51, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB, gprc:$RC), [(set i32:$RT, (add_without_simm16 (mul_without_simm16 i32:$RA, i32:$RB), i32:$RC))]>, isPPC64; let Interpretation64Bit = 1, 
isCodeGenOnly = 1 in { - def MADDLD8 : VAForm_1a<51, + def MADDLD8 : VAForm_1a<51, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, [(set i64:$RT, (add_without_simm16 (mul_without_simm16 i64:$RA, i64:$RB), i64:$RC))]>, @@ -1349,8 +1349,8 @@ def LWZX8 : XForm_1_memOp<31, 23, (outs g8rc:$RST), (ins (memrr $RA, $RB):$addr "lwzx $RST, $addr", IIC_LdStLoad, [(set i64:$RST, (zextloadi32 XForm:$addr))]>, ZExt32To64; - - + + // Update forms. let mayLoad = 1, hasSideEffects = 0 in { def LBZU8 : DForm_1<35, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), @@ -1635,7 +1635,7 @@ def PADDIdtprel : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm let PPC970_Unit = 2 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { -// Truncating stores. +// Truncating stores. def STB8 : DForm_1<38, (outs), (ins g8rc:$RST, (memri $D, $RA):$addr), "stb $RST, $addr", IIC_LdStStore, [(truncstorei8 i64:$RST, DForm:$addr)]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 09f829943528..1686249c0f89 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -723,7 +723,7 @@ def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">; def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">; // AIX assembler may not be modern enough to support some extended mne. 
-def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">, +def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">, AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>; def IsAIX : Predicate<"Subtarget->isAIXABI()">; def NotAIX : Predicate<"!Subtarget->isAIXABI()">; @@ -1747,114 +1747,114 @@ def : Pat<(int_ppc_dcbtst_with_hint xoaddr:$dst, i32:$TH), let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8", - [(set i32:$dst, (atomic_load_add_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_add_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_SUB_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8", - [(set i32:$dst, (atomic_load_sub_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_sub_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_AND_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8", - [(set i32:$dst, (atomic_load_and_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_and_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_OR_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8", - [(set i32:$dst, (atomic_load_or_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_or_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_XOR_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8", - [(set i32:$dst, (atomic_load_xor_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_xor_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_NAND_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8", - [(set i32:$dst, (atomic_load_nand_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_nand_i8 ForceXForm:$ptr, i32:$incr))]>; def 
ATOMIC_LOAD_MIN_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8", - [(set i32:$dst, (atomic_load_min_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_min_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_MAX_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8", - [(set i32:$dst, (atomic_load_max_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_max_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_UMIN_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8", - [(set i32:$dst, (atomic_load_umin_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_umin_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_UMAX_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8", - [(set i32:$dst, (atomic_load_umax_8 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_umax_i8 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_ADD_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16", - [(set i32:$dst, (atomic_load_add_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_add_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_SUB_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16", - [(set i32:$dst, (atomic_load_sub_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_sub_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_AND_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16", - [(set i32:$dst, (atomic_load_and_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_and_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_OR_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16", - [(set i32:$dst, (atomic_load_or_16 
ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_or_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_XOR_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16", - [(set i32:$dst, (atomic_load_xor_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_xor_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_NAND_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16", - [(set i32:$dst, (atomic_load_nand_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_nand_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_MIN_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16", - [(set i32:$dst, (atomic_load_min_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_min_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_MAX_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16", - [(set i32:$dst, (atomic_load_max_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_max_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_UMIN_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16", - [(set i32:$dst, (atomic_load_umin_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_umin_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_UMAX_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16", - [(set i32:$dst, (atomic_load_umax_16 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_umax_i16 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_ADD_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32", - [(set i32:$dst, (atomic_load_add_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_add_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_SUB_I32 : 
PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32", - [(set i32:$dst, (atomic_load_sub_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_sub_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_AND_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32", - [(set i32:$dst, (atomic_load_and_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_and_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_OR_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32", - [(set i32:$dst, (atomic_load_or_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_or_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_XOR_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32", - [(set i32:$dst, (atomic_load_xor_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_xor_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_NAND_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32", - [(set i32:$dst, (atomic_load_nand_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_nand_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_MIN_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32", - [(set i32:$dst, (atomic_load_min_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_min_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_MAX_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32", - [(set i32:$dst, (atomic_load_max_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_max_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_UMIN_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32", - [(set i32:$dst, (atomic_load_umin_32 
ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_umin_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_LOAD_UMAX_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32", - [(set i32:$dst, (atomic_load_umax_32 ForceXForm:$ptr, i32:$incr))]>; + [(set i32:$dst, (atomic_load_umax_i32 ForceXForm:$ptr, i32:$incr))]>; def ATOMIC_CMP_SWAP_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8", - [(set i32:$dst, (atomic_cmp_swap_8 ForceXForm:$ptr, i32:$old, i32:$new))]>; + [(set i32:$dst, (atomic_cmp_swap_i8 ForceXForm:$ptr, i32:$old, i32:$new))]>; def ATOMIC_CMP_SWAP_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new", - [(set i32:$dst, (atomic_cmp_swap_16 ForceXForm:$ptr, i32:$old, i32:$new))]>; + [(set i32:$dst, (atomic_cmp_swap_i16 ForceXForm:$ptr, i32:$old, i32:$new))]>; def ATOMIC_CMP_SWAP_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new", - [(set i32:$dst, (atomic_cmp_swap_32 ForceXForm:$ptr, i32:$old, i32:$new))]>; + [(set i32:$dst, (atomic_cmp_swap_i32 ForceXForm:$ptr, i32:$old, i32:$new))]>; def ATOMIC_SWAP_I8 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8", - [(set i32:$dst, (atomic_swap_8 ForceXForm:$ptr, i32:$new))]>; + [(set i32:$dst, (atomic_swap_i8 ForceXForm:$ptr, i32:$new))]>; def ATOMIC_SWAP_I16 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16", - [(set i32:$dst, (atomic_swap_16 ForceXForm:$ptr, i32:$new))]>; + [(set i32:$dst, (atomic_swap_i16 ForceXForm:$ptr, i32:$new))]>; def ATOMIC_SWAP_I32 : PPCCustomInserterPseudo< (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32", - [(set i32:$dst, (atomic_swap_32 ForceXForm:$ptr, i32:$new))]>; + [(set i32:$dst, (atomic_swap_i32 
ForceXForm:$ptr, i32:$new))]>; } def : Pat<(PPCatomicCmpSwap_8 ForceXForm:$ptr, i32:$old, i32:$new), diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index c6db8a7bbeb8..0b515c9f798f 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -156,12 +156,12 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveVariables>(); - AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); AU.addRequired<MachineBlockFrequencyInfo>(); AU.addPreserved<LiveVariables>(); - AU.addPreserved<MachineDominatorTree>(); - AU.addPreserved<MachinePostDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); AU.addPreserved<MachineBlockFrequencyInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -200,8 +200,8 @@ void PPCMIPeephole::addRegToUpdateWithLine(Register Reg, int Line) { void PPCMIPeephole::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); - MPDT = &getAnalysis<MachinePostDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); + MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); LV = &getAnalysis<LiveVariables>(); EntryFreq = MBFI->getEntryFreq(); @@ -448,6 +448,9 @@ void PPCMIPeephole::convertUnprimedAccPHIs( if (MRI->isSSA()) addRegToUpdate(RegMBB.first.getReg()); } + // The liveness of old PHI and new PHI have to be updated. 
+ addRegToUpdate(PHI->getOperand(0).getReg()); + addRegToUpdate(AccReg); ChangedPHIMap[PHI] = NewPHI.getInstr(); LLVM_DEBUG(dbgs() << "Converting PHI: "); LLVM_DEBUG(PHI->dump()); @@ -2029,8 +2032,8 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI, INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, "PowerPC MI Peephole Optimization", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, "PowerPC MI Peephole Optimization", false, false) diff --git a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 0504db239f67..d1cc2ad5c481 100644 --- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -427,7 +427,7 @@ public: CRLogicalOpInfo createCRLogicalOpInfo(MachineInstr &MI); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBranchProbabilityInfo>(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -730,7 +730,7 @@ void PPCReduceCRLogicals::collectCRLogicals() { INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE, "PowerPC Reduce CR logical Operation", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(PPCReduceCRLogicals, DEBUG_TYPE, "PowerPC Reduce CR logical Operation", false, false) diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 8a37e40414ee..fdbdc14736c8 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ 
b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -199,7 +199,7 @@ let SubRegIndices = [sub_vsx0, sub_vsx1] in { def VSRp#!add(!srl(Index, 1), 16) : VSRPair<!add(!srl(Index, 1), 16), "vsp"#!add(Index, 32), [!cast<VR>("V"#Index), !cast<VR>("V"#!add(Index, 1))]>, - DwarfRegNum<[-1, -1]>; + DwarfRegAlias<!cast<VR>("V"#Index)>; } } diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 0628fbb26245..bd9af12b30f5 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -38,11 +38,6 @@ using namespace llvm; #include "PPCGenSubtargetInfo.inc" static cl::opt<bool> - UseSubRegLiveness("ppc-track-subreg-liveness", - cl::desc("Enable subregister liveness tracking for PPC"), - cl::init(true), cl::Hidden); - -static cl::opt<bool> EnableMachinePipeliner("ppc-enable-pipeliner", cl::desc("Enable Machine Pipeliner for PPC"), cl::init(false), cl::Hidden); @@ -186,9 +181,7 @@ bool PPCSubtarget::useAA() const { return true; } -bool PPCSubtarget::enableSubRegLiveness() const { - return UseSubRegLiveness; -} +bool PPCSubtarget::enableSubRegLiveness() const { return true; } bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const { if (isAIXABI()) { diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 0d8c71f9f2e6..69e046972f3d 100644 --- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -368,8 +368,8 @@ public: AU.addPreserved<LiveIntervals>(); AU.addRequired<SlotIndexes>(); AU.addPreserved<SlotIndexes>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -379,7 +379,7 @@ INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE, "PowerPC VSX FMA Mutation", false, false) 
INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE, "PowerPC VSX FMA Mutation", false, false) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 5906a2cdb3bf..8ac1cdf0a7a9 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2155,6 +2155,16 @@ bool RISCVAsmParser::parseVTypeToken(const AsmToken &Tok, VTypeState &State, break; if (!RISCVVType::isValidLMUL(Lmul, Fractional)) break; + + if (Fractional) { + unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 64 : 32; + unsigned MinLMUL = ELEN / 8; + if (Lmul > MinLMUL) + Warning(Tok.getLoc(), + "use of vtype encodings with LMUL < SEWMIN/ELEN == mf" + + Twine(MinLMUL) + " is reserved"); + } + State = VTypeState_TailPolicy; return false; } @@ -2194,6 +2204,7 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { bool MaskAgnostic = false; VTypeState State = VTypeState_SEW; + SMLoc SEWLoc = S; if (parseVTypeToken(getTok(), State, Sew, Lmul, Fractional, TailAgnostic, MaskAgnostic)) @@ -2211,6 +2222,16 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { if (getLexer().is(AsmToken::EndOfStatement) && State == VTypeState_Done) { RISCVII::VLMUL VLMUL = RISCVVType::encodeLMUL(Lmul, Fractional); + if (Fractional) { + unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 64 : 32; + unsigned MaxSEW = ELEN / Lmul; + // If MaxSEW < 8, we should have printed warning about reserved LMUL. 
+ if (MaxSEW >= 8 && Sew > MaxSEW) + Warning(SEWLoc, + "use of vtype encodings with SEW > " + Twine(MaxSEW) + + " and LMUL == mf" + Twine(Lmul) + + " may not be compatible with all RVV implementations"); + } unsigned VTypeI = RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index beee9405de02..b5f8715598f3 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -102,9 +102,14 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler { void assignValueToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) override { - // If we're passing an f32 value into an i64, anyextend before copying. - if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) - ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(64), ValVReg).getReg(0); + // If we're passing a smaller fp value into a larger integer register, + // anyextend before copying. + if ((VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) || + ((VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::i64) && + VA.getValVT() == MVT::f16)) { + LLT DstTy = LLT::scalar(VA.getLocVT().getSizeInBits()); + ValVReg = MIRBuilder.buildAnyExt(DstTy, ValVReg).getReg(0); + } Register ExtReg = extendRegister(ValVReg, VA); MIRBuilder.buildCopy(PhysReg, ExtReg); @@ -336,10 +341,8 @@ static bool isLegalElementTypeForRVV(Type *EltTy, // TODO: Remove IsLowerArgs argument by adding support for vectors in lowerCall. static bool isSupportedArgumentType(Type *T, const RISCVSubtarget &Subtarget, bool IsLowerArgs = false) { - // TODO: Integers larger than 2*XLen are passed indirectly which is not - // supported yet. 
if (T->isIntegerTy()) - return T->getIntegerBitWidth() <= Subtarget.getXLen() * 2; + return true; if (T->isHalfTy() || T->isFloatTy() || T->isDoubleTy()) return true; if (T->isPointerTy()) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index a091380e8ce8..f511a2010980 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -577,12 +577,14 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { const APFloat &FPimm = MI.getOperand(1).getFPImm()->getValueAPF(); APInt Imm = FPimm.bitcastToAPInt(); unsigned Size = MRI.getType(DstReg).getSizeInBits(); - if (Size == 32 || (Size == 64 && Subtarget->is64Bit())) { + if (Size == 16 || Size == 32 || (Size == 64 && Subtarget->is64Bit())) { Register GPRReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); if (!materializeImm(GPRReg, Imm.getSExtValue(), MIB)) return false; - unsigned Opcode = Size == 64 ? RISCV::FMV_D_X : RISCV::FMV_W_X; + unsigned Opcode = Size == 64 ? RISCV::FMV_D_X + : Size == 32 ? RISCV::FMV_W_X + : RISCV::FMV_H_X; auto FMV = MIB.buildInstr(Opcode, {DstReg}, {GPRReg}); if (!FMV.constrainAllUses(TII, TRI, RBI)) return false; @@ -1146,16 +1148,16 @@ bool RISCVInstructionSelector::selectSelect(MachineInstr &MI, // Convert an FCMP predicate to one of the supported F or D instructions. static unsigned getFCmpOpcode(CmpInst::Predicate Pred, unsigned Size) { - assert((Size == 32 || Size == 64) && "Unsupported size"); + assert((Size == 16 || Size == 32 || Size == 64) && "Unsupported size"); switch (Pred) { default: llvm_unreachable("Unsupported predicate"); case CmpInst::FCMP_OLT: - return Size == 32 ? RISCV::FLT_S : RISCV::FLT_D; + return Size == 16 ? RISCV::FLT_H : Size == 32 ? RISCV::FLT_S : RISCV::FLT_D; case CmpInst::FCMP_OLE: - return Size == 32 ? RISCV::FLE_S : RISCV::FLE_D; + return Size == 16 ? RISCV::FLE_H : Size == 32 ? 
RISCV::FLE_S : RISCV::FLE_D; case CmpInst::FCMP_OEQ: - return Size == 32 ? RISCV::FEQ_S : RISCV::FEQ_D; + return Size == 16 ? RISCV::FEQ_H : Size == 32 ? RISCV::FEQ_S : RISCV::FEQ_D; } } @@ -1207,7 +1209,7 @@ bool RISCVInstructionSelector::selectFPCompare(MachineInstr &MI, Register RHS = CmpMI.getRHSReg(); unsigned Size = MRI.getType(LHS).getSizeInBits(); - assert((Size == 32 || Size == 64) && "Unexpected size"); + assert((Size == 16 || Size == 32 || Size == 64) && "Unexpected size"); Register TmpReg = DstReg; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 816e62ea24ed..f033ea725003 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -35,7 +35,8 @@ static LegalityPredicate typeIsScalarFPArith(unsigned TypeIdx, const RISCVSubtarget &ST) { return [=, &ST](const LegalityQuery &Query) { return Query.Types[TypeIdx].isScalar() && - ((ST.hasStdExtF() && Query.Types[TypeIdx].getSizeInBits() == 32) || + ((ST.hasStdExtZfh() && Query.Types[TypeIdx].getSizeInBits() == 16) || + (ST.hasStdExtF() && Query.Types[TypeIdx].getSizeInBits() == 32) || (ST.hasStdExtD() && Query.Types[TypeIdx].getSizeInBits() == 64)); }; } @@ -305,7 +306,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder({G_GLOBAL_VALUE, G_JUMP_TABLE, G_CONSTANT_POOL}) .legalFor({p0}); - if (ST.hasStdExtM() || ST.hasStdExtZmmul()) { + if (ST.hasStdExtZmmul()) { getActionDefinitionsBuilder(G_MUL) .legalFor({s32, sXLen}) .widenScalarToNextPow2(0) @@ -383,15 +384,24 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder(G_FCOPYSIGN) .legalIf(all(typeIsScalarFPArith(0, ST), typeIsScalarFPArith(1, ST))); + // FIXME: Use Zfhmin. 
getActionDefinitionsBuilder(G_FPTRUNC).legalIf( [=, &ST](const LegalityQuery &Query) -> bool { return (ST.hasStdExtD() && typeIs(0, s32)(Query) && + typeIs(1, s64)(Query)) || + (ST.hasStdExtZfh() && typeIs(0, s16)(Query) && + typeIs(1, s32)(Query)) || + (ST.hasStdExtZfh() && ST.hasStdExtD() && typeIs(0, s16)(Query) && typeIs(1, s64)(Query)); }); getActionDefinitionsBuilder(G_FPEXT).legalIf( [=, &ST](const LegalityQuery &Query) -> bool { return (ST.hasStdExtD() && typeIs(0, s64)(Query) && - typeIs(1, s32)(Query)); + typeIs(1, s32)(Query)) || + (ST.hasStdExtZfh() && typeIs(0, s32)(Query) && + typeIs(1, s16)(Query)) || + (ST.hasStdExtZfh() && ST.hasStdExtD() && typeIs(0, s64)(Query) && + typeIs(1, s16)(Query)); }); getActionDefinitionsBuilder(G_FCMP) @@ -462,13 +472,6 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getLegacyLegalizerInfo().computeTables(); } -static Type *getTypeForLLT(LLT Ty, LLVMContext &C) { - if (Ty.isVector()) - return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()), - Ty.getNumElements()); - return IntegerType::get(C, Ty.getSizeInBits()); -} - bool RISCVLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp index 9c28944abc76..8fa9dba28538 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp @@ -112,8 +112,8 @@ void RISCVPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { getSelectionDAGFallbackAnalysisUsage(AU); AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); 
AU.addRequired<GISelCSEAnalysisWrapperPass>(); AU.addPreserved<GISelCSEAnalysisWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -143,7 +143,8 @@ bool RISCVPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const auto *LI = ST.getLegalizerInfo(); GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); - MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *MDT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); GISelCSEAnalysisWrapper &Wrapper = getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp index 9a35fffae058..6a695119be25 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp @@ -109,8 +109,8 @@ void RISCVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { getSelectionDAGFallbackAnalysisUsage(AU); AU.addRequired<GISelKnownBitsAnalysis>(); AU.addPreserved<GISelKnownBitsAnalysis>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<GISelCSEAnalysisWrapperPass>(); AU.addPreserved<GISelCSEAnalysisWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -142,7 +142,8 @@ bool RISCVPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); - MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *MDT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ 
false, /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp index 686c8d89a732..d25e96525399 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp @@ -29,6 +29,7 @@ const RegisterBankInfo::PartialMapping PartMappings[] = { // clang-format off {0, 32, GPRBRegBank}, {0, 64, GPRBRegBank}, + {0, 16, FPRBRegBank}, {0, 32, FPRBRegBank}, {0, 64, FPRBRegBank}, {0, 64, VRBRegBank}, @@ -41,12 +42,13 @@ const RegisterBankInfo::PartialMapping PartMappings[] = { enum PartialMappingIdx { PMI_GPRB32 = 0, PMI_GPRB64 = 1, - PMI_FPRB32 = 2, - PMI_FPRB64 = 3, - PMI_VRB64 = 4, - PMI_VRB128 = 5, - PMI_VRB256 = 6, - PMI_VRB512 = 7, + PMI_FPRB16 = 2, + PMI_FPRB32 = 3, + PMI_FPRB64 = 4, + PMI_VRB64 = 5, + PMI_VRB128 = 6, + PMI_VRB256 = 7, + PMI_VRB512 = 8, }; const RegisterBankInfo::ValueMapping ValueMappings[] = { @@ -60,6 +62,10 @@ const RegisterBankInfo::ValueMapping ValueMappings[] = { {&PartMappings[PMI_GPRB64], 1}, {&PartMappings[PMI_GPRB64], 1}, {&PartMappings[PMI_GPRB64], 1}, + // Maximum 3 FPR operands; 16 bit. + {&PartMappings[PMI_FPRB16], 1}, + {&PartMappings[PMI_FPRB16], 1}, + {&PartMappings[PMI_FPRB16], 1}, // Maximum 3 FPR operands; 32 bit. 
{&PartMappings[PMI_FPRB32], 1}, {&PartMappings[PMI_FPRB32], 1}, @@ -90,12 +96,13 @@ enum ValueMappingIdx { InvalidIdx = 0, GPRB32Idx = 1, GPRB64Idx = 4, - FPRB32Idx = 7, - FPRB64Idx = 10, - VRB64Idx = 13, - VRB128Idx = 16, - VRB256Idx = 19, - VRB512Idx = 22, + FPRB16Idx = 7, + FPRB32Idx = 10, + FPRB64Idx = 13, + VRB64Idx = 16, + VRB128Idx = 19, + VRB256Idx = 22, + VRB512Idx = 25, }; } // namespace RISCV } // namespace llvm @@ -151,8 +158,20 @@ RISCVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, } static const RegisterBankInfo::ValueMapping *getFPValueMapping(unsigned Size) { - assert(Size == 32 || Size == 64); - unsigned Idx = Size == 64 ? RISCV::FPRB64Idx : RISCV::FPRB32Idx; + unsigned Idx; + switch (Size) { + default: + llvm_unreachable("Unexpected size"); + case 16: + Idx = RISCV::FPRB16Idx; + break; + case 32: + Idx = RISCV::FPRB32Idx; + break; + case 64: + Idx = RISCV::FPRB64Idx; + break; + } return &RISCV::ValueMappings[Idx]; } @@ -459,7 +478,6 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLT Ty = MRI.getType(MI.getOperand(2).getReg()); unsigned Size = Ty.getSizeInBits(); - assert((Size == 32 || Size == 64) && "Unsupported size for G_FCMP"); OpdsMapping[0] = GPRValueMapping; OpdsMapping[2] = OpdsMapping[3] = getFPValueMapping(Size); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index ae7ce476fff2..87c5a756e025 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -159,8 +159,7 @@ void RISCVELFStreamer::emitMappingSymbol(StringRef Name) { Symbol->setBinding(ELF::STB_LOCAL); } -void RISCVELFStreamer::changeSection(MCSection *Section, - const MCExpr *Subsection) { +void RISCVELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { // We have to keep track of the mapping symbol state of any sections we // use. 
Each one should start off as EMS_None, which is provided as the // default constructor by DenseMap::lookup. diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h index 212d731889f1..40c6b5ac3dcc 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h @@ -32,7 +32,7 @@ public: std::unique_ptr<MCCodeEmitter> MCE) : MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {} - void changeSection(MCSection *Section, const MCExpr *Subsection) override; + void changeSection(MCSection *Section, uint32_t Subsection) override; void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; void emitBytes(StringRef Data) override; void emitFill(const MCExpr &NumBytes, uint64_t FillValue, SMLoc Loc) override; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index b8e0f3a867f4..d83dadd30161 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -62,7 +62,7 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const { uint64_t Offset = AUIPCSymbol->getOffset(); if (DF->getContents().size() == Offset) { - DF = dyn_cast_or_null<MCDataFragment>(DF->getNextNode()); + DF = dyn_cast_or_null<MCDataFragment>(DF->getNext()); if (!DF) return nullptr; Offset = 0; diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 09f496574d64..d96fafbe6080 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -51,6 +51,7 @@ include "RISCVSchedSiFive7.td" include "RISCVSchedSiFiveP400.td" include "RISCVSchedSiFiveP600.td" include "RISCVSchedSyntacoreSCR1.td" +include "RISCVSchedSyntacoreSCR3.td" include "RISCVSchedXiangShanNanHu.td" //===----------------------------------------------------------------------===// diff --git 
a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 9bf06850483d..a5e34def81c8 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -171,23 +171,21 @@ def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">; // Multiply Extensions +def FeatureStdExtZmmul + : RISCVExtension<"zmmul", 1, 0, + "'Zmmul' (Integer Multiplication)">; +def HasStdExtZmmul : Predicate<"Subtarget->hasStdExtZmmul()">, + AssemblerPredicate<(all_of FeatureStdExtZmmul), + "'Zmmul' (Integer Multiplication)">; + def FeatureStdExtM : RISCVExtension<"m", 2, 0, - "'M' (Integer Multiplication and Division)">; + "'M' (Integer Multiplication and Division)", + [FeatureStdExtZmmul]>; def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">, AssemblerPredicate<(all_of FeatureStdExtM), "'M' (Integer Multiplication and Division)">; -def FeatureStdExtZmmul - : RISCVExtension<"zmmul", 1, 0, - "'Zmmul' (Integer Multiplication)">; - -def HasStdExtMOrZmmul - : Predicate<"Subtarget->hasStdExtM() || Subtarget->hasStdExtZmmul()">, - AssemblerPredicate<(any_of FeatureStdExtM, FeatureStdExtZmmul), - "'M' (Integer Multiplication and Division) or " - "'Zmmul' (Integer Multiplication)">; - // Atomic Extensions def FeatureStdExtA @@ -477,6 +475,14 @@ def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">, // Bitmanip Extensions for Cryptography Extensions +def FeatureStdExtB + : RISCVExtension<"b", 1, 0, + "'B' (the collection of the Zba, Zbb, Zbs extensions)", + [FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbs]>; +def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">, + AssemblerPredicate<(all_of FeatureStdExtB), + "'B' (the collection of the Zba, Zbb, Zbs extensions)">; + def FeatureStdExtZbkb : RISCVExtension<"zbkb", 1, 0, "'Zbkb' (Bitmanip instructions for Cryptography)">; @@ -847,10 +853,24 @@ def FeatureStdExtSsaia "'Ssaia' (Advanced Interrupt Architecture Supervisor " "Level)">; +def FeatureStdExtSmcsrind + : 
RISCVExtension<"smcsrind", 1, 0, + "'Smcsrind' (Indirect CSR Access Machine Level)">; +def FeatureStdExtSscsrind + : RISCVExtension<"sscsrind", 1, 0, + "'Sscsrind' (Indirect CSR Access Supervisor Level)">; + def FeatureStdExtSmepmp : RISCVExtension<"smepmp", 1, 0, "'Smepmp' (Enhanced Physical Memory Protection)">; +def FeatureStdExtSmcdeleg + : RISCVExtension<"smcdeleg", 1, 0, + "'Smcdeleg' (Counter Delegation Machine Level)">; +def FeatureStdExtSsccfg + : RISCVExtension<"ssccfg", 1, 0, + "'Ssccfg' (Counter Configuration Supervisor Level)">; + def FeatureStdExtSsccptr : RISCVExtension<"ssccptr", 1, 0, "'Ssccptr' (Main memory supports page table reads)">; @@ -883,7 +903,7 @@ def FeatureStdExtSstc : RISCVExtension<"sstc", 1, 0, "'Sstc' (Supervisor-mode timer interrupts)">; -def FeaturesSsqosid +def FeaturesStdExtSsqosid : RISCVExperimentalExtension<"ssqosid", 1, 0, "'Ssqosid' (Quality-of-Service (QoS) Identifiers)">; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 436bd4a38a31..e676c2f94583 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -878,9 +878,9 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, StackID == TargetStackID::ScalableVector) && "Unexpected stack ID for the frame object."); if (StackID == TargetStackID::Default) { - Offset = - StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + - MFI.getOffsetAdjustment()); + assert(getOffsetOfLocalArea() == 0 && "LocalAreaOffset is not 0!"); + Offset = StackOffset::getFixed(MFI.getObjectOffset(FI) + + MFI.getOffsetAdjustment()); } else if (StackID == TargetStackID::ScalableVector) { Offset = StackOffset::getScalable(MFI.getObjectOffset(FI)); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index af3950773e4d..a648ee2c9571 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ 
b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -300,7 +300,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::MULO_I64, nullptr); } - if (!Subtarget.hasStdExtM() && !Subtarget.hasStdExtZmmul()) { + if (!Subtarget.hasStdExtZmmul()) { setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU}, XLenVT, Expand); if (RV64LegalI32 && Subtarget.is64Bit()) setOperationAction(ISD::MUL, MVT::i32, Promote); @@ -662,6 +662,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setBooleanContents(ZeroOrOneBooleanContent); + if (getTargetMachine().getTargetTriple().isOSLinux()) { + // Custom lowering of llvm.clear_cache. + setOperationAction(ISD::CLEAR_CACHE, MVT::Other, Custom); + } + if (Subtarget.hasVInstructions()) { setBooleanVectorContents(ZeroOrOneBooleanContent); @@ -1102,12 +1107,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXTRACT_SUBVECTOR}, VT, Custom); setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); - if (Subtarget.hasStdExtZfbfmin()) { - if (Subtarget.hasVInstructionsF16()) - setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); - else if (Subtarget.hasVInstructionsF16Minimal()) - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - } + if (Subtarget.hasStdExtZfbfmin()) + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); @@ -1340,12 +1341,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXTRACT_SUBVECTOR}, VT, Custom); setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); - if (Subtarget.hasStdExtZfbfmin()) { - if (Subtarget.hasVInstructionsF16()) - setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); - else if (Subtarget.hasVInstructionsF16Minimal()) - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - } + if (Subtarget.hasStdExtZfbfmin()) + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction( 
{ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); @@ -6738,9 +6735,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, Subtarget.hasStdExtZfhminOrZhinxmin() && !Subtarget.hasVInstructionsF16())) || (Op.getValueType().getScalarType() == MVT::bf16 && - (Subtarget.hasVInstructionsBF16() && Subtarget.hasStdExtZfbfmin() && - Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16()))) { + (Subtarget.hasVInstructionsBF16() && Subtarget.hasStdExtZfbfmin()))) { if (Op.getValueType() == MVT::nxv32f16 || Op.getValueType() == MVT::nxv32bf16) return SplitVectorOp(Op, DAG); @@ -7152,7 +7147,27 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerVPSpliceExperimental(Op, DAG); case ISD::EXPERIMENTAL_VP_REVERSE: return lowerVPReverseExperimental(Op, DAG); + case ISD::CLEAR_CACHE: { + assert(getTargetMachine().getTargetTriple().isOSLinux() && + "llvm.clear_cache only needs custom lower on Linux targets"); + SDLoc DL(Op); + SDValue Flags = DAG.getConstant(0, DL, Subtarget.getXLenVT()); + return emitFlushICache(DAG, Op.getOperand(0), Op.getOperand(1), + Op.getOperand(2), Flags, DL); } + } +} + +SDValue RISCVTargetLowering::emitFlushICache(SelectionDAG &DAG, SDValue InChain, + SDValue Start, SDValue End, + SDValue Flags, SDLoc DL) const { + MakeLibCallOptions CallOptions; + std::pair<SDValue, SDValue> CallResult = + makeLibCall(DAG, RTLIB::RISCV_FLUSH_ICACHE, MVT::isVoid, + {Start, End, Flags}, CallOptions, DL, InChain); + + // This function returns void so only the out chain matters. 
+ return CallResult.second; } static SDValue getTargetNode(GlobalAddressSDNode *N, const SDLoc &DL, EVT Ty, @@ -12487,12 +12502,15 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, } break; } - case RISCVISD::BREV8: { + case RISCVISD::BREV8: + case RISCVISD::ORC_B: { MVT VT = N->getSimpleValueType(0); MVT XLenVT = Subtarget.getXLenVT(); assert((VT == MVT::i16 || (VT == MVT::i32 && Subtarget.is64Bit())) && "Unexpected custom legalisation"); - assert(Subtarget.hasStdExtZbkb() && "Unexpected extension"); + assert(((N->getOpcode() == RISCVISD::BREV8 && Subtarget.hasStdExtZbkb()) || + (N->getOpcode() == RISCVISD::ORC_B && Subtarget.hasStdExtZbb())) && + "Unexpected extension"); SDValue NewOp = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, N->getOperand(0)); SDValue NewRes = DAG.getNode(N->getOpcode(), DL, XLenVT, NewOp); // ReplaceNodeResults requires we maintain the same type for the return @@ -13330,6 +13348,35 @@ static SDValue combineSubOfBoolean(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::ADD, DL, VT, NewLHS, NewRHS); } +// Looks for (sub (shl X, 8), X) where only bits 8, 16, 24, 32, etc. of X are +// non-zero. Replace with orc.b. 
+static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (!Subtarget.hasStdExtZbb()) + return SDValue(); + + EVT VT = N->getValueType(0); + + if (VT != Subtarget.getXLenVT() && VT != MVT::i32 && VT != MVT::i16) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N0.getOpcode() != ISD::SHL || N0.getOperand(0) != N1 || !N0.hasOneUse()) + return SDValue(); + + auto *ShAmtC = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (!ShAmtC || ShAmtC->getZExtValue() != 8) + return SDValue(); + + APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0xfe)); + if (!DAG.MaskedValueIsZero(N1, Mask)) + return SDValue(); + + return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, N1); +} + static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { if (SDValue V = combineSubOfBoolean(N, DAG)) @@ -13352,6 +13399,8 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBinOpOfZExt(N, DAG)) return V; + if (SDValue V = combineSubShiftToOrcB(N, DAG, Subtarget)) + return V; // fold (sub x, (select lhs, rhs, cc, 0, y)) -> // (select lhs, rhs, cc, x, (sub x, y)) @@ -13691,8 +13740,8 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (VT != Subtarget.getXLenVT()) return SDValue(); - if (!Subtarget.hasStdExtZba() && !Subtarget.hasVendorXTHeadBa()) - return SDValue(); + const bool HasShlAdd = + Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa(); ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!CNode) @@ -13705,107 +13754,123 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, // other target properly freezes X in these cases either. 
SDValue X = N->getOperand(0); - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 2^N -> shl (shXadd X, X), N - if (isPowerOf2_64(MulAmt2)) { - SDLoc DL(N); - SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the - // shift forming a slli.uw. - if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && - X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) { - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, - DAG.getConstant(Log2_64(MulAmt2), DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), Shl); + if (HasShlAdd) { + for (uint64_t Divisor : {3, 5, 9}) { + if (MulAmt % Divisor != 0) + continue; + uint64_t MulAmt2 = MulAmt / Divisor; + // 3/5/9 * 2^N -> shl (shXadd X, X), N + if (isPowerOf2_64(MulAmt2)) { + SDLoc DL(N); + SDValue X = N->getOperand(0); + // Put the shift first if we can fold a zext into the + // shift forming a slli.uw. + if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && + X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) { + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, + DAG.getConstant(Log2_64(MulAmt2), DL, VT)); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, + DAG.getConstant(Log2_64(Divisor - 1), DL, VT), + Shl); + } + // Otherwise, put rhe shl second so that it can fold with following + // instructions (e.g. sext or add). 
+ SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); + return DAG.getNode(ISD::SHL, DL, VT, Mul359, + DAG.getConstant(Log2_64(MulAmt2), DL, VT)); + } + + // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) + if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) { + SDLoc DL(N); + SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT), + Mul359); } - // Otherwise, put rhe shl second so that it can fold with following - // instructions (e.g. sext or add). - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - return DAG.getNode(ISD::SHL, DL, VT, Mul359, - DAG.getConstant(Log2_64(MulAmt2), DL, VT)); } - // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) { - SDLoc DL(N); - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT), - Mul359); - } - } - - // If this is a power 2 + 2/4/8, we can use a shift followed by a single - // shXadd. First check if this a sum of two power of 2s because that's - // easy. Then count how many zeros are up to the first bit. - if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { - unsigned ScaleShift = llvm::countr_zero(MulAmt); - if (ScaleShift >= 1 && ScaleShift < 4) { - unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); - SDLoc DL(N); - SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); + // If this is a power 2 + 2/4/8, we can use a shift followed by a single + // shXadd. 
First check if this a sum of two power of 2s because that's + // easy. Then count how many zeros are up to the first bit. + if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { + unsigned ScaleShift = llvm::countr_zero(MulAmt); + if (ScaleShift >= 1 && ScaleShift < 4) { + unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); + SDLoc DL(N); + SDValue Shift1 = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ScaleShift, DL, VT), Shift1); + } } - } - // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x) - // This is the two instruction form, there are also three instruction - // variants we could implement. e.g. - // (2^(1,2,3) * 3,5,9 + 1) << C2 - // 2^(C1>3) * 3,5,9 +/- 1 - for (uint64_t Divisor : {3, 5, 9}) { - uint64_t C = MulAmt - 1; - if (C <= Divisor) - continue; - unsigned TZ = llvm::countr_zero(C); - if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) { - SDLoc DL(N); - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(TZ, DL, VT), X); + // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x) + // This is the two instruction form, there are also three instruction + // variants we could implement. e.g. 
+ // (2^(1,2,3) * 3,5,9 + 1) << C2 + // 2^(C1>3) * 3,5,9 +/- 1 + for (uint64_t Divisor : {3, 5, 9}) { + uint64_t C = MulAmt - 1; + if (C <= Divisor) + continue; + unsigned TZ = llvm::countr_zero(C); + if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) { + SDLoc DL(N); + SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getConstant(TZ, DL, VT), X); + } } - } - // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) - if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { - unsigned ScaleShift = llvm::countr_zero(MulAmt - 1); - if (ScaleShift >= 1 && ScaleShift < 4) { - unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2))); - SDLoc DL(N); - SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) + if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { + unsigned ScaleShift = llvm::countr_zero(MulAmt - 1); + if (ScaleShift >= 1 && ScaleShift < 4) { + unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2))); + SDLoc DL(N); + SDValue Shift1 = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); + return DAG.getNode(ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ScaleShift, DL, VT), X)); + } } - } - // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x)) - for (uint64_t Offset : {3, 5, 9}) { - if (isPowerOf2_64(MulAmt + Offset)) { - SDLoc DL(N); - SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, - DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT)); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), - X); - return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); + // 2^N - 3/5/9 --> 
(sub (shl X, C1), (shXadd X, x)) + for (uint64_t Offset : {3, 5, 9}) { + if (isPowerOf2_64(MulAmt + Offset)) { + SDLoc DL(N); + SDValue Shift1 = + DAG.getNode(ISD::SHL, DL, VT, X, + DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT)); + SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); + } } } + // 2^N - 2^M -> (sub (shl X, C1), (shl X, C2)) + uint64_t MulAmtLowBit = MulAmt & (-MulAmt); + if (isPowerOf2_64(MulAmt + MulAmtLowBit)) { + uint64_t ShiftAmt1 = MulAmt + MulAmtLowBit; + SDLoc DL(N); + SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(ShiftAmt1), DL, VT)); + SDValue Shift2 = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmtLowBit), DL, VT)); + return DAG.getNode(ISD::SUB, DL, VT, Shift1, Shift2); + } + return SDValue(); } @@ -16829,7 +16894,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAG.getNode(ISD::FNEG, DL, VT, NewFPExtRound)); } case ISD::MGATHER: { - const auto *MGN = dyn_cast<MaskedGatherSDNode>(N); + const auto *MGN = cast<MaskedGatherSDNode>(N); const EVT VT = N->getValueType(0); SDValue Index = MGN->getIndex(); SDValue ScaleOp = MGN->getScale(); @@ -16929,7 +16994,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case ISD::MSCATTER:{ - const auto *MSN = dyn_cast<MaskedScatterSDNode>(N); + const auto *MSN = cast<MaskedScatterSDNode>(N); SDValue Index = MSN->getIndex(); SDValue ScaleOp = MSN->getScale(); ISD::MemIndexType IndexType = MSN->getIndexType(); @@ -16965,7 +17030,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case ISD::VP_GATHER: { - const auto *VPGN = dyn_cast<VPGatherSDNode>(N); + const auto *VPGN = cast<VPGatherSDNode>(N); SDValue Index = VPGN->getIndex(); SDValue ScaleOp = VPGN->getScale(); ISD::MemIndexType IndexType = VPGN->getIndexType(); @@ -16990,7 +17055,7 @@ SDValue 
RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case ISD::VP_SCATTER: { - const auto *VPSN = dyn_cast<VPScatterSDNode>(N); + const auto *VPSN = cast<VPScatterSDNode>(N); SDValue Index = VPSN->getIndex(); SDValue ScaleOp = VPSN->getScale(); ISD::MemIndexType IndexType = VPSN->getIndexType(); @@ -21113,14 +21178,13 @@ bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const { // Check integral scalar types. - const bool HasExtMOrZmmul = - Subtarget.hasStdExtM() || Subtarget.hasStdExtZmmul(); + const bool HasZmmul = Subtarget.hasStdExtZmmul(); if (!VT.isScalarInteger()) return false; // Omit the optimization if the sub target has the M extension and the data // size exceeds XLen. - if (HasExtMOrZmmul && VT.getSizeInBits() > Subtarget.getXLen()) + if (HasZmmul && VT.getSizeInBits() > Subtarget.getXLen()) return false; if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) { @@ -21755,7 +21819,7 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const { Op == Instruction::And || Op == Instruction::Or || Op == Instruction::Xor || Op == Instruction::InsertElement || Op == Instruction::ShuffleVector || Op == Instruction::Load || - Op == Instruction::Freeze) + Op == Instruction::Freeze || Op == Instruction::Store) return false; if (Inst.getType()->isScalableTy()) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3b8eb3c88901..7d8bceb5cb34 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -1037,6 +1037,9 @@ private: const APInt &AndMask) const override; unsigned getMinimumJumpTableEntries() const override; + + SDValue emitFlushICache(SelectionDAG &DAG, SDValue InChain, SDValue Start, + SDValue End, SDValue Flags, SDLoc DL) const; }; /// As per the spec, the rules for passing vector arguments are as follows: 
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 82358cdd45ed..101f188374e0 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -48,10 +48,13 @@ static cl::opt<bool> DisableInsertVSETVLPHIOpt( namespace { /// Given a virtual register \p Reg, return the corresponding VNInfo for it. -/// This will return nullptr if the virtual register is an implicit_def. +/// This will return nullptr if the virtual register is an implicit_def or +/// if LiveIntervals is not available. static VNInfo *getVNInfoFromReg(Register Reg, const MachineInstr &MI, const LiveIntervals *LIS) { assert(Reg.isVirtual()); + if (!LIS) + return nullptr; auto &LI = LIS->getInterval(Reg); SlotIndex SI = LIS->getSlotIndexes()->getInstructionIndex(MI); return LI.getVNInfoBefore(SI); @@ -249,6 +252,13 @@ struct DemandedFields { VLZeroness = true; } + static DemandedFields all() { + DemandedFields DF; + DF.demandVTYPE(); + DF.demandVL(); + return DF; + } + // Make this the result of demanding both the fields in this and B. void doUnion(const DemandedFields &B) { VLAny |= B.VLAny; @@ -397,7 +407,9 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) { if (RISCVII::hasSEWOp(TSFlags)) { Res.demandVTYPE(); if (RISCVII::hasVLOp(TSFlags)) - Res.demandVL(); + if (const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI)); + !VLOp.isReg() || !VLOp.isUndef()) + Res.demandVL(); // Behavior is independent of mask policy. if (!RISCVII::usesMaskPolicy(TSFlags)) @@ -503,7 +515,8 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) { /// values of the VL and VTYPE registers after insertion. class VSETVLIInfo { struct AVLDef { - // Every AVLDef should have a VNInfo. + // Every AVLDef should have a VNInfo, unless we're running without + // LiveIntervals in which case this will be nullptr. 
const VNInfo *ValNo; Register DefReg; }; @@ -517,8 +530,7 @@ class VSETVLIInfo { AVLIsReg, AVLIsImm, AVLIsVLMAX, - AVLIsIgnored, - Unknown, + Unknown, // AVL and VTYPE are fully unknown } State = Uninitialized; // Fields from VTYPE. @@ -544,7 +556,7 @@ public: bool isUnknown() const { return State == Unknown; } void setAVLRegDef(const VNInfo *VNInfo, Register AVLReg) { - assert(VNInfo && AVLReg.isVirtual()); + assert(AVLReg.isVirtual()); AVLRegDef.ValNo = VNInfo; AVLRegDef.DefReg = AVLReg; State = AVLIsReg; @@ -557,12 +569,9 @@ public: void setAVLVLMAX() { State = AVLIsVLMAX; } - void setAVLIgnored() { State = AVLIsIgnored; } - bool hasAVLImm() const { return State == AVLIsImm; } bool hasAVLReg() const { return State == AVLIsReg; } bool hasAVLVLMAX() const { return State == AVLIsVLMAX; } - bool hasAVLIgnored() const { return State == AVLIsIgnored; } Register getAVLReg() const { assert(hasAVLReg() && AVLRegDef.DefReg.isVirtual()); return AVLRegDef.DefReg; @@ -577,9 +586,11 @@ public: } // Most AVLIsReg infos will have a single defining MachineInstr, unless it was // a PHI node. In that case getAVLVNInfo()->def will point to the block - // boundary slot. + // boundary slot. If LiveIntervals isn't available, then nullptr is returned. 
const MachineInstr *getAVLDefMI(const LiveIntervals *LIS) const { assert(hasAVLReg()); + if (!LIS) + return nullptr; auto *MI = LIS->getInstructionFromIndex(getAVLVNInfo()->def); assert(!(getAVLVNInfo()->isPHIDef() && MI)); return MI; @@ -593,8 +604,6 @@ public: setAVLRegDef(Info.getAVLVNInfo(), Info.getAVLReg()); else if (Info.hasAVLVLMAX()) setAVLVLMAX(); - else if (Info.hasAVLIgnored()) - setAVLIgnored(); else { assert(Info.hasAVLImm()); setAVLImm(Info.getAVLImm()); @@ -615,8 +624,6 @@ public: } if (hasAVLVLMAX()) return true; - if (hasAVLIgnored()) - return false; return false; } @@ -627,10 +634,15 @@ public: return (hasNonZeroAVL(LIS) && Other.hasNonZeroAVL(LIS)); } - bool hasSameAVL(const VSETVLIInfo &Other) const { - if (hasAVLReg() && Other.hasAVLReg()) + bool hasSameAVLLatticeValue(const VSETVLIInfo &Other) const { + if (hasAVLReg() && Other.hasAVLReg()) { + assert(!getAVLVNInfo() == !Other.getAVLVNInfo() && + "we either have intervals or we don't"); + if (!getAVLVNInfo()) + return getAVLReg() == Other.getAVLReg(); return getAVLVNInfo()->id == Other.getAVLVNInfo()->id && getAVLReg() == Other.getAVLReg(); + } if (hasAVLImm() && Other.hasAVLImm()) return getAVLImm() == Other.getAVLImm(); @@ -638,12 +650,24 @@ public: if (hasAVLVLMAX()) return Other.hasAVLVLMAX() && hasSameVLMAX(Other); - if (hasAVLIgnored()) - return Other.hasAVLIgnored(); - return false; } + // Return true if the two lattice values are guaranteed to have + // the same AVL value at runtime. + bool hasSameAVL(const VSETVLIInfo &Other) const { + // Without LiveIntervals, we don't know which instruction defines a + // register. Since a register may be redefined, this means all AVLIsReg + // states must be treated as possibly distinct. 
+ if (hasAVLReg() && Other.hasAVLReg()) { + assert(!getAVLVNInfo() == !Other.getAVLVNInfo() && + "we either have intervals or we don't"); + if (!getAVLVNInfo()) + return false; + } + return hasSameAVLLatticeValue(Other); + } + void setVTYPE(unsigned VType) { assert(isValid() && !isUnknown() && "Can't set VTYPE for uninitialized or unknown"); @@ -713,14 +737,12 @@ public: const LiveIntervals *LIS) const { assert(isValid() && Require.isValid() && "Can't compare invalid VSETVLIInfos"); - assert(!Require.SEWLMULRatioOnly && - "Expected a valid VTYPE for instruction!"); // Nothing is compatible with Unknown. if (isUnknown() || Require.isUnknown()) return false; // If only our VLMAX ratio is valid, then this isn't compatible. - if (SEWLMULRatioOnly) + if (SEWLMULRatioOnly || Require.SEWLMULRatioOnly) return false; if (Used.VLAny && !(hasSameAVL(Require) && hasSameVLMAX(Require))) @@ -745,7 +767,7 @@ public: if (Other.isUnknown()) return isUnknown(); - if (!hasSameAVL(Other)) + if (!hasSameAVLLatticeValue(Other)) return false; // If the SEWLMULRatioOnly bits are different, then they aren't equal. @@ -811,13 +833,11 @@ public: if (isUnknown()) OS << "unknown"; if (hasAVLReg()) - OS << "AVLReg=" << (unsigned)getAVLReg(); + OS << "AVLReg=" << llvm::printReg(getAVLReg()); if (hasAVLImm()) OS << "AVLImm=" << (unsigned)AVLImm; if (hasAVLVLMAX()) OS << "AVLVLMAX"; - if (hasAVLIgnored()) - OS << "AVLIgnored"; OS << ", " << "VLMul=" << (unsigned)VLMul << ", " << "SEW=" << (unsigned)SEW << ", " @@ -855,6 +875,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass { const RISCVSubtarget *ST; const TargetInstrInfo *TII; MachineRegisterInfo *MRI; + // Possibly null! 
LiveIntervals *LIS; std::vector<BlockData> BlockInfo; @@ -869,9 +890,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<LiveIntervals>(); + AU.addUsedIfAvailable<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); - AU.addRequired<SlotIndexes>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveDebugVariables>(); AU.addPreserved<LiveStacks>(); @@ -933,7 +953,8 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const { if (AVLReg == RISCV::X0) NewInfo.setAVLVLMAX(); else if (MI.getOperand(1).isUndef()) - NewInfo.setAVLIgnored(); + // Otherwise use an AVL of 1 to avoid depending on previous vl. + NewInfo.setAVLImm(1); else { VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS); NewInfo.setAVLRegDef(VNI, AVLReg); @@ -941,6 +962,17 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const { } NewInfo.setVTYPE(MI.getOperand(2).getImm()); + // If AVL is defined by a vsetvli with the same VLMAX, we can replace the + // AVL operand with the AVL of the defining vsetvli. + if (NewInfo.hasAVLReg()) { + if (const MachineInstr *DefMI = NewInfo.getAVLDefMI(LIS); + DefMI && isVectorConfigInstr(*DefMI)) { + VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI); + if (DefInstrInfo.hasSameVLMAX(NewInfo)) + NewInfo.setAVL(DefInstrInfo); + } + } + return NewInfo; } @@ -1009,17 +1041,17 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { else InstrInfo.setAVLImm(Imm); } else if (VLOp.isUndef()) { - InstrInfo.setAVLIgnored(); + // Otherwise use an AVL of 1 to avoid depending on previous vl. + InstrInfo.setAVLImm(1); } else { VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS); InstrInfo.setAVLRegDef(VNI, VLOp.getReg()); } } else { assert(isScalarExtractInstr(MI)); - // TODO: If we are more clever about x0,x0 insertion then we should be able - // to deduce that the VL is ignored based off of DemandedFields, and remove - // the AVLIsIgnored state. 
Then we can just use an arbitrary immediate AVL. - InstrInfo.setAVLIgnored(); + // Pick a random value for state tracking purposes, will be ignored via + // the demanded fields mechanism + InstrInfo.setAVLImm(1); } #ifndef NDEBUG if (std::optional<unsigned> EEW = getEEWForLoadStore(MI)) { @@ -1029,15 +1061,12 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic); // If AVL is defined by a vsetvli with the same VLMAX, we can replace the - // AVL operand with the AVL of the defining vsetvli. We avoid general - // register AVLs to avoid extending live ranges without being sure we can - // kill the original source reg entirely. + // AVL operand with the AVL of the defining vsetvli. if (InstrInfo.hasAVLReg()) { if (const MachineInstr *DefMI = InstrInfo.getAVLDefMI(LIS); DefMI && isVectorConfigInstr(*DefMI)) { VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI); - if (DefInstrInfo.hasSameVLMAX(InstrInfo) && - (DefInstrInfo.hasAVLImm() || DefInstrInfo.hasAVLVLMAX())) + if (DefInstrInfo.hasSameVLMAX(InstrInfo)) InstrInfo.setAVL(DefInstrInfo); } } @@ -1066,7 +1095,8 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, .addReg(RISCV::X0, RegState::Kill) .addImm(Info.encodeVTYPE()) .addReg(RISCV::VL, RegState::Implicit); - LIS->InsertMachineInstrInMaps(*MI); + if (LIS) + LIS->InsertMachineInstrInMaps(*MI); return; } @@ -1083,7 +1113,8 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, .addReg(RISCV::X0, RegState::Kill) .addImm(Info.encodeVTYPE()) .addReg(RISCV::VL, RegState::Implicit); - LIS->InsertMachineInstrInMaps(*MI); + if (LIS) + LIS->InsertMachineInstrInMaps(*MI); return; } } @@ -1095,29 +1126,8 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, .addReg(RISCV::X0, RegState::Define | RegState::Dead) .addImm(Info.getAVLImm()) .addImm(Info.encodeVTYPE()); - LIS->InsertMachineInstrInMaps(*MI); - return; - } - - if (Info.hasAVLIgnored()) { - // We can 
only use x0, x0 if there's no chance of the vtype change causing - // the previous vl to become invalid. - if (PrevInfo.isValid() && !PrevInfo.isUnknown() && - Info.hasSameVLMAX(PrevInfo)) { - auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0)) - .addReg(RISCV::X0, RegState::Define | RegState::Dead) - .addReg(RISCV::X0, RegState::Kill) - .addImm(Info.encodeVTYPE()) - .addReg(RISCV::VL, RegState::Implicit); + if (LIS) LIS->InsertMachineInstrInMaps(*MI); - return; - } - // Otherwise use an AVL of 1 to avoid depending on previous vl. - auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI)) - .addReg(RISCV::X0, RegState::Define | RegState::Dead) - .addImm(1) - .addImm(Info.encodeVTYPE()); - LIS->InsertMachineInstrInMaps(*MI); return; } @@ -1127,8 +1137,10 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, .addReg(DestReg, RegState::Define | RegState::Dead) .addReg(RISCV::X0, RegState::Kill) .addImm(Info.encodeVTYPE()); - LIS->InsertMachineInstrInMaps(*MI); - LIS->createAndComputeVirtRegInterval(DestReg); + if (LIS) { + LIS->InsertMachineInstrInMaps(*MI); + LIS->createAndComputeVirtRegInterval(DestReg); + } return; } @@ -1138,12 +1150,18 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, .addReg(RISCV::X0, RegState::Define | RegState::Dead) .addReg(AVLReg) .addImm(Info.encodeVTYPE()); - LIS->InsertMachineInstrInMaps(*MI); - // Normally the AVL's live range will already extend past the inserted vsetvli - // because the pseudos below will already use the AVL. But this isn't always - // the case, e.g. PseudoVMV_X_S doesn't have an AVL operand. - LIS->getInterval(AVLReg).extendInBlock( - LIS->getMBBStartIdx(&MBB), LIS->getInstructionIndex(*MI).getRegSlot()); + if (LIS) { + LIS->InsertMachineInstrInMaps(*MI); + // Normally the AVL's live range will already extend past the inserted + // vsetvli because the pseudos below will already use the AVL. But this + // isn't always the case, e.g. 
PseudoVMV_X_S doesn't have an AVL operand or + // we've taken the AVL from the VL output of another vsetvli. + LiveInterval &LI = LIS->getInterval(AVLReg); + // Need to get non-const VNInfo + VNInfo *VNI = LI.getValNumInfo(Info.getAVLVNInfo()->id); + LI.addSegment(LiveInterval::Segment( + VNI->def, LIS->getInstructionIndex(*MI).getRegSlot(), VNI)); + } } /// Return true if a VSETVLI is required to transition from CurInfo to Require @@ -1157,19 +1175,6 @@ bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used, if (CurInfo.isCompatible(Used, Require, LIS)) return false; - // We didn't find a compatible value. If our AVL is a virtual register, - // it might be defined by a VSET(I)VLI. If it has the same VLMAX we need - // and the last VL/VTYPE we observed is the same, we don't need a - // VSETVLI here. - if (Require.hasAVLReg() && CurInfo.hasCompatibleVTYPE(Used, Require)) { - if (const MachineInstr *DefMI = Require.getAVLDefMI(LIS); - DefMI && isVectorConfigInstr(*DefMI)) { - VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI); - if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo)) - return false; - } - } - return true; } @@ -1257,10 +1262,14 @@ void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info, if (RISCV::isFaultFirstLoad(MI)) { // Update AVL to vl-output of the fault first load. 
assert(MI.getOperand(1).getReg().isVirtual()); - auto &LI = LIS->getInterval(MI.getOperand(1).getReg()); - SlotIndex SI = LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot(); - VNInfo *VNI = LI.getVNInfoAt(SI); - Info.setAVLRegDef(VNI, MI.getOperand(1).getReg()); + if (LIS) { + auto &LI = LIS->getInterval(MI.getOperand(1).getReg()); + SlotIndex SI = + LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot(); + VNInfo *VNI = LI.getVNInfoAt(SI); + Info.setAVLRegDef(VNI, MI.getOperand(1).getReg()); + } else + Info.setAVLRegDef(nullptr, MI.getOperand(1).getReg()); return; } @@ -1354,6 +1363,9 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require, if (!Require.hasAVLReg()) return true; + if (!LIS) + return true; + // We need the AVL to have been produced by a PHI node in this basic block. const VNInfo *Valno = Require.getAVLVNInfo(); if (!Valno->isPHIDef() || LIS->getMBBFromIndex(Valno->def) != &MBB) @@ -1412,7 +1424,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { uint64_t TSFlags = MI.getDesc().TSFlags; if (RISCVII::hasSEWOp(TSFlags)) { - if (PrevInfo != CurInfo) { + if (!PrevInfo.isCompatible(DemandedFields::all(), CurInfo, LIS)) { // If this is the first implicit state change, and the state change // requested can be proven to produce the same register contents, we // can skip emitting the actual state change and continue as if we @@ -1429,27 +1441,29 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI)); if (VLOp.isReg()) { Register Reg = VLOp.getReg(); - LiveInterval &LI = LIS->getInterval(Reg); // Erase the AVL operand from the instruction. VLOp.setReg(RISCV::NoRegister); VLOp.setIsKill(false); - SmallVector<MachineInstr *> DeadMIs; - LIS->shrinkToUses(&LI, &DeadMIs); - // We might have separate components that need split due to - // needVSETVLIPHI causing us to skip inserting a new VL def. 
- SmallVector<LiveInterval *> SplitLIs; - LIS->splitSeparateComponents(LI, SplitLIs); - - // If the AVL was an immediate > 31, then it would have been emitted - // as an ADDI. However, the ADDI might not have been used in the - // vsetvli, or a vsetvli might not have been emitted, so it may be - // dead now. - for (MachineInstr *DeadMI : DeadMIs) { - if (!TII->isAddImmediate(*DeadMI, Reg)) - continue; - LIS->RemoveMachineInstrFromMaps(*DeadMI); - DeadMI->eraseFromParent(); + if (LIS) { + LiveInterval &LI = LIS->getInterval(Reg); + SmallVector<MachineInstr *> DeadMIs; + LIS->shrinkToUses(&LI, &DeadMIs); + // We might have separate components that need split due to + // needVSETVLIPHI causing us to skip inserting a new VL def. + SmallVector<LiveInterval *> SplitLIs; + LIS->splitSeparateComponents(LI, SplitLIs); + + // If the AVL was an immediate > 31, then it would have been emitted + // as an ADDI. However, the ADDI might not have been used in the + // vsetvli, or a vsetvli might not have been emitted, so it may be + // dead now. + for (MachineInstr *DeadMI : DeadMIs) { + if (!TII->isAddImmediate(*DeadMI, Reg)) + continue; + LIS->RemoveMachineInstrFromMaps(*DeadMI); + DeadMI->eraseFromParent(); + } } } MI.addOperand(MachineOperand::CreateReg(RISCV::VL, /*isDef*/ false, @@ -1506,6 +1520,9 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) { if (!UnavailablePred || !AvailableInfo.isValid()) return; + if (!LIS) + return; + // If we don't know the exact VTYPE, we can't copy the vsetvli to the exit of // the unavailable pred. if (AvailableInfo.hasSEWLMULRatioOnly()) @@ -1529,11 +1546,6 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) { return; } - // If the AVL isn't used in its predecessors then bail, since we have no AVL - // to insert a vsetvli with. - if (AvailableInfo.hasAVLIgnored()) - return; - // Model the effect of changing the input state of the block MBB to // AvailableInfo. We're looking for two issues here; one legality, // one profitability. 
@@ -1657,7 +1669,7 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { // The def of DefReg moved to MI, so extend the LiveInterval up to // it. - if (DefReg.isVirtual()) { + if (DefReg.isVirtual() && LIS) { LiveInterval &DefLI = LIS->getInterval(DefReg); SlotIndex MISlot = LIS->getInstructionIndex(MI).getRegSlot(); VNInfo *DefVNI = DefLI.getVNInfoAt(DefLI.beginIndex()); @@ -1686,13 +1698,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { if (OldVLReg && OldVLReg.isVirtual()) { // NextMI no longer uses OldVLReg so shrink its LiveInterval. - LIS->shrinkToUses(&LIS->getInterval(OldVLReg)); + if (LIS) + LIS->shrinkToUses(&LIS->getInterval(OldVLReg)); MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg); if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) && MRI->use_nodbg_empty(OldVLReg)) { VLOpDef->eraseFromParent(); - LIS->removeInterval(OldVLReg); + if (LIS) + LIS->removeInterval(OldVLReg); } } MI.setDesc(NextMI->getDesc()); @@ -1708,7 +1722,8 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { NumCoalescedVSETVL += ToDelete.size(); for (auto *MI : ToDelete) { - LIS->RemoveMachineInstrFromMaps(*MI); + if (LIS) + LIS->RemoveMachineInstrFromMaps(*MI); MI->eraseFromParent(); } } @@ -1723,12 +1738,14 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) { auto ReadVLMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(RISCV::PseudoReadVL), VLOutput); // Move the LiveInterval's definition down to PseudoReadVL. 
- SlotIndex NewDefSI = - LIS->InsertMachineInstrInMaps(*ReadVLMI).getRegSlot(); - LiveInterval &DefLI = LIS->getInterval(VLOutput); - VNInfo *DefVNI = DefLI.getVNInfoAt(DefLI.beginIndex()); - DefLI.removeSegment(DefLI.beginIndex(), NewDefSI); - DefVNI->def = NewDefSI; + if (LIS) { + SlotIndex NewDefSI = + LIS->InsertMachineInstrInMaps(*ReadVLMI).getRegSlot(); + LiveInterval &DefLI = LIS->getInterval(VLOutput); + VNInfo *DefVNI = DefLI.getVNInfoAt(DefLI.beginIndex()); + DefLI.removeSegment(DefLI.beginIndex(), NewDefSI); + DefVNI->def = NewDefSI; + } } // We don't use the vl output of the VLEFF/VLSEGFF anymore. MI.getOperand(1).setReg(RISCV::X0); @@ -1746,7 +1763,7 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); MRI = &MF.getRegInfo(); - LIS = &getAnalysis<LiveIntervals>(); + LIS = getAnalysisIfAvailable<LiveIntervals>(); assert(BlockInfo.empty() && "Expect empty block infos"); BlockInfo.resize(MF.getNumBlockIDs()); @@ -1795,11 +1812,6 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) emitVSETVLIs(MBB); - // Insert PseudoReadVL after VLEFF/VLSEGFF and replace it with the vl output - // of VLEFF/VLSEGFF. - for (MachineBasicBlock &MBB : MF) - insertReadVL(MBB); - // Now that all vsetvlis are explicit, go through and do block local // DSE and peephole based demanded fields based transforms. Note that // this *must* be done outside the main dataflow so long as we allow @@ -1809,6 +1821,11 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) coalesceVSETVLIs(MBB); + // Insert PseudoReadVL after VLEFF/VLSEGFF and replace it with the vl output + // of VLEFF/VLSEGFF. 
+ for (MachineBasicBlock &MBB : MF) + insertReadVL(MBB); + BlockInfo.clear(); return HaveVectorOp; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 444b9076005c..00eb83da652b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3695,7 +3695,7 @@ void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(ScaledRegister, RegState::Kill) .addReg(DestReg, RegState::Kill) .setMIFlag(Flag); - } else if (STI.hasStdExtM() || STI.hasStdExtZmmul()) { + } else if (STI.hasStdExtZmmul()) { Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass); movImm(MBB, II, DL, N, Amount, Flag); BuildMI(MBB, II, DL, get(RISCV::MUL), DestReg) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 814e0ddf111e..493e1a5fdc74 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -166,25 +166,25 @@ let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in { } } -defm : AMOPat<"atomic_swap_32", "AMOSWAP_W">; -defm : AMOPat<"atomic_load_add_32", "AMOADD_W">; -defm : AMOPat<"atomic_load_and_32", "AMOAND_W">; -defm : AMOPat<"atomic_load_or_32", "AMOOR_W">; -defm : AMOPat<"atomic_load_xor_32", "AMOXOR_W">; -defm : AMOPat<"atomic_load_max_32", "AMOMAX_W">; -defm : AMOPat<"atomic_load_min_32", "AMOMIN_W">; -defm : AMOPat<"atomic_load_umax_32", "AMOMAXU_W">; -defm : AMOPat<"atomic_load_umin_32", "AMOMINU_W">; - -defm : AMOPat<"atomic_swap_64", "AMOSWAP_D", i64, [IsRV64]>; -defm : AMOPat<"atomic_load_add_64", "AMOADD_D", i64, [IsRV64]>; -defm : AMOPat<"atomic_load_and_64", "AMOAND_D", i64, [IsRV64]>; -defm : AMOPat<"atomic_load_or_64", "AMOOR_D", i64, [IsRV64]>; -defm : AMOPat<"atomic_load_xor_64", "AMOXOR_D", i64, [IsRV64]>; -defm : AMOPat<"atomic_load_max_64", "AMOMAX_D", i64, [IsRV64]>; -defm : AMOPat<"atomic_load_min_64", "AMOMIN_D", i64, [IsRV64]>; -defm : 
AMOPat<"atomic_load_umax_64", "AMOMAXU_D", i64, [IsRV64]>; -defm : AMOPat<"atomic_load_umin_64", "AMOMINU_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_swap_i32", "AMOSWAP_W">; +defm : AMOPat<"atomic_load_add_i32", "AMOADD_W">; +defm : AMOPat<"atomic_load_and_i32", "AMOAND_W">; +defm : AMOPat<"atomic_load_or_i32", "AMOOR_W">; +defm : AMOPat<"atomic_load_xor_i32", "AMOXOR_W">; +defm : AMOPat<"atomic_load_max_i32", "AMOMAX_W">; +defm : AMOPat<"atomic_load_min_i32", "AMOMIN_W">; +defm : AMOPat<"atomic_load_umax_i32", "AMOMAXU_W">; +defm : AMOPat<"atomic_load_umin_i32", "AMOMINU_W">; + +defm : AMOPat<"atomic_swap_i64", "AMOSWAP_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_add_i64", "AMOADD_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_and_i64", "AMOAND_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_or_i64", "AMOOR_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_xor_i64", "AMOXOR_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_max_i64", "AMOMAX_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_min_i64", "AMOMIN_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_umax_i64", "AMOMAXU_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_umin_i64", "AMOMINU_D", i64, [IsRV64]>; /// Pseudo AMOs @@ -243,15 +243,15 @@ let Size = 20 in def PseudoAtomicLoadNand32 : PseudoAMO; // Ordering constants must be kept in sync with the AtomicOrdering enum in // AtomicOrdering.h. 
-def : Pat<(XLenVT (atomic_load_nand_32_monotonic GPR:$addr, GPR:$incr)), +def : Pat<(XLenVT (atomic_load_nand_i32_monotonic GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 2)>; -def : Pat<(XLenVT (atomic_load_nand_32_acquire GPR:$addr, GPR:$incr)), +def : Pat<(XLenVT (atomic_load_nand_i32_acquire GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 4)>; -def : Pat<(XLenVT (atomic_load_nand_32_release GPR:$addr, GPR:$incr)), +def : Pat<(XLenVT (atomic_load_nand_i32_release GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 5)>; -def : Pat<(XLenVT (atomic_load_nand_32_acq_rel GPR:$addr, GPR:$incr)), +def : Pat<(XLenVT (atomic_load_nand_i32_acq_rel GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 6)>; -def : Pat<(XLenVT (atomic_load_nand_32_seq_cst GPR:$addr, GPR:$incr)), +def : Pat<(XLenVT (atomic_load_nand_i32_seq_cst GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand32 GPR:$addr, GPR:$incr, 7)>; let Size = 28 in @@ -294,15 +294,15 @@ let Size = 20 in def PseudoAtomicLoadNand64 : PseudoAMO; // Ordering constants must be kept in sync with the AtomicOrdering enum in // AtomicOrdering.h. 
-def : Pat<(i64 (atomic_load_nand_64_monotonic GPR:$addr, GPR:$incr)), +def : Pat<(i64 (atomic_load_nand_i64_monotonic GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 2)>; -def : Pat<(i64 (atomic_load_nand_64_acquire GPR:$addr, GPR:$incr)), +def : Pat<(i64 (atomic_load_nand_i64_acquire GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 4)>; -def : Pat<(i64 (atomic_load_nand_64_release GPR:$addr, GPR:$incr)), +def : Pat<(i64 (atomic_load_nand_i64_release GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 5)>; -def : Pat<(i64 (atomic_load_nand_64_acq_rel GPR:$addr, GPR:$incr)), +def : Pat<(i64 (atomic_load_nand_i64_acq_rel GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 6)>; -def : Pat<(i64 (atomic_load_nand_64_seq_cst GPR:$addr, GPR:$incr)), +def : Pat<(i64 (atomic_load_nand_i64_seq_cst GPR:$addr, GPR:$incr)), (PseudoAtomicLoadNand64 GPR:$addr, GPR:$incr, 7)>; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_xchg_i64, @@ -354,12 +354,12 @@ multiclass PseudoCmpXchgPat<string Op, Pseudo CmpXchgInst, let Predicates = [HasStdExtA, NoStdExtZacas] in { def PseudoCmpXchg32 : PseudoCmpXchg; -defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>; +defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32>; } let Predicates = [HasStdExtA, NoStdExtZacas, IsRV64] in { def PseudoCmpXchg64 : PseudoCmpXchg; -defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64, i64>; +defm : PseudoCmpXchgPat<"atomic_cmp_swap_i64", PseudoCmpXchg64, i64>; } let Predicates = [HasStdExtA] in { @@ -422,18 +422,18 @@ let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in { } } -defm : AMOPat2<"atomic_swap_32", "AMOSWAP_W", i32>; -defm : AMOPat2<"atomic_load_add_32", "AMOADD_W", i32>; -defm : AMOPat2<"atomic_load_and_32", "AMOAND_W", i32>; -defm : AMOPat2<"atomic_load_or_32", "AMOOR_W", i32>; -defm : AMOPat2<"atomic_load_xor_32", "AMOXOR_W", i32>; -defm : 
AMOPat2<"atomic_load_max_32", "AMOMAX_W", i32>; -defm : AMOPat2<"atomic_load_min_32", "AMOMIN_W", i32>; -defm : AMOPat2<"atomic_load_umax_32", "AMOMAXU_W", i32>; -defm : AMOPat2<"atomic_load_umin_32", "AMOMINU_W", i32>; +defm : AMOPat2<"atomic_swap_i32", "AMOSWAP_W", i32>; +defm : AMOPat2<"atomic_load_add_i32", "AMOADD_W", i32>; +defm : AMOPat2<"atomic_load_and_i32", "AMOAND_W", i32>; +defm : AMOPat2<"atomic_load_or_i32", "AMOOR_W", i32>; +defm : AMOPat2<"atomic_load_xor_i32", "AMOXOR_W", i32>; +defm : AMOPat2<"atomic_load_max_i32", "AMOMAX_W", i32>; +defm : AMOPat2<"atomic_load_min_i32", "AMOMIN_W", i32>; +defm : AMOPat2<"atomic_load_umax_i32", "AMOMAXU_W", i32>; +defm : AMOPat2<"atomic_load_umin_i32", "AMOMINU_W", i32>; let Predicates = [HasStdExtA, IsRV64] in -defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32, i32>; +defm : PseudoCmpXchgPat<"atomic_cmp_swap_i32", PseudoCmpXchg32, i32>; let Predicates = [HasAtomicLdSt] in { def : LdPat<atomic_load_8, LB, i32>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index 8ea1560e5b37..8a2b32081dc5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -24,7 +24,7 @@ def riscv_remuw : SDNode<"RISCVISD::REMUW", SDT_RISCVIntBinOpW>; // Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtMOrZmmul] in { +let Predicates = [HasStdExtZmmul] in { def MUL : ALU_rr<0b0000001, 0b000, "mul", Commutable=1>, Sched<[WriteIMul, ReadIMul, ReadIMul]>; def MULH : ALU_rr<0b0000001, 0b001, "mulh", Commutable=1>, @@ -33,7 +33,7 @@ def MULHSU : ALU_rr<0b0000001, 0b010, "mulhsu">, Sched<[WriteIMul, ReadIMul, ReadIMul]>; def MULHU : ALU_rr<0b0000001, 0b011, "mulhu", Commutable=1>, Sched<[WriteIMul, ReadIMul, ReadIMul]>; -} // Predicates = [HasStdExtMOrZmmul] +} // Predicates = [HasStdExtZmmul] let Predicates = [HasStdExtM] in { def DIV : ALU_rr<0b0000001, 0b100, "div">, 
@@ -46,10 +46,10 @@ def REMU : ALU_rr<0b0000001, 0b111, "remu">, Sched<[WriteIRem, ReadIRem, ReadIRem]>; } // Predicates = [HasStdExtM] -let Predicates = [HasStdExtMOrZmmul, IsRV64], IsSignExtendingOpW = 1 in { +let Predicates = [HasStdExtZmmul, IsRV64], IsSignExtendingOpW = 1 in { def MULW : ALUW_rr<0b0000001, 0b000, "mulw", Commutable=1>, Sched<[WriteIMul32, ReadIMul32, ReadIMul32]>; -} // Predicates = [HasStdExtMOrZmmul, IsRV64] +} // Predicates = [HasStdExtZmmul, IsRV64] let Predicates = [HasStdExtM, IsRV64], IsSignExtendingOpW = 1 in { def DIVW : ALUW_rr<0b0000001, 0b100, "divw">, @@ -66,12 +66,12 @@ def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">, // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtMOrZmmul] in { +let Predicates = [HasStdExtZmmul] in { def : PatGprGpr<mul, MUL>; def : PatGprGpr<mulhs, MULH>; def : PatGprGpr<mulhu, MULHU>; def : PatGprGpr<riscv_mulhsu, MULHSU>; -} // Predicates = [HasStdExtMOrZmmul] +} // Predicates = [HasStdExtZmmul] let Predicates = [HasStdExtM] in { def : PatGprGpr<sdiv, DIV>; @@ -81,7 +81,7 @@ def : PatGprGpr<urem, REMU>; } // Predicates = [HasStdExtM] // Select W instructions if only the lower 32-bits of the result are used. -let Predicates = [HasStdExtMOrZmmul, IsRV64] in +let Predicates = [HasStdExtZmmul, IsRV64] in def : PatGprGpr<binop_allwusers<mul>, MULW>; let Predicates = [HasStdExtM, IsRV64] in { @@ -106,20 +106,20 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))), (REMW GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtM, IsRV64] -let Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] in { +let Predicates = [HasStdExtZmmul, IsRV64, NotHasStdExtZba] in { // Special case for calculating the full 64-bit product of a 32x32 unsigned // multiply where the inputs aren't known to be zero extended. We can shift the // inputs left by 32 and use a MULHU. 
This saves two SRLIs needed to finish // zeroing the upper 32 bits. def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))), (MULHU (i64 (SLLI GPR:$rs1, 32)), (i64 (SLLI GPR:$rs2, 32)))>; -} // Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] +} // Predicates = [HasStdExtZmmul, IsRV64, NotHasStdExtZba] //===----------------------------------------------------------------------===// // Experimental RV64 i32 legalization patterns. //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtMOrZmmul, IsRV64] in { +let Predicates = [HasStdExtZmmul, IsRV64] in { def : PatGprGpr<mul, MULW, i32, i32>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 7a95ebad364c..45a57d117081 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -67,11 +67,6 @@ /// that terminology in code frequently refers to these as "TA" which is /// confusing. We're in the process of migrating away from this /// representation. -/// * _TU w/o policy operand -- Has a passthrough operand, and always -/// represents the tail undisturbed state. -/// * _TU w/policy operand - Can represent all three policy states. If -/// passthrough is IMPLICIT_DEF (or NoReg), then represents "undefined". -/// Otherwise, policy operand and tablegen flags drive the interpretation. /// //===----------------------------------------------------------------------===// @@ -387,17 +382,6 @@ class GetIntVTypeInfo<VTypeInfo vti> { !cast<string>(vti)))); } -// This functor is used to obtain the fp vector type that has the same SEW and -// multiplier as the input parameter type. -class GetFpVTypeInfo<VTypeInfo vti> { - // Equivalent integer vector type. Eg. 
- // VF16M1 → VF16M1 (identity) - // VBF16M1 → VF16M1 - VTypeInfo Vti = !cast<VTypeInfo>(!subst("VBF", "VF", - !subst("VI", "VF", - !cast<string>(vti)))); -} - class MTypeInfo<ValueType Mas, LMULInfo M, string Bx> { ValueType Mask = Mas; // {SEW, VLMul} values set a valid VType to deal with this mask type. @@ -774,11 +758,6 @@ class GetVTypePredicates<VTypeInfo vti> { true : [HasVInstructions]); } -class GetVTypeScalarPredicates<VTypeInfo vti> { - list<Predicate> Predicates = !cond(!eq(vti.Scalar, bf16) : [HasStdExtZfbfmin], - true : []); -} - class VPseudoUSLoadNoMask<VReg RetClass, int EEW> : Pseudo<(outs RetClass:$rd), @@ -1373,23 +1352,6 @@ class VPseudoIStoreMask<VReg StClass, VReg IdxClass, int EEW, bits<3> LMUL, let HasSEWOp = 1; } -class VPseudoBinaryMask<VReg RetClass, - RegisterClass Op1Class, - DAGOperand Op2Class, - string Constraint> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), - (ins GetVRegNoV0<RetClass>.R:$merge, - Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>, - RISCVVPseudo { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let Constraints = !interleave([Constraint, "$rd = $merge"], ","); - let HasVLOp = 1; - let HasSEWOp = 1; -} - class VPseudoBinaryMaskPolicy<VReg RetClass, RegisterClass Op1Class, DAGOperand Op2Class, @@ -1449,7 +1411,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass, let UsesVXRM = 0; } -// Like VPseudoBinaryMask, but output can be V0. +// Like VPseudoBinaryMaskPolicy, but output can be V0 and there is no policy. class VPseudoBinaryMOutMask<VReg RetClass, RegisterClass Op1Class, DAGOperand Op2Class, @@ -1470,8 +1432,8 @@ class VPseudoBinaryMOutMask<VReg RetClass, let UsesMaskPolicy = 1; } -// Special version of VPseudoBinaryMask where we pretend the first source is -// tied to the destination so we can workaround the earlyclobber constraint. 
+// Special version of VPseudoBinaryMaskPolicy where we pretend the first source +// is tied to the destination so we can workaround the earlyclobber constraint. // This allows maskedoff and rs2 to be the same register. class VPseudoTiedBinaryMask<VReg RetClass, DAGOperand Op2Class, @@ -4214,16 +4176,16 @@ class VPatBinaryMaskPolicy<string intrinsic_name, (op2_type op2_kind:$rs2), (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>; -class VPatBinaryMaskTARoundingMode<string intrinsic_name, - string inst, - ValueType result_type, - ValueType op1_type, - ValueType op2_type, - ValueType mask_type, - int sew, - VReg result_reg_class, - VReg op1_reg_class, - DAGOperand op2_kind> : +class VPatBinaryMaskPolicyRoundingMode<string intrinsic_name, + string inst, + ValueType result_type, + ValueType op1_type, + ValueType op2_type, + ValueType mask_type, + int sew, + VReg result_reg_class, + VReg op1_reg_class, + DAGOperand op2_kind> : Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask") (result_type result_reg_class:$merge), (op1_type op1_reg_class:$rs1), @@ -4375,28 +4337,6 @@ class VPatTiedBinaryMaskRoundingMode<string intrinsic_name, (XLenVT timm:$round), GPR:$vl, sew, (XLenVT timm:$policy))>; -class VPatTernaryNoMask<string intrinsic, - string inst, - string kind, - ValueType result_type, - ValueType op1_type, - ValueType op2_type, - int sew, - LMULInfo vlmul, - VReg result_reg_class, - RegisterClass op1_reg_class, - DAGOperand op2_kind> : - Pat<(result_type (!cast<Intrinsic>(intrinsic) - (result_type result_reg_class:$rs3), - (op1_type op1_reg_class:$rs1), - (op2_type op2_kind:$rs2), - VLOpFrag)), - (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX) - result_reg_class:$rs3, - (op1_type op1_reg_class:$rs1), - op2_kind:$rs2, - GPR:$vl, sew)>; - class VPatTernaryNoMaskTU<string intrinsic, string inst, string kind, @@ -4492,31 +4432,6 @@ class VPatTernaryNoMaskWithPolicyRoundingMode<string intrinsic, (XLenVT timm:$round), GPR:$vl, log2sew, (XLenVT timm:$policy))>; 
-class VPatTernaryMask<string intrinsic, - string inst, - string kind, - ValueType result_type, - ValueType op1_type, - ValueType op2_type, - ValueType mask_type, - int sew, - LMULInfo vlmul, - VReg result_reg_class, - RegisterClass op1_reg_class, - DAGOperand op2_kind> : - Pat<(result_type (!cast<Intrinsic>(intrinsic#"_mask") - (result_type result_reg_class:$rs3), - (op1_type op1_reg_class:$rs1), - (op2_type op2_kind:$rs2), - (mask_type V0), - VLOpFrag)), - (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK") - result_reg_class:$rs3, - (op1_type op1_reg_class:$rs1), - op2_kind:$rs2, - (mask_type V0), - GPR:$vl, sew)>; - class VPatTernaryMaskPolicy<string intrinsic, string inst, string kind, @@ -4784,9 +4699,9 @@ multiclass VPatBinaryRoundingMode<string intrinsic, DAGOperand op2_kind> { def : VPatBinaryNoMaskTURoundingMode<intrinsic, inst, result_type, op1_type, op2_type, sew, result_reg_class, op1_reg_class, op2_kind>; - def : VPatBinaryMaskTARoundingMode<intrinsic, inst, result_type, op1_type, op2_type, - mask_type, sew, result_reg_class, op1_reg_class, - op2_kind>; + def : VPatBinaryMaskPolicyRoundingMode<intrinsic, inst, result_type, op1_type, op2_type, + mask_type, sew, result_reg_class, op1_reg_class, + op2_kind>; } multiclass VPatBinaryMSwapped<string intrinsic, @@ -5158,10 +5073,10 @@ multiclass VPatBinaryW_WV_RM<string intrinsic, string instruction, Wti.Vector, Vti.Vector, Vti.Mask, Vti.Log2SEW, Wti.RegClass, Vti.RegClass>; } - def : VPatBinaryMaskTARoundingMode<intrinsic, name, - Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask, - Vti.Log2SEW, Wti.RegClass, - Wti.RegClass, Vti.RegClass>; + def : VPatBinaryMaskPolicyRoundingMode<intrinsic, name, + Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask, + Vti.Log2SEW, Wti.RegClass, + Wti.RegClass, Vti.RegClass>; } } } @@ -5513,46 +5428,6 @@ multiclass VPatBinaryM_V_X<string intrinsic, string instruction> : VPatBinaryV_V<intrinsic, instruction>, VPatBinaryV_X<intrinsic, instruction>; -multiclass VPatTernary<string 
intrinsic, - string inst, - string kind, - ValueType result_type, - ValueType op1_type, - ValueType op2_type, - ValueType mask_type, - int sew, - LMULInfo vlmul, - VReg result_reg_class, - RegisterClass op1_reg_class, - DAGOperand op2_kind> { - def : VPatTernaryNoMask<intrinsic, inst, kind, result_type, op1_type, op2_type, - sew, vlmul, result_reg_class, op1_reg_class, - op2_kind>; - def : VPatTernaryMask<intrinsic, inst, kind, result_type, op1_type, op2_type, - mask_type, sew, vlmul, result_reg_class, op1_reg_class, - op2_kind>; -} - -multiclass VPatTernaryNoMaskNoPolicy<string intrinsic, - string inst, - string kind, - ValueType result_type, - ValueType op1_type, - ValueType op2_type, - ValueType mask_type, - int sew, - LMULInfo vlmul, - VReg result_reg_class, - RegisterClass op1_reg_class, - DAGOperand op2_kind> { - def : VPatTernaryNoMask<intrinsic, inst, kind, result_type, op1_type, op2_type, - sew, vlmul, result_reg_class, op1_reg_class, - op2_kind>; - def : VPatTernaryMaskPolicy<intrinsic, inst, kind, result_type, op1_type, op2_type, - mask_type, sew, vlmul, result_reg_class, op1_reg_class, - op2_kind>; -} - multiclass VPatTernaryWithPolicy<string intrinsic, string inst, string kind, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 497c4aadf753..e82625f085be 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -1405,16 +1405,23 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { fvti.AVL, fvti.Log2SEW)>; def : Pat<(fvti.Vector (vselect (fvti.Mask V0), + (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), + fvti.RegClass:$rs2)), + (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) + (fvti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (vselect (fvti.Mask V0), (SplatFPOp (fvti.Scalar fpimm0)), fvti.RegClass:$rs2)), 
(!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) (fvti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs2, 0, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>; - } +} - let Predicates = !listconcat(GetVTypePredicates<GetFpVTypeInfo<fvti>.Vti>.Predicates, - GetVTypeScalarPredicates<fvti>.Predicates) in +foreach fvti = AllFloatVectors in { + let Predicates = GetVTypePredicates<fvti>.Predicates in def : Pat<(fvti.Vector (vselect (fvti.Mask V0), (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rs2)), @@ -1473,26 +1480,6 @@ foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { } //===----------------------------------------------------------------------===// -// Vector Splats -//===----------------------------------------------------------------------===// - -foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { - let Predicates = !listconcat(GetVTypePredicates<GetFpVTypeInfo<fvti>.Vti>.Predicates, - GetVTypeScalarPredicates<fvti>.Predicates) in - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl undef, fvti.ScalarRegClass:$rs1, srcvalue)), - (!cast<Instruction>("PseudoVFMV_V_"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - (fvti.Scalar fvti.ScalarRegClass:$rs1), - fvti.AVL, fvti.Log2SEW, TA_MA)>; - defvar ivti = GetIntVTypeInfo<fvti>.Vti; - let Predicates = GetVTypePredicates<ivti>.Predicates in - def : Pat<(fvti.Vector (SplatFPOp (fvti.Scalar fpimm0))), - (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - 0, fvti.AVL, fvti.Log2SEW, TA_MA)>; -} - -//===----------------------------------------------------------------------===// // Vector Element Extracts //===----------------------------------------------------------------------===// foreach vti = AllFloatVectors in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 70d8265e7be4..a7945f2ee6c1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2638,9 +2638,10 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; } +} - let Predicates = !listconcat(GetVTypePredicates<GetFpVTypeInfo<fvti>.Vti>.Predicates, - GetVTypeScalarPredicates<fvti>.Predicates) in { +foreach fvti = AllFloatVectors in { + let Predicates = GetVTypePredicates<fvti>.Predicates in { def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rs2, @@ -2654,8 +2655,8 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { } foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { - let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, - GetVTypeScalarPredicates<fvti>.Predicates) in { + defvar ivti = GetIntVTypeInfo<fvti>.Vti; + let Predicates = GetVTypePredicates<ivti>.Predicates in { // 13.16. Vector Floating-Point Move Instruction // If we're splatting fpimm0, use vmv.v.x vd, x0. 
def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl @@ -2666,7 +2667,11 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { fvti.Vector:$passthru, (fvti.Scalar (SelectFPImm (XLenVT GPR:$imm))), VLOpFrag)), (!cast<Instruction>("PseudoVMV_V_X_"#fvti.LMul.MX) $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>; + } +} +foreach fvti = AllFloatVectors in { + let Predicates = GetVTypePredicates<fvti>.Predicates in { def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), (!cast<Instruction>("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td index 0cd41cac218f..1ee78359bc4a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td @@ -116,8 +116,8 @@ multiclass AMOCASPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT, } // Predicates = !listconcat([HasStdExtZacas, HasStdExtZtso], ExtraPreds) } -defm : AMOCASPat<"atomic_cmp_swap_32", "AMOCAS_W">; -defm : AMOCASPat<"atomic_cmp_swap_64", "AMOCAS_D_RV64", i64, [IsRV64]>; +defm : AMOCASPat<"atomic_cmp_swap_i32", "AMOCAS_W">; +defm : AMOCASPat<"atomic_cmp_swap_i64", "AMOCAS_D_RV64", i64, [IsRV64]>; //===----------------------------------------------------------------------===// // Zawrs (Wait-on-Reservation-Set) @@ -188,27 +188,27 @@ defm AMOCAS_H : AMO_cas_aq_rl<0b00101, 0b001, "amocas.h", GPR>; /// AMOs -defm : AMOPat<"atomic_swap_8", "AMOSWAP_B", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_add_8", "AMOADD_B", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_and_8", "AMOAND_B", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_or_8", "AMOOR_B", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_xor_8", "AMOXOR_B", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_max_8", "AMOMAX_B", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_min_8", "AMOMIN_B", XLenVT, [HasStdExtZabha]>; 
-defm : AMOPat<"atomic_load_umax_8", "AMOMAXU_B", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_umin_8", "AMOMINU_B", XLenVT, [HasStdExtZabha]>; - -defm : AMOPat<"atomic_swap_16", "AMOSWAP_H", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_add_16", "AMOADD_H", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_and_16", "AMOAND_H", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_or_16", "AMOOR_H", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_xor_16", "AMOXOR_H", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_max_16", "AMOMAX_H", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_min_16", "AMOMIN_H", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_umax_16", "AMOMAXU_H", XLenVT, [HasStdExtZabha]>; -defm : AMOPat<"atomic_load_umin_16", "AMOMINU_H", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_swap_i8", "AMOSWAP_B", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_add_i8", "AMOADD_B", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_and_i8", "AMOAND_B", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_or_i8", "AMOOR_B", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_xor_i8", "AMOXOR_B", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_max_i8", "AMOMAX_B", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_min_i8", "AMOMIN_B", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_umax_i8", "AMOMAXU_B", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_umin_i8", "AMOMINU_B", XLenVT, [HasStdExtZabha]>; + +defm : AMOPat<"atomic_swap_i16", "AMOSWAP_H", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_add_i16", "AMOADD_H", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_and_i16", "AMOAND_H", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_or_i16", "AMOOR_H", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_xor_i16", "AMOXOR_H", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_max_i16", "AMOMAX_H", XLenVT, [HasStdExtZabha]>; +defm : 
AMOPat<"atomic_load_min_i16", "AMOMIN_H", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_umax_i16", "AMOMAXU_H", XLenVT, [HasStdExtZabha]>; +defm : AMOPat<"atomic_load_umin_i16", "AMOMINU_H", XLenVT, [HasStdExtZabha]>; /// AMOCAS -defm : AMOCASPat<"atomic_cmp_swap_8", "AMOCAS_B", XLenVT, [HasStdExtZabha]>; -defm : AMOCASPat<"atomic_cmp_swap_16", "AMOCAS_H", XLenVT, [HasStdExtZabha]>; +defm : AMOCASPat<"atomic_cmp_swap_i8", "AMOCAS_B", XLenVT, [HasStdExtZabha]>; +defm : AMOCASPat<"atomic_cmp_swap_i16", "AMOCAS_H", XLenVT, [HasStdExtZabha]>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td index 2a4448d7881f..11c2695a5985 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td @@ -183,7 +183,7 @@ let Predicates = [HasStdExtZcb] in def C_ZEXT_B : RVZcArith_r<0b11000 , "c.zext.b">, Sched<[WriteIALU, ReadIALU]>; -let Predicates = [HasStdExtZcb, HasStdExtMOrZmmul] in +let Predicates = [HasStdExtZcb, HasStdExtZmmul] in def C_MUL : CA_ALU<0b100111, 0b10, "c.mul", GPRC>, Sched<[WriteIMul, ReadIMul, ReadIMul]>; @@ -270,13 +270,13 @@ def CM_JALT : RVInst16CJ<0b101, 0b10, (outs), (ins uimm8ge32:$index), } // DecoderNamespace = "RVZcmt", Predicates = [HasStdExtZcmt]... 
-let Predicates = [HasStdExtZcb, HasStdExtMOrZmmul] in{ +let Predicates = [HasStdExtZcb, HasStdExtZmmul] in{ def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs1, GPRC:$rs2), (C_MUL GPRC:$rs1, GPRC:$rs2)>; let isCompressOnly = true in def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs2, GPRC:$rs1), (C_MUL GPRC:$rs1, GPRC:$rs2)>; -} // Predicates = [HasStdExtZcb, HasStdExtMOrZmmul] +} // Predicates = [HasStdExtZcb, HasStdExtZmmul] let Predicates = [HasStdExtZcb, HasStdExtZbb] in{ def : CompressPat<(SEXT_B GPRC:$rs1, GPRC:$rs1), diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h index fcc20c17c6b4..779c652b4d8f 100644 --- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h +++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h @@ -135,7 +135,7 @@ public: bool isPushable(const MachineFunction &MF) const { // We cannot use fixed locations for the callee saved spill slots if the // function uses a varargs save area. - // TODO: Use a seperate placement for vararg registers to enable Zcmp. + // TODO: Use a separate placement for vararg registers to enable Zcmp. 
return MF.getSubtarget<RISCVSubtarget>().hasStdExtZcmp() && !MF.getTarget().Options.DisableFramePointerElim(MF) && VarArgsSaveSize == 0; diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 6ebf9f1eb045..13a2491116b5 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -326,6 +326,27 @@ def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max", FeatureStdExtC], [TuneNoDefaultUnroll]>; +def SYNTACORE_SCR3_RV32 : RISCVProcessorModel<"syntacore-scr3-rv32", + SyntacoreSCR3RV32Model, + [Feature32Bit, + FeatureStdExtI, + FeatureStdExtZicsr, + FeatureStdExtZifencei, + FeatureStdExtM, + FeatureStdExtC], + [TuneNoDefaultUnroll, FeaturePostRAScheduler]>; + +def SYNTACORE_SCR3_RV64 : RISCVProcessorModel<"syntacore-scr3-rv64", + SyntacoreSCR3RV64Model, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtZicsr, + FeatureStdExtZifencei, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtC], + [TuneNoDefaultUnroll, FeaturePostRAScheduler]>; + def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", NoSchedModel, [Feature64Bit, @@ -381,3 +402,19 @@ def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu", TuneZExtHFusion, TuneZExtWFusion, TuneShiftedZExtWFusion]>; + +def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", + NoSchedModel, + !listconcat(RVA22S64Features, + [FeatureStdExtV, + FeatureStdExtSscofpmf, + FeatureStdExtSstc, + FeatureStdExtSvnapot, + FeatureStdExtZbc, + FeatureStdExtZbkc, + FeatureStdExtZfh, + FeatureStdExtZicond, + FeatureStdExtZvfh, + FeatureStdExtZvkt, + FeatureStdExtZvl256b]), + [TuneDLenFactor2]>; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index caa5dbc15f8b..760d12103c36 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -104,14 +104,17 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 
BitVector Reserved(getNumRegs()); auto &Subtarget = MF.getSubtarget<RISCVSubtarget>(); - // Mark any registers requested to be reserved as such for (size_t Reg = 0; Reg < getNumRegs(); Reg++) { + // Mark any GPRs requested to be reserved as such if (Subtarget.isRegisterReservedByUser(Reg)) markSuperRegs(Reserved, Reg); + + // Mark all the registers defined as constant in TableGen as reserved. + if (isConstantPhysReg(Reg)) + markSuperRegs(Reserved, Reg); } // Use markSuperRegs to ensure any register aliases are also reserved - markSuperRegs(Reserved, RISCV::X0); // zero markSuperRegs(Reserved, RISCV::X2); // sp markSuperRegs(Reserved, RISCV::X3); // gp markSuperRegs(Reserved, RISCV::X4); // tp @@ -136,7 +139,6 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, RISCV::VTYPE); markSuperRegs(Reserved, RISCV::VXSAT); markSuperRegs(Reserved, RISCV::VXRM); - markSuperRegs(Reserved, RISCV::VLENB); // vlenb (constant) // Floating point environment registers. 
markSuperRegs(Reserved, RISCV::FRM); diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td index 31112d140cde..dc20fdcea4d7 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td @@ -76,58 +76,6 @@ def : WriteRes<WriteLDW, [SCR1_LSU]>; def : WriteRes<WriteLDD, [SCR1_LSU]>; } -let Unsupported = true in { -// Atomic memory -def : WriteRes<WriteAtomicW, [SCR1_LSU]>; -def : WriteRes<WriteAtomicD, [SCR1_LSU]>; -def : WriteRes<WriteAtomicLDW, [SCR1_LSU]>; -def : WriteRes<WriteAtomicLDD, [SCR1_LSU]>; -def : WriteRes<WriteAtomicSTW, [SCR1_LSU]>; -def : WriteRes<WriteAtomicSTD, [SCR1_LSU]>; - -// FP load/store -def : WriteRes<WriteFST32, [SCR1_LSU]>; -def : WriteRes<WriteFST64, [SCR1_LSU]>; -def : WriteRes<WriteFLD32, [SCR1_LSU]>; -def : WriteRes<WriteFLD64, [SCR1_LSU]>; - -// FP instructions -def : WriteRes<WriteFAdd32, []>; -def : WriteRes<WriteFSGNJ32, []>; -def : WriteRes<WriteFMinMax32, []>; -def : WriteRes<WriteFAdd64, []>; -def : WriteRes<WriteFSGNJ64, []>; -def : WriteRes<WriteFMinMax64, []>; -def : WriteRes<WriteFCvtI32ToF32, []>; -def : WriteRes<WriteFCvtI32ToF64, []>; -def : WriteRes<WriteFCvtI64ToF32, []>; -def : WriteRes<WriteFCvtI64ToF64, []>; -def : WriteRes<WriteFCvtF32ToI32, []>; -def : WriteRes<WriteFCvtF32ToI64, []>; -def : WriteRes<WriteFCvtF64ToI32, []>; -def : WriteRes<WriteFCvtF64ToI64, []>; -def : WriteRes<WriteFCvtF32ToF64, []>; -def : WriteRes<WriteFCvtF64ToF32, []>; -def : WriteRes<WriteFClass32, []>; -def : WriteRes<WriteFClass64, []>; -def : WriteRes<WriteFCmp32, []>; -def : WriteRes<WriteFCmp64, []>; -def : WriteRes<WriteFMovF32ToI32, []>; -def : WriteRes<WriteFMovI32ToF32, []>; -def : WriteRes<WriteFMovF64ToI64, []>; -def : WriteRes<WriteFMovI64ToF64, []>; -def : WriteRes<WriteFMul32, []>; -def : WriteRes<WriteFMA32, []>; -def : WriteRes<WriteFMul64, []>; -def : WriteRes<WriteFMA64, []>; -def : WriteRes<WriteFDiv32, []>; 
-def : WriteRes<WriteFDiv64, []>; -def : WriteRes<WriteFSqrt32, []>; -def : WriteRes<WriteFSqrt64, []>; - -def : WriteRes<WriteSFB, []>; -} - // Others def : WriteRes<WriteCSR, []>; def : WriteRes<WriteNop, []>; @@ -153,55 +101,13 @@ def : ReadAdvance<ReadIRem, 0>; def : ReadAdvance<ReadIRem32, 0>; def : ReadAdvance<ReadIMul, 0>; def : ReadAdvance<ReadIMul32, 0>; -def : ReadAdvance<ReadAtomicWA, 0>; -def : ReadAdvance<ReadAtomicWD, 0>; -def : ReadAdvance<ReadAtomicDA, 0>; -def : ReadAdvance<ReadAtomicDD, 0>; -def : ReadAdvance<ReadAtomicLDW, 0>; -def : ReadAdvance<ReadAtomicLDD, 0>; -def : ReadAdvance<ReadAtomicSTW, 0>; -def : ReadAdvance<ReadAtomicSTD, 0>; -def : ReadAdvance<ReadFStoreData, 0>; -def : ReadAdvance<ReadFMemBase, 0>; -def : ReadAdvance<ReadFAdd32, 0>; -def : ReadAdvance<ReadFAdd64, 0>; -def : ReadAdvance<ReadFMul32, 0>; -def : ReadAdvance<ReadFMul64, 0>; -def : ReadAdvance<ReadFMA32, 0>; -def : ReadAdvance<ReadFMA32Addend, 0>; -def : ReadAdvance<ReadFMA64, 0>; -def : ReadAdvance<ReadFMA64Addend, 0>; -def : ReadAdvance<ReadFDiv32, 0>; -def : ReadAdvance<ReadFDiv64, 0>; -def : ReadAdvance<ReadFSqrt32, 0>; -def : ReadAdvance<ReadFSqrt64, 0>; -def : ReadAdvance<ReadFCmp32, 0>; -def : ReadAdvance<ReadFCmp64, 0>; -def : ReadAdvance<ReadFSGNJ32, 0>; -def : ReadAdvance<ReadFSGNJ64, 0>; -def : ReadAdvance<ReadFMinMax32, 0>; -def : ReadAdvance<ReadFMinMax64, 0>; -def : ReadAdvance<ReadFCvtF32ToI32, 0>; -def : ReadAdvance<ReadFCvtF32ToI64, 0>; -def : ReadAdvance<ReadFCvtF64ToI32, 0>; -def : ReadAdvance<ReadFCvtF64ToI64, 0>; -def : ReadAdvance<ReadFCvtI32ToF32, 0>; -def : ReadAdvance<ReadFCvtI32ToF64, 0>; -def : ReadAdvance<ReadFCvtI64ToF32, 0>; -def : ReadAdvance<ReadFCvtI64ToF64, 0>; -def : ReadAdvance<ReadFCvtF32ToF64, 0>; -def : ReadAdvance<ReadFCvtF64ToF32, 0>; -def : ReadAdvance<ReadFMovF32ToI32, 0>; -def : ReadAdvance<ReadFMovI32ToF32, 0>; -def : ReadAdvance<ReadFMovF64ToI64, 0>; -def : ReadAdvance<ReadFMovI64ToF64, 0>; -def : ReadAdvance<ReadFClass32, 
0>; -def : ReadAdvance<ReadFClass64, 0>; -def : ReadAdvance<ReadSFBJmp, 0>; -def : ReadAdvance<ReadSFBALU, 0>; //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedA; +defm : UnsupportedSchedD; +defm : UnsupportedSchedF; +defm : UnsupportedSchedSFB; defm : UnsupportedSchedV; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZba; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR3.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR3.td new file mode 100644 index 000000000000..607637bc0de5 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR3.td @@ -0,0 +1,189 @@ +//==- RISCVSchedSyntacoreSCR3.td - Syntacore SCR3 Scheduling Definitions -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// + +// This model covers SYNTACORE_SCR3_RV32IMC and SYNTACORE_RV64IMAC +// configurations (syntacore-scr3-rv32/64). 
+// Overview: https://syntacore.com/products/scr3 + +// SCR3 is single-issue in-order processor +class SyntacoreSCR3Model : SchedMachineModel { + let MicroOpBufferSize = 0; + let IssueWidth = 1; + let LoadLatency = 2; + let MispredictPenalty = 3; + let CompleteModel = 0; + let UnsupportedFeatures = [HasStdExtD, HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx, + HasStdExtZknd, HasStdExtZkne, HasStdExtZknh, + HasStdExtZksed, HasStdExtZksh, HasStdExtZkr, + HasVInstructions]; +} + +// Branching +multiclass SCR3_Branching<ProcResourceKind BRU> { + def : WriteRes<WriteJmp, [BRU]>; + def : WriteRes<WriteJal, [BRU]>; + def : WriteRes<WriteJalr, [BRU]>; +} + +// Single-cycle integer arithmetic and logic +multiclass SCR3_IntALU<ProcResourceKind ALU> { + def : WriteRes<WriteIALU, [ALU]>; + def : WriteRes<WriteIALU32, [ALU]>; + def : WriteRes<WriteShiftImm, [ALU]>; + def : WriteRes<WriteShiftImm32, [ALU]>; + def : WriteRes<WriteShiftReg, [ALU]>; + def : WriteRes<WriteShiftReg32, [ALU]>; +} + +// Integer multiplication +multiclass SCR3_IntMul<ProcResourceKind MUL> { + let Latency = 2 in { + def : WriteRes<WriteIMul, [MUL]>; + def : WriteRes<WriteIMul32, [MUL]>; + } +} + +// Integer division +multiclass SCR3_IntDiv<ProcResourceKind DIV, int DivLatency> { + let Latency = DivLatency, ReleaseAtCycles = [DivLatency] in { + def : WriteRes<WriteIDiv, [DIV]>; + def : WriteRes<WriteIDiv32, [DIV]>; + def : WriteRes<WriteIRem, [DIV]>; + def : WriteRes<WriteIRem32, [DIV]>; + } +} + +// Load/store instructions on SCR3 have latency 2 +multiclass SCR3_Memory<ProcResourceKind LSU> { + let Latency = 2 in { + def : WriteRes<WriteSTB, [LSU]>; + def : WriteRes<WriteSTH, [LSU]>; + def : WriteRes<WriteSTW, [LSU]>; + def : WriteRes<WriteSTD, [LSU]>; + def : WriteRes<WriteLDB, [LSU]>; + def : WriteRes<WriteLDH, [LSU]>; + def : WriteRes<WriteLDW, [LSU]>; + def : WriteRes<WriteLDD, [LSU]>; + } +} + +// Atomic memory +multiclass SCR3_AtomicMemory<ProcResourceKind LSU> { + let Latency = 20 in { + def : 
WriteRes<WriteAtomicLDW, [LSU]>; + def : WriteRes<WriteAtomicLDD, [LSU]>; + def : WriteRes<WriteAtomicW, [LSU]>; + def : WriteRes<WriteAtomicD, [LSU]>; + def : WriteRes<WriteAtomicSTW, [LSU]>; + def : WriteRes<WriteAtomicSTD, [LSU]>; + } +} + +// Others +multiclass SCR3_Other { + def : WriteRes<WriteCSR, []>; + def : WriteRes<WriteNop, []>; + + def : InstRW<[WriteIALU], (instrs COPY)>; +} + + +multiclass SCR3_Unsupported { + defm : UnsupportedSchedD; + defm : UnsupportedSchedF; + defm : UnsupportedSchedSFB; + defm : UnsupportedSchedV; + defm : UnsupportedSchedXsfvcp; + defm : UnsupportedSchedZabha; + defm : UnsupportedSchedZba; + defm : UnsupportedSchedZbb; + defm : UnsupportedSchedZbc; + defm : UnsupportedSchedZbs; + defm : UnsupportedSchedZbkb; + defm : UnsupportedSchedZbkx; + defm : UnsupportedSchedZfa; + defm : UnsupportedSchedZfh; + defm : UnsupportedSchedZvk; +} + +// Bypasses (none) +multiclass SCR3_NoReadAdvances { + def : ReadAdvance<ReadJmp, 0>; + def : ReadAdvance<ReadJalr, 0>; + def : ReadAdvance<ReadCSR, 0>; + def : ReadAdvance<ReadStoreData, 0>; + def : ReadAdvance<ReadMemBase, 0>; + def : ReadAdvance<ReadIALU, 0>; + def : ReadAdvance<ReadIALU32, 0>; + def : ReadAdvance<ReadShiftImm, 0>; + def : ReadAdvance<ReadShiftImm32, 0>; + def : ReadAdvance<ReadShiftReg, 0>; + def : ReadAdvance<ReadShiftReg32, 0>; + def : ReadAdvance<ReadIDiv, 0>; + def : ReadAdvance<ReadIDiv32, 0>; + def : ReadAdvance<ReadIRem, 0>; + def : ReadAdvance<ReadIRem32, 0>; + def : ReadAdvance<ReadIMul, 0>; + def : ReadAdvance<ReadIMul32, 0>; + def : ReadAdvance<ReadAtomicWA, 0>; + def : ReadAdvance<ReadAtomicWD, 0>; + def : ReadAdvance<ReadAtomicDA, 0>; + def : ReadAdvance<ReadAtomicDD, 0>; + def : ReadAdvance<ReadAtomicLDW, 0>; + def : ReadAdvance<ReadAtomicLDD, 0>; + def : ReadAdvance<ReadAtomicSTW, 0>; + def : ReadAdvance<ReadAtomicSTD, 0>; +} + +def SyntacoreSCR3RV32Model : SyntacoreSCR3Model; + +let SchedModel = SyntacoreSCR3RV32Model in { + let BufferSize = 0 in { + def 
SCR3RV32_ALU : ProcResource<1>; + def SCR3RV32_MUL : ProcResource<1>; + def SCR3RV32_DIV : ProcResource<1>; + def SCR3RV32_LSU : ProcResource<1>; + def SCR3RV32_CFU : ProcResource<1>; + } + + defm : SCR3_Branching<SCR3RV32_CFU>; + defm : SCR3_IntALU<SCR3RV32_ALU>; + defm : SCR3_IntMul<SCR3RV32_MUL>; + defm : SCR3_IntDiv<SCR3RV32_DIV, /* div latency = */ 8>; + defm : SCR3_Memory<SCR3RV32_LSU>; + defm : SCR3_AtomicMemory<SCR3RV32_LSU>; + defm : SCR3_Other; + + defm : SCR3_Unsupported; + defm : SCR3_NoReadAdvances; +} + +def SyntacoreSCR3RV64Model : SyntacoreSCR3Model; + +let SchedModel = SyntacoreSCR3RV64Model in { + let BufferSize = 0 in { + def SCR3RV64_ALU : ProcResource<1>; + def SCR3RV64_MUL : ProcResource<1>; + def SCR3RV64_DIV : ProcResource<1>; + def SCR3RV64_LSU : ProcResource<1>; + def SCR3RV64_CFU : ProcResource<1>; + } + + defm : SCR3_Branching<SCR3RV64_CFU>; + defm : SCR3_IntALU<SCR3RV64_ALU>; + defm : SCR3_IntMul<SCR3RV64_MUL>; + defm : SCR3_IntDiv<SCR3RV64_DIV, /* div latency = */ 11>; + defm : SCR3_Memory<SCR3RV64_LSU>; + defm : SCR3_AtomicMemory<SCR3RV64_LSU>; + defm : SCR3_Other; + + defm : SCR3_Unsupported; + defm : SCR3_NoReadAdvances; +} diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index d9a2e38c0e9d..1fdbc7cbcbaf 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -257,6 +257,90 @@ def : ReadAdvance<ReadFSqrt16, 0>; } // Unsupported = true } +multiclass UnsupportedSchedF { +let Unsupported = true in { +def : WriteRes<WriteFST32, []>; +def : WriteRes<WriteFLD32, []>; +def : WriteRes<WriteFAdd32, []>; +def : WriteRes<WriteFSGNJ32, []>; +def : WriteRes<WriteFMinMax32, []>; +def : WriteRes<WriteFCvtI32ToF32, []>; +def : WriteRes<WriteFCvtI64ToF32, []>; +def : WriteRes<WriteFCvtF32ToI32, []>; +def : WriteRes<WriteFCvtF32ToI64, []>; +def : WriteRes<WriteFClass32, []>; +def : WriteRes<WriteFCmp32, []>; +def : WriteRes<WriteFMovF32ToI32, []>; +def : 
WriteRes<WriteFMovI32ToF32, []>; +def : WriteRes<WriteFMul32, []>; +def : WriteRes<WriteFMA32, []>; +def : WriteRes<WriteFDiv32, []>; +def : WriteRes<WriteFSqrt32, []>; + +def : ReadAdvance<ReadFAdd32, 0>; +def : ReadAdvance<ReadFMul32, 0>; +def : ReadAdvance<ReadFMA32, 0>; +def : ReadAdvance<ReadFMA32Addend, 0>; +def : ReadAdvance<ReadFDiv32, 0>; +def : ReadAdvance<ReadFSqrt32, 0>; +def : ReadAdvance<ReadFCmp32, 0>; +def : ReadAdvance<ReadFSGNJ32, 0>; +def : ReadAdvance<ReadFMinMax32, 0>; +def : ReadAdvance<ReadFCvtF32ToI32, 0>; +def : ReadAdvance<ReadFCvtF32ToI64, 0>; +def : ReadAdvance<ReadFCvtI32ToF32, 0>; +def : ReadAdvance<ReadFCvtI64ToF32, 0>; +def : ReadAdvance<ReadFMovF32ToI32, 0>; +def : ReadAdvance<ReadFMovI32ToF32, 0>; +def : ReadAdvance<ReadFClass32, 0>; +def : ReadAdvance<ReadFStoreData, 0>; +def : ReadAdvance<ReadFMemBase, 0>; +} // Unsupported = true +} + +multiclass UnsupportedSchedD { +let Unsupported = true in { +def : WriteRes<WriteFST64, []>; +def : WriteRes<WriteFLD64, []>; +def : WriteRes<WriteFAdd64, []>; +def : WriteRes<WriteFSGNJ64, []>; +def : WriteRes<WriteFMinMax64, []>; +def : WriteRes<WriteFCvtI32ToF64, []>; +def : WriteRes<WriteFCvtI64ToF64, []>; +def : WriteRes<WriteFCvtF64ToI32, []>; +def : WriteRes<WriteFCvtF64ToI64, []>; +def : WriteRes<WriteFCvtF32ToF64, []>; +def : WriteRes<WriteFCvtF64ToF32, []>; +def : WriteRes<WriteFClass64, []>; +def : WriteRes<WriteFCmp64, []>; +def : WriteRes<WriteFMovF64ToI64, []>; +def : WriteRes<WriteFMovI64ToF64, []>; +def : WriteRes<WriteFMul64, []>; +def : WriteRes<WriteFMA64, []>; +def : WriteRes<WriteFDiv64, []>; +def : WriteRes<WriteFSqrt64, []>; + +def : ReadAdvance<ReadFAdd64, 0>; +def : ReadAdvance<ReadFMul64, 0>; +def : ReadAdvance<ReadFMA64, 0>; +def : ReadAdvance<ReadFMA64Addend, 0>; +def : ReadAdvance<ReadFDiv64, 0>; +def : ReadAdvance<ReadFSqrt64, 0>; +def : ReadAdvance<ReadFCmp64, 0>; +def : ReadAdvance<ReadFSGNJ64, 0>; +def : ReadAdvance<ReadFMinMax64, 0>; +def : 
ReadAdvance<ReadFCvtF64ToI32, 0>; +def : ReadAdvance<ReadFCvtF64ToI64, 0>; +def : ReadAdvance<ReadFCvtI32ToF64, 0>; +def : ReadAdvance<ReadFCvtI64ToF64, 0>; +def : ReadAdvance<ReadFCvtF32ToF64, 0>; +def : ReadAdvance<ReadFCvtF64ToF32, 0>; +def : ReadAdvance<ReadFMovF64ToI64, 0>; +def : ReadAdvance<ReadFMovI64ToF64, 0>; +def : ReadAdvance<ReadFClass64, 0>; +} // Unsupported = true +} + multiclass UnsupportedSchedSFB { let Unsupported = true in { def : WriteRes<WriteSFB, []>; @@ -293,6 +377,26 @@ def : ReadAdvance<ReadAtomicHD, 0>; } // Unsupported = true } +multiclass UnsupportedSchedA { +let Unsupported = true in { +def : WriteRes<WriteAtomicW, []>; +def : WriteRes<WriteAtomicD, []>; +def : WriteRes<WriteAtomicLDW, []>; +def : WriteRes<WriteAtomicLDD, []>; +def : WriteRes<WriteAtomicSTW, []>; +def : WriteRes<WriteAtomicSTD, []>; + +def : ReadAdvance<ReadAtomicWA, 0>; +def : ReadAdvance<ReadAtomicWD, 0>; +def : ReadAdvance<ReadAtomicDA, 0>; +def : ReadAdvance<ReadAtomicDD, 0>; +def : ReadAdvance<ReadAtomicLDW, 0>; +def : ReadAdvance<ReadAtomicLDD, 0>; +def : ReadAdvance<ReadAtomicSTW, 0>; +def : ReadAdvance<ReadAtomicSTD, 0>; +} // Unsupported = true +} + // Include the scheduler resources for other instruction extensions. include "RISCVScheduleZb.td" include "RISCVScheduleV.td" diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index d3236bb07d56..e84ddc65e2b7 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -39,9 +39,6 @@ namespace llvm::RISCVTuneInfoTable { #include "RISCVGenSearchableTables.inc" } // namespace llvm::RISCVTuneInfoTable -static cl::opt<bool> EnableSubRegLiveness("riscv-enable-subreg-liveness", - cl::init(true), cl::Hidden); - static cl::opt<unsigned> RVVVectorLMULMax( "riscv-v-fixed-length-vector-lmul-max", cl::desc("The maximum LMUL value to use for fixed length vectors. 
" @@ -183,11 +180,7 @@ bool RISCVSubtarget::useRVVForFixedLengthVectors() const { return hasVInstructions() && getMinRVVVectorSizeInBits() != 0; } -bool RISCVSubtarget::enableSubRegLiveness() const { - // FIXME: Enable subregister liveness by default for RVV to better handle - // LMUL>1 and segment load/store. - return EnableSubRegLiveness; -} +bool RISCVSubtarget::enableSubRegLiveness() const { return true; } void RISCVSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td index 01c276711950..db840b302749 100644 --- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td +++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td @@ -311,6 +311,11 @@ foreach i = 3...31 in { } //===----------------------------------------------------------------------===// +// Supervisor Counter Setup +//===----------------------------------------------------------------------===// +def : SysReg<"scountinhibit", 0x120>; + +//===----------------------------------------------------------------------===// // Debug/ Trace Registers (shared with Debug Mode) //===----------------------------------------------------------------------===// def : SysReg<"tselect", 0x7A0>; @@ -377,6 +382,12 @@ def SEED : SysReg<"seed", 0x015>; // Machine-level CSRs def : SysReg<"miselect", 0x350>; def : SysReg<"mireg", 0x351>; +foreach i = 2...3 in { + def : SysReg<"mireg"#i, !add(0x350, i)>; +} +foreach i = 4...6 in { + def : SysReg<"mireg"#i, !add(0x351, i)>; +} def : SysReg<"mtopei", 0x35C>; def : SysReg<"mtopi", 0xFB0>; def : SysReg<"mvien", 0x308>; @@ -392,6 +403,12 @@ def : SysReg<"miph", 0x354>; // Supervisor-level CSRs def : SysReg<"siselect", 0x150>; def : SysReg<"sireg", 0x151>; +foreach i = 2...3 in { + def : SysReg<"sireg"#i, !add(0x150, i)>; +} +foreach i = 4...6 in { + def : SysReg<"sireg"#i, !add(0x151, i)>; +} def : SysReg<"stopei", 0x15C>; def : SysReg<"stopi", 
0xDB0>; let isRV32Only = 1 in { @@ -406,6 +423,12 @@ def : SysReg<"hviprio1", 0x646>; def : SysReg<"hviprio2", 0x647>; def : SysReg<"vsiselect", 0x250>; def : SysReg<"vsireg", 0x251>; +foreach i = 2...3 in { + def : SysReg<"vsireg"#i, !add(0x250, i)>; +} +foreach i = 4...6 in { + def : SysReg<"vsireg"#i, !add(0x251, i)>; +} def : SysReg<"vstopei", 0x25C>; def : SysReg<"vstopi", 0xEB0>; let isRV32Only = 1 in { diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp index 7f5f7d0b1e4d..25e285e35f93 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp @@ -138,7 +138,7 @@ ConvergenceRegion::ConvergenceRegion( SmallPtrSet<BasicBlock *, 8> &&Blocks, SmallPtrSet<BasicBlock *, 2> &&Exits) : DT(DT), LI(LI), ConvergenceToken(ConvergenceToken), Entry(Entry), Exits(std::move(Exits)), Blocks(std::move(Blocks)) { - for (auto *BB : this->Exits) + for ([[maybe_unused]] auto *BB : this->Exits) assert(this->Blocks.count(BB) != 0); assert(this->Blocks.count(this->Entry) != 0); } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp index d96d2bf31b62..0f9a2a69e073 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -198,6 +198,8 @@ std::string getExtInstSetName(SPIRV::InstructionSet::InstructionSet Set) { return "OpenCL.std"; case SPIRV::InstructionSet::GLSL_std_450: return "GLSL.std.450"; + case SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100: + return "NonSemantic.Shader.DebugInfo.100"; case SPIRV::InstructionSet::SPV_AMD_shader_trinary_minmax: return "SPV_AMD_shader_trinary_minmax"; } @@ -206,8 +208,9 @@ std::string getExtInstSetName(SPIRV::InstructionSet::InstructionSet Set) { SPIRV::InstructionSet::InstructionSet 
getExtInstSetFromString(std::string SetName) { - for (auto Set : {SPIRV::InstructionSet::GLSL_std_450, - SPIRV::InstructionSet::OpenCL_std}) { + for (auto Set : + {SPIRV::InstructionSet::GLSL_std_450, SPIRV::InstructionSet::OpenCL_std, + SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100}) { if (SetName == getExtInstSetName(Set)) return Set; } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index 990eb1d230bc..44625793e941 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -197,6 +197,11 @@ namespace GLSLExtInst { #include "SPIRVGenTables.inc" } // namespace GLSLExtInst +namespace NonSemanticExtInst { +#define GET_NonSemanticExtInst_DECL +#include "SPIRVGenTables.inc" +} // namespace NonSemanticExtInst + namespace Opcode { #define GET_Opcode_DECL #include "SPIRVGenTables.inc" diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h index 842958695e10..a6dd7138edf3 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.h @@ -21,7 +21,7 @@ public: ~SPIRVTargetStreamer() override; void changeSection(const MCSection *CurSection, MCSection *Section, - const MCExpr *SubSection, raw_ostream &OS) override {} + uint32_t SubSection, raw_ostream &OS) override {} }; } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 49838e685a6d..0b93a4d85eed 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -300,6 +300,72 @@ lookupBuiltin(StringRef DemangledCall, return nullptr; } +static MachineInstr *getBlockStructInstr(Register ParamReg, + MachineRegisterInfo *MRI) { + // We expect the following sequence of instructions: + // %0:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS 
intrinsic(@llvm.spv.alloca) + // or = G_GLOBAL_VALUE @block_literal_global + // %1:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.bitcast), %0 + // %2:_(p4) = G_ADDRSPACE_CAST %1:_(pN) + MachineInstr *MI = MRI->getUniqueVRegDef(ParamReg); + assert(MI->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST && + MI->getOperand(1).isReg()); + Register BitcastReg = MI->getOperand(1).getReg(); + MachineInstr *BitcastMI = MRI->getUniqueVRegDef(BitcastReg); + assert(isSpvIntrinsic(*BitcastMI, Intrinsic::spv_bitcast) && + BitcastMI->getOperand(2).isReg()); + Register ValueReg = BitcastMI->getOperand(2).getReg(); + MachineInstr *ValueMI = MRI->getUniqueVRegDef(ValueReg); + return ValueMI; +} + +// Return an integer constant corresponding to the given register and +// defined in spv_track_constant. +// TODO: maybe unify with prelegalizer pass. +static unsigned getConstFromIntrinsic(Register Reg, MachineRegisterInfo *MRI) { + MachineInstr *DefMI = MRI->getUniqueVRegDef(Reg); + assert(isSpvIntrinsic(*DefMI, Intrinsic::spv_track_constant) && + DefMI->getOperand(2).isReg()); + MachineInstr *DefMI2 = MRI->getUniqueVRegDef(DefMI->getOperand(2).getReg()); + assert(DefMI2->getOpcode() == TargetOpcode::G_CONSTANT && + DefMI2->getOperand(1).isCImm()); + return DefMI2->getOperand(1).getCImm()->getValue().getZExtValue(); +} + +// Return type of the instruction result from spv_assign_type intrinsic. +// TODO: maybe unify with prelegalizer pass. 
+static const Type *getMachineInstrType(MachineInstr *MI) { + MachineInstr *NextMI = MI->getNextNode(); + if (!NextMI) + return nullptr; + if (isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_name)) + if ((NextMI = NextMI->getNextNode()) == nullptr) + return nullptr; + Register ValueReg = MI->getOperand(0).getReg(); + if ((!isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_type) && + !isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_ptr_type)) || + NextMI->getOperand(1).getReg() != ValueReg) + return nullptr; + Type *Ty = getMDOperandAsType(NextMI->getOperand(2).getMetadata(), 0); + assert(Ty && "Type is expected"); + return Ty; +} + +static const Type *getBlockStructType(Register ParamReg, + MachineRegisterInfo *MRI) { + // In principle, this information should be passed to us from Clang via + // an elementtype attribute. However, said attribute requires that + // the function call be an intrinsic, which is not. Instead, we rely on being + // able to trace this to the declaration of a variable: OpenCL C specification + // section 6.12.5 should guarantee that we can do this. 
+ MachineInstr *MI = getBlockStructInstr(ParamReg, MRI); + if (MI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) + return MI->getOperand(1).getGlobal()->getType(); + assert(isSpvIntrinsic(*MI, Intrinsic::spv_alloca) && + "Blocks in OpenCL C must be traceable to allocation site"); + return getMachineInstrType(MI); +} + //===----------------------------------------------------------------------===// // Helper functions for building misc instructions //===----------------------------------------------------------------------===// @@ -492,16 +558,21 @@ static Register buildMemSemanticsReg(Register SemanticsRegister, static bool buildOpFromWrapper(MachineIRBuilder &MIRBuilder, unsigned Opcode, const SPIRV::IncomingCall *Call, - Register TypeReg = Register(0)) { + Register TypeReg, + ArrayRef<uint32_t> ImmArgs = {}) { MachineRegisterInfo *MRI = MIRBuilder.getMRI(); auto MIB = MIRBuilder.buildInstr(Opcode); if (TypeReg.isValid()) MIB.addDef(Call->ReturnRegister).addUse(TypeReg); - for (Register ArgReg : Call->Arguments) { + unsigned Sz = Call->Arguments.size() - ImmArgs.size(); + for (unsigned i = 0; i < Sz; ++i) { + Register ArgReg = Call->Arguments[i]; if (!MRI->getRegClassOrNull(ArgReg)) MRI->setRegClass(ArgReg, &SPIRV::IDRegClass); MIB.addUse(ArgReg); } + for (uint32_t ImmArg : ImmArgs) + MIB.addImm(ImmArg); return true; } @@ -509,7 +580,7 @@ static bool buildOpFromWrapper(MachineIRBuilder &MIRBuilder, unsigned Opcode, static bool buildAtomicInitInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder) { if (Call->isSpirvOp()) - return buildOpFromWrapper(MIRBuilder, SPIRV::OpStore, Call); + return buildOpFromWrapper(MIRBuilder, SPIRV::OpStore, Call, Register(0)); assert(Call->Arguments.size() == 2 && "Need 2 arguments for atomic init translation"); @@ -567,7 +638,7 @@ static bool buildAtomicStoreInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { if (Call->isSpirvOp()) - return buildOpFromWrapper(MIRBuilder, 
SPIRV::OpAtomicStore, Call); + return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicStore, Call, Register(0)); Register ScopeRegister = buildConstantIntReg(SPIRV::Scope::Device, MIRBuilder, GR); @@ -694,7 +765,7 @@ static bool buildAtomicCompareExchangeInst( return true; } -/// Helper function for building an atomic load instruction. +/// Helper function for building atomic instructions. static bool buildAtomicRMWInst(const SPIRV::IncomingCall *Call, unsigned Opcode, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { @@ -719,13 +790,36 @@ static bool buildAtomicRMWInst(const SPIRV::IncomingCall *Call, unsigned Opcode, MemSemanticsReg = buildMemSemanticsReg(MemSemanticsReg, PtrRegister, Semantics, MIRBuilder, GR); MRI->setRegClass(Call->Arguments[1], &SPIRV::IDRegClass); + Register ValueReg = Call->Arguments[1]; + Register ValueTypeReg = GR->getSPIRVTypeID(Call->ReturnType); + // support cl_ext_float_atomics + if (Call->ReturnType->getOpcode() == SPIRV::OpTypeFloat) { + if (Opcode == SPIRV::OpAtomicIAdd) { + Opcode = SPIRV::OpAtomicFAddEXT; + } else if (Opcode == SPIRV::OpAtomicISub) { + // Translate OpAtomicISub applied to a floating type argument to + // OpAtomicFAddEXT with the negative value operand + Opcode = SPIRV::OpAtomicFAddEXT; + Register NegValueReg = + MRI->createGenericVirtualRegister(MRI->getType(ValueReg)); + MRI->setRegClass(NegValueReg, &SPIRV::IDRegClass); + GR->assignSPIRVTypeToVReg(Call->ReturnType, NegValueReg, + MIRBuilder.getMF()); + MIRBuilder.buildInstr(TargetOpcode::G_FNEG) + .addDef(NegValueReg) + .addUse(ValueReg); + insertAssignInstr(NegValueReg, nullptr, Call->ReturnType, GR, MIRBuilder, + MIRBuilder.getMF().getRegInfo()); + ValueReg = NegValueReg; + } + } MIRBuilder.buildInstr(Opcode) .addDef(Call->ReturnRegister) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(ValueTypeReg) .addUse(PtrRegister) .addUse(ScopeRegister) .addUse(MemSemanticsReg) - .addUse(Call->Arguments[1]); + .addUse(ValueReg); return true; } @@ -804,7 
+898,7 @@ static bool buildBarrierInst(const SPIRV::IncomingCall *Call, unsigned Opcode, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { if (Call->isSpirvOp()) - return buildOpFromWrapper(MIRBuilder, Opcode, Call); + return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0)); MachineRegisterInfo *MRI = MIRBuilder.getMRI(); unsigned MemFlags = getIConstVal(Call->Arguments[0], MRI); @@ -949,7 +1043,35 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call, const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; const SPIRV::GroupBuiltin *GroupBuiltin = SPIRV::lookupGroupBuiltin(Builtin->Name); + MachineRegisterInfo *MRI = MIRBuilder.getMRI(); + if (Call->isSpirvOp()) { + if (GroupBuiltin->NoGroupOperation) + return buildOpFromWrapper(MIRBuilder, GroupBuiltin->Opcode, Call, + GR->getSPIRVTypeID(Call->ReturnType)); + + // Group Operation is a literal + Register GroupOpReg = Call->Arguments[1]; + const MachineInstr *MI = getDefInstrMaybeConstant(GroupOpReg, MRI); + if (!MI || MI->getOpcode() != TargetOpcode::G_CONSTANT) + report_fatal_error( + "Group Operation parameter must be an integer constant"); + uint64_t GrpOp = MI->getOperand(1).getCImm()->getValue().getZExtValue(); + Register ScopeReg = Call->Arguments[0]; + if (!MRI->getRegClassOrNull(ScopeReg)) + MRI->setRegClass(ScopeReg, &SPIRV::IDRegClass); + Register ValueReg = Call->Arguments[2]; + if (!MRI->getRegClassOrNull(ValueReg)) + MRI->setRegClass(ValueReg, &SPIRV::IDRegClass); + MIRBuilder.buildInstr(GroupBuiltin->Opcode) + .addDef(Call->ReturnRegister) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(ScopeReg) + .addImm(GrpOp) + .addUse(ValueReg); + return true; + } + Register Arg0; if (GroupBuiltin->HasBoolArg) { Register ConstRegister = Call->Arguments[0]; @@ -1371,6 +1493,14 @@ static bool generateBarrierInst(const SPIRV::IncomingCall *Call, return buildBarrierInst(Call, Opcode, MIRBuilder, GR); } +static bool generateCastToPtrInst(const SPIRV::IncomingCall *Call, + 
MachineIRBuilder &MIRBuilder) { + MIRBuilder.buildInstr(TargetOpcode::G_ADDRSPACE_CAST) + .addDef(Call->ReturnRegister) + .addUse(Call->Arguments[0]); + return true; +} + static bool generateDotOrFMulInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { @@ -1722,6 +1852,45 @@ static bool generateSelectInst(const SPIRV::IncomingCall *Call, return true; } +static bool generateConstructInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + return buildOpFromWrapper(MIRBuilder, SPIRV::OpCompositeConstruct, Call, + GR->getSPIRVTypeID(Call->ReturnType)); +} + +static bool generateCoopMatrInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + bool IsSet = Opcode != SPIRV::OpCooperativeMatrixStoreKHR; + unsigned ArgSz = Call->Arguments.size(); + unsigned LiteralIdx = 0; + if (Opcode == SPIRV::OpCooperativeMatrixLoadKHR && ArgSz > 3) + LiteralIdx = 3; + else if (Opcode == SPIRV::OpCooperativeMatrixStoreKHR && ArgSz > 4) + LiteralIdx = 4; + SmallVector<uint32_t, 1> ImmArgs; + MachineRegisterInfo *MRI = MIRBuilder.getMRI(); + if (LiteralIdx > 0) + ImmArgs.push_back(getConstFromIntrinsic(Call->Arguments[LiteralIdx], MRI)); + Register TypeReg = GR->getSPIRVTypeID(Call->ReturnType); + if (Opcode == SPIRV::OpCooperativeMatrixLengthKHR) { + SPIRVType *CoopMatrType = GR->getSPIRVTypeForVReg(Call->Arguments[0]); + if (!CoopMatrType) + report_fatal_error("Can't find a register's type definition"); + MIRBuilder.buildInstr(Opcode) + .addDef(Call->ReturnRegister) + .addUse(TypeReg) + .addUse(CoopMatrType->getOperand(0).getReg()); + return true; + } + return buildOpFromWrapper(MIRBuilder, Opcode, Call, + IsSet ? 
TypeReg : Register(0), ImmArgs); +} + static bool generateSpecConstantInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { @@ -1826,7 +1995,10 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call, .addDef(GlobalWorkSize) .addUse(GR->getSPIRVTypeID(SpvFieldTy)) .addUse(GWSPtr); - Const = GR->getOrCreateConsIntArray(0, MIRBuilder, SpvFieldTy); + const SPIRVSubtarget &ST = + cast<SPIRVSubtarget>(MIRBuilder.getMF().getSubtarget()); + Const = GR->getOrCreateConstIntArray(0, Size, *MIRBuilder.getInsertPt(), + SpvFieldTy, *ST.getInstrInfo()); } else { Const = GR->buildConstantInt(0, MIRBuilder, SpvTy); } @@ -1847,68 +2019,6 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call, .addUse(TmpReg); } -static MachineInstr *getBlockStructInstr(Register ParamReg, - MachineRegisterInfo *MRI) { - // We expect the following sequence of instructions: - // %0:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.alloca) - // or = G_GLOBAL_VALUE @block_literal_global - // %1:_(pN) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.bitcast), %0 - // %2:_(p4) = G_ADDRSPACE_CAST %1:_(pN) - MachineInstr *MI = MRI->getUniqueVRegDef(ParamReg); - assert(MI->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST && - MI->getOperand(1).isReg()); - Register BitcastReg = MI->getOperand(1).getReg(); - MachineInstr *BitcastMI = MRI->getUniqueVRegDef(BitcastReg); - assert(isSpvIntrinsic(*BitcastMI, Intrinsic::spv_bitcast) && - BitcastMI->getOperand(2).isReg()); - Register ValueReg = BitcastMI->getOperand(2).getReg(); - MachineInstr *ValueMI = MRI->getUniqueVRegDef(ValueReg); - return ValueMI; -} - -// Return an integer constant corresponding to the given register and -// defined in spv_track_constant. -// TODO: maybe unify with prelegalizer pass. 
-static unsigned getConstFromIntrinsic(Register Reg, MachineRegisterInfo *MRI) { - MachineInstr *DefMI = MRI->getUniqueVRegDef(Reg); - assert(isSpvIntrinsic(*DefMI, Intrinsic::spv_track_constant) && - DefMI->getOperand(2).isReg()); - MachineInstr *DefMI2 = MRI->getUniqueVRegDef(DefMI->getOperand(2).getReg()); - assert(DefMI2->getOpcode() == TargetOpcode::G_CONSTANT && - DefMI2->getOperand(1).isCImm()); - return DefMI2->getOperand(1).getCImm()->getValue().getZExtValue(); -} - -// Return type of the instruction result from spv_assign_type intrinsic. -// TODO: maybe unify with prelegalizer pass. -static const Type *getMachineInstrType(MachineInstr *MI) { - MachineInstr *NextMI = MI->getNextNode(); - if (isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_name)) - NextMI = NextMI->getNextNode(); - Register ValueReg = MI->getOperand(0).getReg(); - if (!isSpvIntrinsic(*NextMI, Intrinsic::spv_assign_type) || - NextMI->getOperand(1).getReg() != ValueReg) - return nullptr; - Type *Ty = getMDOperandAsType(NextMI->getOperand(2).getMetadata(), 0); - assert(Ty && "Type is expected"); - return Ty; -} - -static const Type *getBlockStructType(Register ParamReg, - MachineRegisterInfo *MRI) { - // In principle, this information should be passed to us from Clang via - // an elementtype attribute. However, said attribute requires that - // the function call be an intrinsic, which is not. Instead, we rely on being - // able to trace this to the declaration of a variable: OpenCL C specification - // section 6.12.5 should guarantee that we can do this. - MachineInstr *MI = getBlockStructInstr(ParamReg, MRI); - if (MI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) - return MI->getOperand(1).getGlobal()->getType(); - assert(isSpvIntrinsic(*MI, Intrinsic::spv_alloca) && - "Blocks in OpenCL C must be traceable to allocation site"); - return getMachineInstrType(MI); -} - // TODO: maybe move to the global register. 
static SPIRVType * getOrCreateSPIRVDeviceEventPointer(MachineIRBuilder &MIRBuilder, @@ -2322,6 +2432,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, return generateAtomicFloatingInst(Call.get(), MIRBuilder, GR); case SPIRV::Barrier: return generateBarrierInst(Call.get(), MIRBuilder, GR); + case SPIRV::CastToPtr: + return generateCastToPtrInst(Call.get(), MIRBuilder); case SPIRV::Dot: return generateDotOrFMulInst(Call.get(), MIRBuilder, GR); case SPIRV::Wave: @@ -2340,6 +2452,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, return generateSampleImageInst(DemangledCall, Call.get(), MIRBuilder, GR); case SPIRV::Select: return generateSelectInst(Call.get(), MIRBuilder); + case SPIRV::Construct: + return generateConstructInst(Call.get(), MIRBuilder, GR); case SPIRV::SpecConstant: return generateSpecConstantInst(Call.get(), MIRBuilder, GR); case SPIRV::Enqueue: @@ -2358,6 +2472,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, return generateGroupUniformInst(Call.get(), MIRBuilder, GR); case SPIRV::KernelClock: return generateKernelClockInst(Call.get(), MIRBuilder, GR); + case SPIRV::CoopMatr: + return generateCoopMatrInst(Call.get(), MIRBuilder, GR); } return false; } @@ -2376,7 +2492,7 @@ Type *parseBuiltinCallArgumentBaseType(const StringRef DemangledCall, if (hasBuiltinTypePrefix(TypeStr)) { // OpenCL builtin types in demangled call strings have the following format: // e.g. 
ocl_image2d_ro - bool IsOCLBuiltinType = TypeStr.consume_front("ocl_"); + [[maybe_unused]] bool IsOCLBuiltinType = TypeStr.consume_front("ocl_"); assert(IsOCLBuiltinType && "Invalid OpenCL builtin prefix"); // Check if this is pointer to a builtin type and not just pointer @@ -2482,6 +2598,22 @@ static SPIRVType *getPipeType(const TargetExtType *ExtensionType, ExtensionType->getIntParameter(0))); } +static SPIRVType *getCoopMatrType(const TargetExtType *ExtensionType, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + assert(ExtensionType->getNumIntParameters() == 4 && + "Invalid number of parameters for SPIR-V coop matrices builtin!"); + assert(ExtensionType->getNumTypeParameters() == 1 && + "SPIR-V coop matrices builtin type must have a type parameter!"); + const SPIRVType *ElemType = + GR->getOrCreateSPIRVType(ExtensionType->getTypeParameter(0), MIRBuilder); + // Create or get an existing type from GlobalRegistry. + return GR->getOrCreateOpTypeCoopMatr( + MIRBuilder, ExtensionType, ElemType, ExtensionType->getIntParameter(0), + ExtensionType->getIntParameter(1), ExtensionType->getIntParameter(2), + ExtensionType->getIntParameter(3)); +} + static SPIRVType * getImageType(const TargetExtType *ExtensionType, const SPIRV::AccessQualifier::AccessQualifier Qualifier, @@ -2612,6 +2744,9 @@ SPIRVType *lowerBuiltinType(const Type *OpaqueType, case SPIRV::OpTypeSampledImage: TargetType = getSampledImageType(BuiltinType, MIRBuilder, GR); break; + case SPIRV::OpTypeCooperativeMatrixKHR: + TargetType = getCoopMatrType(BuiltinType, MIRBuilder, GR); + break; default: TargetType = getNonParameterizedType(BuiltinType, TypeRecord, MIRBuilder, GR); diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index edc9e1a33d9f..fb88332ab890 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -26,6 +26,7 @@ class InstructionSet<bits<32> value> { def OpenCL_std : InstructionSet<0>; def 
GLSL_std_450 : InstructionSet<1>; def SPV_AMD_shader_trinary_minmax : InstructionSet<2>; +def NonSemantic_Shader_DebugInfo_100 : InstructionSet<3>; // Define various builtin groups def BuiltinGroup : GenericEnum { @@ -59,6 +60,9 @@ def IntelSubgroups : BuiltinGroup; def AtomicFloating : BuiltinGroup; def GroupUniform : BuiltinGroup; def KernelClock : BuiltinGroup; +def CastToPtr : BuiltinGroup; +def Construct : BuiltinGroup; +def CoopMatr : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled builtin record. The information in the record @@ -113,6 +117,9 @@ def : DemangledBuiltin<"__spirv_ImageSampleExplicitLod", OpenCL_std, SampleImage // Select builtin record: def : DemangledBuiltin<"__spirv_Select", OpenCL_std, Select, 3, 3>; +// Composite Construct builtin record: +def : DemangledBuiltin<"__spirv_CompositeConstruct", OpenCL_std, Construct, 1, 0>; + //===----------------------------------------------------------------------===// // Class defining an extended builtin record used for lowering into an // OpExtInst instruction. @@ -170,6 +177,17 @@ class GLSLExtInst<string name, bits<32> value> { bits<32> Value = value; } +def NonSemanticExtInst : GenericEnum { + let FilterClass = "NonSemanticExtInst"; + let NameField = "Name"; + let ValueField = "Value"; +} + +class NonSemanticExtInst<string name, bits<32> value> { + string Name = name; + bits<32> Value = value; +} + // Multiclass used to define at the same time both a demangled builtin record // and a corresponding extended builtin record. 
multiclass DemangledExtendedBuiltin<string name, InstructionSet set, int number> { @@ -183,6 +201,10 @@ multiclass DemangledExtendedBuiltin<string name, InstructionSet set, int number> if !eq(set, GLSL_std_450) then { def : GLSLExtInst<name, number>; } + + if !eq(set, NonSemantic_Shader_DebugInfo_100) then { + def : NonSemanticExtInst<name, number>; + } } // Extended builtin records: @@ -430,6 +452,50 @@ defm : DemangledExtendedBuiltin<"NMin", GLSL_std_450, 79>; defm : DemangledExtendedBuiltin<"NMax", GLSL_std_450, 80>; defm : DemangledExtendedBuiltin<"NClamp", GLSL_std_450, 81>; +defm : DemangledExtendedBuiltin<"DebugInfoNone", NonSemantic_Shader_DebugInfo_100, 0>; +defm : DemangledExtendedBuiltin<"DebugCompilationUnit", NonSemantic_Shader_DebugInfo_100, 1>; +defm : DemangledExtendedBuiltin<"DebugTypeBasic", NonSemantic_Shader_DebugInfo_100, 2>; +defm : DemangledExtendedBuiltin<"DebugTypePointer", NonSemantic_Shader_DebugInfo_100, 3>; +defm : DemangledExtendedBuiltin<"DebugTypeQualifier", NonSemantic_Shader_DebugInfo_100, 4>; +defm : DemangledExtendedBuiltin<"DebugTypeArray", NonSemantic_Shader_DebugInfo_100, 5>; +defm : DemangledExtendedBuiltin<"DebugTypeVector", NonSemantic_Shader_DebugInfo_100, 6>; +defm : DemangledExtendedBuiltin<"DebugTypedef", NonSemantic_Shader_DebugInfo_100, 7>; +defm : DemangledExtendedBuiltin<"DebugTypeFunction", NonSemantic_Shader_DebugInfo_100, 8>; +defm : DemangledExtendedBuiltin<"DebugTypeEnum", NonSemantic_Shader_DebugInfo_100, 9>; +defm : DemangledExtendedBuiltin<"DebugTypeComposite", NonSemantic_Shader_DebugInfo_100, 10>; +defm : DemangledExtendedBuiltin<"DebugTypeMember", NonSemantic_Shader_DebugInfo_100, 11>; +defm : DemangledExtendedBuiltin<"DebugTypeInheritance", NonSemantic_Shader_DebugInfo_100, 12>; +defm : DemangledExtendedBuiltin<"DebugTypePtrToMember", NonSemantic_Shader_DebugInfo_100, 13>; +defm : DemangledExtendedBuiltin<"DebugTypeTemplate", NonSemantic_Shader_DebugInfo_100, 14>; +defm : 
DemangledExtendedBuiltin<"DebugTypeTemplateParameter", NonSemantic_Shader_DebugInfo_100, 15>; +defm : DemangledExtendedBuiltin<"DebugTypeTemplateTemplateParameter", NonSemantic_Shader_DebugInfo_100, 16>; +defm : DemangledExtendedBuiltin<"DebugTypeTemplateParameterPack", NonSemantic_Shader_DebugInfo_100, 17>; +defm : DemangledExtendedBuiltin<"DebugGlobalVariable", NonSemantic_Shader_DebugInfo_100, 18>; +defm : DemangledExtendedBuiltin<"DebugFunctionDeclaration", NonSemantic_Shader_DebugInfo_100, 19>; +defm : DemangledExtendedBuiltin<"DebugFunction", NonSemantic_Shader_DebugInfo_100, 20>; +defm : DemangledExtendedBuiltin<"DebugLexicalBlock", NonSemantic_Shader_DebugInfo_100, 21>; +defm : DemangledExtendedBuiltin<"DebugLexicalBlockDiscriminator", NonSemantic_Shader_DebugInfo_100, 22>; +defm : DemangledExtendedBuiltin<"DebugScope", NonSemantic_Shader_DebugInfo_100, 23>; +defm : DemangledExtendedBuiltin<"DebugNoScope", NonSemantic_Shader_DebugInfo_100, 24>; +defm : DemangledExtendedBuiltin<"DebugInlinedAt", NonSemantic_Shader_DebugInfo_100, 25>; +defm : DemangledExtendedBuiltin<"DebugLocalVariable", NonSemantic_Shader_DebugInfo_100, 26>; +defm : DemangledExtendedBuiltin<"DebugInlinedVariable", NonSemantic_Shader_DebugInfo_100, 27>; +defm : DemangledExtendedBuiltin<"DebugDeclare", NonSemantic_Shader_DebugInfo_100, 28>; +defm : DemangledExtendedBuiltin<"DebugValue", NonSemantic_Shader_DebugInfo_100, 29>; +defm : DemangledExtendedBuiltin<"DebugOperation", NonSemantic_Shader_DebugInfo_100, 30>; +defm : DemangledExtendedBuiltin<"DebugExpression", NonSemantic_Shader_DebugInfo_100, 31>; +defm : DemangledExtendedBuiltin<"DebugMacroDef", NonSemantic_Shader_DebugInfo_100, 32>; +defm : DemangledExtendedBuiltin<"DebugMacroUndef", NonSemantic_Shader_DebugInfo_100, 33>; +defm : DemangledExtendedBuiltin<"DebugImportedEntity", NonSemantic_Shader_DebugInfo_100, 34>; +defm : DemangledExtendedBuiltin<"DebugSource", NonSemantic_Shader_DebugInfo_100, 35>; +defm : 
DemangledExtendedBuiltin<"DebugFunctionDefinition", NonSemantic_Shader_DebugInfo_100, 101>; +defm : DemangledExtendedBuiltin<"DebugSourceContinued", NonSemantic_Shader_DebugInfo_100, 102>; +defm : DemangledExtendedBuiltin<"DebugLine", NonSemantic_Shader_DebugInfo_100, 103>; +defm : DemangledExtendedBuiltin<"DebugNoLine", NonSemantic_Shader_DebugInfo_100, 104>; +defm : DemangledExtendedBuiltin<"DebugBuildIdentifier", NonSemantic_Shader_DebugInfo_100, 105>; +defm : DemangledExtendedBuiltin<"DebugStoragePath", NonSemantic_Shader_DebugInfo_100, 106>; +defm : DemangledExtendedBuiltin<"DebugEntryPoint", NonSemantic_Shader_DebugInfo_100, 107>; +defm : DemangledExtendedBuiltin<"DebugTypeMatrix", NonSemantic_Shader_DebugInfo_100, 108>; //===----------------------------------------------------------------------===// // Class defining an native builtin record used for direct translation into a // SPIR-V instruction. @@ -532,6 +598,7 @@ defm : DemangledNativeBuiltin<"__spirv_AtomicAnd", OpenCL_std, Atomic, 4, 4, OpA defm : DemangledNativeBuiltin<"atomic_exchange", OpenCL_std, Atomic, 2, 4, OpAtomicExchange>; defm : DemangledNativeBuiltin<"atomic_exchange_explicit", OpenCL_std, Atomic, 2, 4, OpAtomicExchange>; defm : DemangledNativeBuiltin<"AtomicEx__spirv_change", OpenCL_std, Atomic, 2, 4, OpAtomicExchange>; +defm : DemangledNativeBuiltin<"__spirv_AtomicExchange", OpenCL_std, Atomic, 4, 4, OpAtomicExchange>; defm : DemangledNativeBuiltin<"atomic_work_item_fence", OpenCL_std, Atomic, 1, 3, OpMemoryBarrier>; defm : DemangledNativeBuiltin<"__spirv_MemoryBarrier", OpenCL_std, Atomic, 2, 2, OpMemoryBarrier>; defm : DemangledNativeBuiltin<"atomic_fetch_add", OpenCL_std, Atomic, 2, 4, OpAtomicIAdd>; @@ -539,11 +606,11 @@ defm : DemangledNativeBuiltin<"atomic_fetch_sub", OpenCL_std, Atomic, 2, 4, OpAt defm : DemangledNativeBuiltin<"atomic_fetch_or", OpenCL_std, Atomic, 2, 4, OpAtomicOr>; defm : DemangledNativeBuiltin<"atomic_fetch_xor", OpenCL_std, Atomic, 2, 4, OpAtomicXor>; defm : 
DemangledNativeBuiltin<"atomic_fetch_and", OpenCL_std, Atomic, 2, 4, OpAtomicAnd>; -defm : DemangledNativeBuiltin<"atomic_fetch_add_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicIAdd>; -defm : DemangledNativeBuiltin<"atomic_fetch_sub_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicISub>; -defm : DemangledNativeBuiltin<"atomic_fetch_or_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicOr>; -defm : DemangledNativeBuiltin<"atomic_fetch_xor_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicXor>; -defm : DemangledNativeBuiltin<"atomic_fetch_and_explicit", OpenCL_std, Atomic, 4, 6, OpAtomicAnd>; +defm : DemangledNativeBuiltin<"atomic_fetch_add_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicIAdd>; +defm : DemangledNativeBuiltin<"atomic_fetch_sub_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicISub>; +defm : DemangledNativeBuiltin<"atomic_fetch_or_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicOr>; +defm : DemangledNativeBuiltin<"atomic_fetch_xor_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicXor>; +defm : DemangledNativeBuiltin<"atomic_fetch_and_explicit", OpenCL_std, Atomic, 3, 4, OpAtomicAnd>; defm : DemangledNativeBuiltin<"atomic_flag_test_and_set", OpenCL_std, Atomic, 1, 1, OpAtomicFlagTestAndSet>; defm : DemangledNativeBuiltin<"__spirv_AtomicFlagTestAndSet", OpenCL_std, Atomic, 3, 3, OpAtomicFlagTestAndSet>; defm : DemangledNativeBuiltin<"atomic_flag_test_and_set_explicit", OpenCL_std, Atomic, 2, 3, OpAtomicFlagTestAndSet>; @@ -595,6 +662,23 @@ defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, defm : DemangledNativeBuiltin<"__spirv_Load", OpenCL_std, LoadStore, 1, 3, OpLoad>; defm : DemangledNativeBuiltin<"__spirv_Store", OpenCL_std, LoadStore, 2, 4, OpStore>; +// Address Space Qualifier Functions/Pointers Conversion Instructions: +defm : DemangledNativeBuiltin<"to_global", OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"to_local", OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"to_private", 
OpenCL_std, CastToPtr, 1, 1, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToGlobal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToLocal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtr_ToPrivate", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtrExplicit_ToGlobal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtrExplicit_ToLocal", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; +defm : DemangledNativeBuiltin<"__spirv_GenericCastToPtrExplicit_ToPrivate", OpenCL_std, CastToPtr, 2, 2, OpGenericCastToPtr>; + +// Cooperative Matrix builtin records: +defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixLoadKHR", OpenCL_std, CoopMatr, 2, 0, OpCooperativeMatrixLoadKHR>; +defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixStoreKHR", OpenCL_std, CoopMatr, 3, 0, OpCooperativeMatrixStoreKHR>; +defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixMulAddKHR", OpenCL_std, CoopMatr, 3, 0, OpCooperativeMatrixMulAddKHR>; +defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixLengthKHR", OpenCL_std, CoopMatr, 1, 1, OpCooperativeMatrixLengthKHR>; + //===----------------------------------------------------------------------===// // Class defining a work/sub group builtin that should be translated into a // SPIR-V instruction using the defined properties. @@ -682,9 +766,17 @@ multiclass DemangledGroupBuiltin<string name, int level /* OnlyWork/OnlySub/... 
} } +multiclass DemangledGroupBuiltinWrapper<string name, bits<8> minNumArgs, bits<8> maxNumArgs, Op operation> { + def : DemangledBuiltin<name, OpenCL_std, Group, minNumArgs, maxNumArgs>; + def : GroupBuiltin<name, operation>; +} + defm : DemangledGroupBuiltin<"group_all", WorkOrSub, OpGroupAll>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupAll", 2, 2, OpGroupAll>; defm : DemangledGroupBuiltin<"group_any", WorkOrSub, OpGroupAny>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupAny", 2, 2, OpGroupAny>; defm : DemangledGroupBuiltin<"group_broadcast", WorkOrSub, OpGroupBroadcast>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupBroadcast", 3, 3, OpGroupBroadcast>; defm : DemangledGroupBuiltin<"group_non_uniform_broadcast", OnlySub, OpGroupNonUniformBroadcast>; defm : DemangledGroupBuiltin<"group_broadcast_first", OnlySub, OpGroupNonUniformBroadcastFirst>; @@ -719,41 +811,49 @@ defm : DemangledGroupBuiltin<"group_scan_inclusive_adds", WorkOrSub, OpGroupIAdd defm : DemangledGroupBuiltin<"group_reduce_addu", WorkOrSub, OpGroupIAdd>; defm : DemangledGroupBuiltin<"group_scan_exclusive_addu", WorkOrSub, OpGroupIAdd>; defm : DemangledGroupBuiltin<"group_scan_inclusive_addu", WorkOrSub, OpGroupIAdd>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupIAdd", 3, 3, OpGroupIAdd>; defm : DemangledGroupBuiltin<"group_fadd", WorkOrSub, OpGroupFAdd>; defm : DemangledGroupBuiltin<"group_reduce_addf", WorkOrSub, OpGroupFAdd>; defm : DemangledGroupBuiltin<"group_scan_exclusive_addf", WorkOrSub, OpGroupFAdd>; defm : DemangledGroupBuiltin<"group_scan_inclusive_addf", WorkOrSub, OpGroupFAdd>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupFAdd", 3, 3, OpGroupFAdd>; defm : DemangledGroupBuiltin<"group_fmin", WorkOrSub, OpGroupFMin>; defm : DemangledGroupBuiltin<"group_reduce_minf", WorkOrSub, OpGroupFMin>; defm : DemangledGroupBuiltin<"group_scan_exclusive_minf", WorkOrSub, OpGroupFMin>; defm : DemangledGroupBuiltin<"group_scan_inclusive_minf", WorkOrSub, OpGroupFMin>; 
+defm : DemangledGroupBuiltinWrapper<"__spirv_GroupFMin", 3, 3, OpGroupFMin>; defm : DemangledGroupBuiltin<"group_umin", WorkOrSub, OpGroupUMin>; defm : DemangledGroupBuiltin<"group_reduce_minu", WorkOrSub, OpGroupUMin>; defm : DemangledGroupBuiltin<"group_scan_exclusive_minu", WorkOrSub, OpGroupUMin>; defm : DemangledGroupBuiltin<"group_scan_inclusive_minu", WorkOrSub, OpGroupUMin>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupUMin", 3, 3, OpGroupUMin>; defm : DemangledGroupBuiltin<"group_smin", WorkOrSub, OpGroupSMin>; defm : DemangledGroupBuiltin<"group_reduce_mins", WorkOrSub, OpGroupSMin>; defm : DemangledGroupBuiltin<"group_scan_exclusive_mins", WorkOrSub, OpGroupSMin>; defm : DemangledGroupBuiltin<"group_scan_inclusive_mins", WorkOrSub, OpGroupSMin>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupSMin", 3, 3, OpGroupSMin>; defm : DemangledGroupBuiltin<"group_fmax", WorkOrSub, OpGroupFMax>; defm : DemangledGroupBuiltin<"group_reduce_maxf", WorkOrSub, OpGroupFMax>; defm : DemangledGroupBuiltin<"group_scan_exclusive_maxf", WorkOrSub, OpGroupFMax>; defm : DemangledGroupBuiltin<"group_scan_inclusive_maxf", WorkOrSub, OpGroupFMax>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupFMax", 3, 3, OpGroupFMax>; defm : DemangledGroupBuiltin<"group_umax", WorkOrSub, OpGroupUMax>; defm : DemangledGroupBuiltin<"group_reduce_maxu", WorkOrSub, OpGroupUMax>; defm : DemangledGroupBuiltin<"group_scan_exclusive_maxu", WorkOrSub, OpGroupUMax>; defm : DemangledGroupBuiltin<"group_scan_inclusive_maxu", WorkOrSub, OpGroupUMax>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupUMax", 3, 3, OpGroupUMax>; defm : DemangledGroupBuiltin<"group_smax", WorkOrSub, OpGroupSMax>; defm : DemangledGroupBuiltin<"group_reduce_maxs", WorkOrSub, OpGroupSMax>; defm : DemangledGroupBuiltin<"group_scan_exclusive_maxs", WorkOrSub, OpGroupSMax>; defm : DemangledGroupBuiltin<"group_scan_inclusive_maxs", WorkOrSub, OpGroupSMax>; +defm : DemangledGroupBuiltinWrapper<"__spirv_GroupSMax", 3, 3, 
OpGroupSMax>; // cl_khr_subgroup_non_uniform_arithmetic defm : DemangledGroupBuiltin<"group_non_uniform_iadd", WorkOrSub, OpGroupNonUniformIAdd>; @@ -997,8 +1097,6 @@ multiclass DemangledAtomicFloatingBuiltin<string name, bits<8> minNumArgs, bits< defm : DemangledAtomicFloatingBuiltin<"AddEXT", 4, 4, OpAtomicFAddEXT>; defm : DemangledAtomicFloatingBuiltin<"MinEXT", 4, 4, OpAtomicFMinEXT>; defm : DemangledAtomicFloatingBuiltin<"MaxEXT", 4, 4, OpAtomicFMaxEXT>; -// TODO: add support for cl_ext_float_atomics to enable performing atomic operations -// on floating-point numbers in memory (float arguments for atomic_fetch_add, ...) //===----------------------------------------------------------------------===// // Class defining a sub group builtin that should be translated into a @@ -1407,7 +1505,7 @@ def : BuiltinType<"spirv.DeviceEvent", OpTypeDeviceEvent>; def : BuiltinType<"spirv.Image", OpTypeImage>; def : BuiltinType<"spirv.SampledImage", OpTypeSampledImage>; def : BuiltinType<"spirv.Pipe", OpTypePipe>; - +def : BuiltinType<"spirv.CooperativeMatrixKHR", OpTypeCooperativeMatrixKHR>; //===----------------------------------------------------------------------===// // Class matching an OpenCL builtin type name to an equivalent SPIR-V diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 75aa1823b11f..c7c244cfa897 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -66,6 +66,8 @@ static const std::map<std::string, SPIRV::Extension::Extension> SPIRV::Extension::Extension::SPV_INTEL_function_pointers}, {"SPV_KHR_shader_clock", SPIRV::Extension::Extension::SPV_KHR_shader_clock}, + {"SPV_KHR_cooperative_matrix", + SPIRV::Extension::Extension::SPV_KHR_cooperative_matrix}, }; bool SPIRVExtensionsParser::parse(cl::Option &O, llvm::StringRef ArgName, diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h index 
2ec3fb35ca04..a37e65a47eda 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h @@ -16,6 +16,7 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "MCTargetDesc/SPIRVMCTargetDesc.h" +#include "SPIRVUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -51,152 +52,87 @@ public: void addDep(DTSortableEntry *E) { Deps.push_back(E); } }; -struct SpecialTypeDescriptor { - enum SpecialTypeKind { - STK_Empty = 0, - STK_Image, - STK_SampledImage, - STK_Sampler, - STK_Pipe, - STK_DeviceEvent, - STK_Pointer, - STK_Last = -1 - }; - SpecialTypeKind Kind; - - unsigned Hash; - - SpecialTypeDescriptor() = delete; - SpecialTypeDescriptor(SpecialTypeKind K) : Kind(K) { Hash = Kind; } - - unsigned getHash() const { return Hash; } - - virtual ~SpecialTypeDescriptor() {} -}; - -struct ImageTypeDescriptor : public SpecialTypeDescriptor { - union ImageAttrs { - struct BitFlags { - unsigned Dim : 3; - unsigned Depth : 2; - unsigned Arrayed : 1; - unsigned MS : 1; - unsigned Sampled : 2; - unsigned ImageFormat : 6; - unsigned AQ : 2; - } Flags; - unsigned Val; - }; - - ImageTypeDescriptor(const Type *SampledTy, unsigned Dim, unsigned Depth, - unsigned Arrayed, unsigned MS, unsigned Sampled, - unsigned ImageFormat, unsigned AQ = 0) - : SpecialTypeDescriptor(SpecialTypeKind::STK_Image) { - ImageAttrs Attrs; - Attrs.Val = 0; - Attrs.Flags.Dim = Dim; - Attrs.Flags.Depth = Depth; - Attrs.Flags.Arrayed = Arrayed; - Attrs.Flags.MS = MS; - Attrs.Flags.Sampled = Sampled; - Attrs.Flags.ImageFormat = ImageFormat; - Attrs.Flags.AQ = AQ; - Hash = (DenseMapInfo<Type *>().getHashValue(SampledTy) & 0xffff) ^ - ((Attrs.Val << 8) | Kind); - } - - static bool classof(const SpecialTypeDescriptor *TD) { - return TD->Kind == SpecialTypeKind::STK_Image; - } -}; - -struct SampledImageTypeDescriptor : public SpecialTypeDescriptor { - SampledImageTypeDescriptor(const Type 
*SampledTy, const MachineInstr *ImageTy) - : SpecialTypeDescriptor(SpecialTypeKind::STK_SampledImage) { - assert(ImageTy->getOpcode() == SPIRV::OpTypeImage); - ImageTypeDescriptor TD( - SampledTy, ImageTy->getOperand(2).getImm(), - ImageTy->getOperand(3).getImm(), ImageTy->getOperand(4).getImm(), - ImageTy->getOperand(5).getImm(), ImageTy->getOperand(6).getImm(), - ImageTy->getOperand(7).getImm(), ImageTy->getOperand(8).getImm()); - Hash = TD.getHash() ^ Kind; - } - - static bool classof(const SpecialTypeDescriptor *TD) { - return TD->Kind == SpecialTypeKind::STK_SampledImage; - } -}; - -struct SamplerTypeDescriptor : public SpecialTypeDescriptor { - SamplerTypeDescriptor() - : SpecialTypeDescriptor(SpecialTypeKind::STK_Sampler) { - Hash = Kind; - } - - static bool classof(const SpecialTypeDescriptor *TD) { - return TD->Kind == SpecialTypeKind::STK_Sampler; - } +enum SpecialTypeKind { + STK_Empty = 0, + STK_Image, + STK_SampledImage, + STK_Sampler, + STK_Pipe, + STK_DeviceEvent, + STK_Pointer, + STK_Last = -1 }; -struct PipeTypeDescriptor : public SpecialTypeDescriptor { - - PipeTypeDescriptor(uint8_t AQ) - : SpecialTypeDescriptor(SpecialTypeKind::STK_Pipe) { - Hash = (AQ << 8) | Kind; - } - - static bool classof(const SpecialTypeDescriptor *TD) { - return TD->Kind == SpecialTypeKind::STK_Pipe; +using SpecialTypeDescriptor = std::tuple<const Type *, unsigned, unsigned>; + +union ImageAttrs { + struct BitFlags { + unsigned Dim : 3; + unsigned Depth : 2; + unsigned Arrayed : 1; + unsigned MS : 1; + unsigned Sampled : 2; + unsigned ImageFormat : 6; + unsigned AQ : 2; + } Flags; + unsigned Val; + + ImageAttrs(unsigned Dim, unsigned Depth, unsigned Arrayed, unsigned MS, + unsigned Sampled, unsigned ImageFormat, unsigned AQ = 0) { + Val = 0; + Flags.Dim = Dim; + Flags.Depth = Depth; + Flags.Arrayed = Arrayed; + Flags.MS = MS; + Flags.Sampled = Sampled; + Flags.ImageFormat = ImageFormat; + Flags.AQ = AQ; } }; -struct DeviceEventTypeDescriptor : public 
SpecialTypeDescriptor { - - DeviceEventTypeDescriptor() - : SpecialTypeDescriptor(SpecialTypeKind::STK_DeviceEvent) { - Hash = Kind; - } - - static bool classof(const SpecialTypeDescriptor *TD) { - return TD->Kind == SpecialTypeKind::STK_DeviceEvent; - } -}; - -struct PointerTypeDescriptor : public SpecialTypeDescriptor { - const Type *ElementType; - unsigned AddressSpace; - - PointerTypeDescriptor() = delete; - PointerTypeDescriptor(const Type *ElementType, unsigned AddressSpace) - : SpecialTypeDescriptor(SpecialTypeKind::STK_Pointer), - ElementType(ElementType), AddressSpace(AddressSpace) { - Hash = (DenseMapInfo<Type *>().getHashValue(ElementType) & 0xffff) ^ - ((AddressSpace << 8) | Kind); - } - - static bool classof(const SpecialTypeDescriptor *TD) { - return TD->Kind == SpecialTypeKind::STK_Pointer; - } -}; +inline SpecialTypeDescriptor +make_descr_image(const Type *SampledTy, unsigned Dim, unsigned Depth, + unsigned Arrayed, unsigned MS, unsigned Sampled, + unsigned ImageFormat, unsigned AQ = 0) { + return std::make_tuple( + SampledTy, + ImageAttrs(Dim, Depth, Arrayed, MS, Sampled, ImageFormat, AQ).Val, + SpecialTypeKind::STK_Image); +} + +inline SpecialTypeDescriptor +make_descr_sampled_image(const Type *SampledTy, const MachineInstr *ImageTy) { + assert(ImageTy->getOpcode() == SPIRV::OpTypeImage); + return std::make_tuple( + SampledTy, + ImageAttrs( + ImageTy->getOperand(2).getImm(), ImageTy->getOperand(3).getImm(), + ImageTy->getOperand(4).getImm(), ImageTy->getOperand(5).getImm(), + ImageTy->getOperand(6).getImm(), ImageTy->getOperand(7).getImm(), + ImageTy->getOperand(8).getImm()) + .Val, + SpecialTypeKind::STK_SampledImage); +} + +inline SpecialTypeDescriptor make_descr_sampler() { + return std::make_tuple(nullptr, 0U, SpecialTypeKind::STK_Sampler); +} + +inline SpecialTypeDescriptor make_descr_pipe(uint8_t AQ) { + return std::make_tuple(nullptr, AQ, SpecialTypeKind::STK_Pipe); +} + +inline SpecialTypeDescriptor make_descr_event() { + return 
std::make_tuple(nullptr, 0U, SpecialTypeKind::STK_DeviceEvent); +} + +inline SpecialTypeDescriptor make_descr_pointee(const Type *ElementType, + unsigned AddressSpace) { + return std::make_tuple(ElementType, AddressSpace, + SpecialTypeKind::STK_Pointer); +} } // namespace SPIRV -template <> struct DenseMapInfo<SPIRV::SpecialTypeDescriptor> { - static inline SPIRV::SpecialTypeDescriptor getEmptyKey() { - return SPIRV::SpecialTypeDescriptor( - SPIRV::SpecialTypeDescriptor::STK_Empty); - } - static inline SPIRV::SpecialTypeDescriptor getTombstoneKey() { - return SPIRV::SpecialTypeDescriptor(SPIRV::SpecialTypeDescriptor::STK_Last); - } - static unsigned getHashValue(SPIRV::SpecialTypeDescriptor Val) { - return Val.getHash(); - } - static bool isEqual(SPIRV::SpecialTypeDescriptor LHS, - SPIRV::SpecialTypeDescriptor RHS) { - return getHashValue(LHS) == getHashValue(RHS); - } -}; - template <typename KeyTy> class SPIRVDuplicatesTrackerBase { public: // NOTE: using MapVector instead of DenseMap helps getting everything ordered @@ -282,12 +218,12 @@ public: MachineModuleInfo *MMI); void add(const Type *Ty, const MachineFunction *MF, Register R) { - TT.add(Ty, MF, R); + TT.add(unifyPtrType(Ty), MF, R); } - void add(const Type *PointerElementType, unsigned AddressSpace, + void add(const Type *PointeeTy, unsigned AddressSpace, const MachineFunction *MF, Register R) { - ST.add(SPIRV::PointerTypeDescriptor(PointerElementType, AddressSpace), MF, + ST.add(SPIRV::make_descr_pointee(unifyPtrType(PointeeTy), AddressSpace), MF, R); } @@ -317,13 +253,13 @@ public: } Register find(const Type *Ty, const MachineFunction *MF) { - return TT.find(const_cast<Type *>(Ty), MF); + return TT.find(unifyPtrType(Ty), MF); } - Register find(const Type *PointerElementType, unsigned AddressSpace, + Register find(const Type *PointeeTy, unsigned AddressSpace, const MachineFunction *MF) { return ST.find( - SPIRV::PointerTypeDescriptor(PointerElementType, AddressSpace), MF); + 
SPIRV::make_descr_pointee(unifyPtrType(PointeeTy), AddressSpace), MF); } Register find(const Constant *C, const MachineFunction *MF) { diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 7b8e3230bf55..dd5884096b85 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -69,7 +69,7 @@ class SPIRVEmitIntrinsics DenseSet<Instruction *> AggrStores; // deduce element type of untyped pointers - Type *deduceElementType(Value *I); + Type *deduceElementType(Value *I, bool UnknownElemTypeI8); Type *deduceElementTypeHelper(Value *I); Type *deduceElementTypeHelper(Value *I, std::unordered_set<Value *> &Visited); Type *deduceElementTypeByValueDeep(Type *ValueTy, Value *Operand, @@ -105,7 +105,8 @@ class SPIRVEmitIntrinsics void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B); void processInstrAfterVisit(Instruction *I, IRBuilder<> &B); - void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B); + bool insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B, + bool UnknownElemTypeI8); void insertAssignTypeIntrs(Instruction *I, IRBuilder<> &B); void insertAssignPtrTypeTargetExt(TargetExtType *AssignedType, Value *V, IRBuilder<> &B); @@ -367,6 +368,26 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper( if (Ty) break; } + } else if (auto *CI = dyn_cast<CallInst>(I)) { + static StringMap<unsigned> ResTypeByArg = { + {"to_global", 0}, + {"to_local", 0}, + {"to_private", 0}, + {"__spirv_GenericCastToPtr_ToGlobal", 0}, + {"__spirv_GenericCastToPtr_ToLocal", 0}, + {"__spirv_GenericCastToPtr_ToPrivate", 0}, + {"__spirv_GenericCastToPtrExplicit_ToGlobal", 0}, + {"__spirv_GenericCastToPtrExplicit_ToLocal", 0}, + {"__spirv_GenericCastToPtrExplicit_ToPrivate", 0}}; + // TODO: maybe improve performance by caching demangled names + if (Function *CalledF = CI->getCalledFunction()) { + std::string DemangledName = + 
getOclOrSpirvBuiltinDemangledName(CalledF->getName()); + auto AsArgIt = ResTypeByArg.find(DemangledName); + if (AsArgIt != ResTypeByArg.end()) + Ty = deduceElementTypeHelper(CI->getArgOperand(AsArgIt->second), + Visited); + } } // remember the found relationship @@ -460,10 +481,10 @@ Type *SPIRVEmitIntrinsics::deduceNestedTypeHelper( return OrigTy; } -Type *SPIRVEmitIntrinsics::deduceElementType(Value *I) { +Type *SPIRVEmitIntrinsics::deduceElementType(Value *I, bool UnknownElemTypeI8) { if (Type *Ty = deduceElementTypeHelper(I)) return Ty; - return IntegerType::getInt8Ty(I->getContext()); + return UnknownElemTypeI8 ? IntegerType::getInt8Ty(I->getContext()) : nullptr; } // If the Instruction has Pointer operands with unresolved types, this function @@ -652,10 +673,8 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) { AggrConst = cast<Constant>(COp); ResTy = B.getInt32Ty(); } else if (auto *COp = dyn_cast<ConstantAggregateZero>(Op)) { - if (!Op->getType()->isVectorTy()) { - AggrConst = cast<Constant>(COp); - ResTy = B.getInt32Ty(); - } + AggrConst = cast<Constant>(COp); + ResTy = Op->getType()->isVectorTy() ? COp->getType() : B.getInt32Ty(); } if (AggrConst) { SmallVector<Value *> Args; @@ -1152,16 +1171,23 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV, B.CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV); } -void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I, - IRBuilder<> &B) { +// Return true, if we can't decide what is the pointee type now and will get +// back to the question later. Return false is spv_assign_ptr_type is not needed +// or can be inserted immediately. 
+bool SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I, + IRBuilder<> &B, + bool UnknownElemTypeI8) { reportFatalOnTokenType(I); if (!isPointerTy(I->getType()) || !requireAssignType(I) || isa<BitCastInst>(I)) - return; + return false; setInsertPointAfterDef(B, I); - Type *ElemTy = deduceElementType(I); - buildAssignPtr(B, ElemTy, I); + if (Type *ElemTy = deduceElementType(I, UnknownElemTypeI8)) { + buildAssignPtr(B, ElemTy, I); + return false; + } + return true; } void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, @@ -1199,7 +1225,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, buildAssignPtr(B, PType->getElementType(), Op); } else if (isPointerTy(OpTy)) { Type *ElemTy = GR->findDeducedElementType(Op); - buildAssignPtr(B, ElemTy ? ElemTy : deduceElementType(Op), Op); + buildAssignPtr(B, ElemTy ? ElemTy : deduceElementType(Op, true), Op); } else { CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type, {OpTy}, Op, Op, {}, B); @@ -1235,8 +1261,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, } bool IsPhi = isa<PHINode>(I), BPrepared = false; for (const auto &Op : I->operands()) { - if ((isa<ConstantAggregateZero>(Op) && Op->getType()->isVectorTy()) || - isa<PHINode>(I) || isa<SwitchInst>(I)) + if (isa<PHINode>(I) || isa<SwitchInst>(I)) TrackConstants = false; if ((isa<ConstantData>(Op) || isa<ConstantExpr>(Op)) && TrackConstants) { unsigned OpNo = Op.getOperandNo(); @@ -1395,10 +1420,15 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { if (isConvergenceIntrinsic(I)) continue; - insertAssignPtrTypeIntrs(I, B); + bool Postpone = insertAssignPtrTypeIntrs(I, B, false); + // if Postpone is true, we can't decide on pointee type yet insertAssignTypeIntrs(I, B); insertPtrCastOrAssignTypeInstr(I, B); insertSpirvDecorations(I, B); + // if instruction requires a pointee type set, let's check if we know it + // already, and force it to be i8 if not + if (Postpone && 
!GR->findAssignPtrTypeInstr(I)) + insertAssignPtrTypeIntrs(I, B, true); } for (auto &I : instructions(Func)) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index d434e0b5efbc..5558c7a5a4a5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -394,7 +394,7 @@ Register SPIRVGlobalRegistry::getOrCreateCompositeOrNull( Constant *Val, MachineInstr &I, SPIRVType *SpvType, const SPIRVInstrInfo &TII, Constant *CA, unsigned BitWidth, unsigned ElemCnt, bool ZeroAsNull) { - // Find a constant vector in DT or build a new one. + // Find a constant vector or array in DT or build a new one. Register Res = DT.find(CA, CurMF); // If no values are attached, the composite is null constant. bool IsNull = Val->isNullValue() && ZeroAsNull; @@ -474,20 +474,28 @@ Register SPIRVGlobalRegistry::getOrCreateConstVector(APFloat Val, ZeroAsNull); } -Register -SPIRVGlobalRegistry::getOrCreateConsIntArray(uint64_t Val, MachineInstr &I, - SPIRVType *SpvType, - const SPIRVInstrInfo &TII) { +Register SPIRVGlobalRegistry::getOrCreateConstIntArray( + uint64_t Val, size_t Num, MachineInstr &I, SPIRVType *SpvType, + const SPIRVInstrInfo &TII) { const Type *LLVMTy = getTypeForSPIRVType(SpvType); assert(LLVMTy->isArrayTy()); const ArrayType *LLVMArrTy = cast<ArrayType>(LLVMTy); Type *LLVMBaseTy = LLVMArrTy->getElementType(); - auto *ConstInt = ConstantInt::get(LLVMBaseTy, Val); - auto *ConstArr = - ConstantArray::get(const_cast<ArrayType *>(LLVMArrTy), {ConstInt}); + Constant *CI = ConstantInt::get(LLVMBaseTy, Val); SPIRVType *SpvBaseTy = getSPIRVTypeForVReg(SpvType->getOperand(1).getReg()); unsigned BW = getScalarOrVectorBitWidth(SpvBaseTy); - return getOrCreateCompositeOrNull(ConstInt, I, SpvType, TII, ConstArr, BW, + // The following is reasonably unique key that is better that [Val]. 
The naive + // alternative would be something along the lines of: + // SmallVector<Constant *> NumCI(Num, CI); + // Constant *UniqueKey = + // ConstantArray::get(const_cast<ArrayType*>(LLVMArrTy), NumCI); + // that would be a truly unique but dangerous key, because it could lead to + // the creation of constants of arbitrary length (that is, the parameter of + // memset) which were missing in the original module. + Constant *UniqueKey = ConstantStruct::getAnon( + {PoisonValue::get(const_cast<ArrayType *>(LLVMArrTy)), + ConstantInt::get(LLVMBaseTy, Val), ConstantInt::get(LLVMBaseTy, Num)}); + return getOrCreateCompositeOrNull(CI, I, SpvType, TII, UniqueKey, BW, LLVMArrTy->getNumElements()); } @@ -546,24 +554,6 @@ SPIRVGlobalRegistry::getOrCreateConsIntVector(uint64_t Val, } Register -SPIRVGlobalRegistry::getOrCreateConsIntArray(uint64_t Val, - MachineIRBuilder &MIRBuilder, - SPIRVType *SpvType, bool EmitIR) { - const Type *LLVMTy = getTypeForSPIRVType(SpvType); - assert(LLVMTy->isArrayTy()); - const ArrayType *LLVMArrTy = cast<ArrayType>(LLVMTy); - Type *LLVMBaseTy = LLVMArrTy->getElementType(); - const auto ConstInt = ConstantInt::get(LLVMBaseTy, Val); - auto ConstArr = - ConstantArray::get(const_cast<ArrayType *>(LLVMArrTy), {ConstInt}); - SPIRVType *SpvBaseTy = getSPIRVTypeForVReg(SpvType->getOperand(1).getReg()); - unsigned BW = getScalarOrVectorBitWidth(SpvBaseTy); - return getOrCreateIntCompositeOrNull(Val, MIRBuilder, SpvType, EmitIR, - ConstArr, BW, - LLVMArrTy->getNumElements()); -} - -Register SPIRVGlobalRegistry::getOrCreateConstNullPtr(MachineIRBuilder &MIRBuilder, SPIRVType *SpvType) { const Type *LLVMTy = getTypeForSPIRVType(SpvType); @@ -936,7 +926,7 @@ SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType( SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR); TypesInProcessing.erase(Ty); VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType; - SPIRVToLLVMType[SpirvType] = Ty; + SPIRVToLLVMType[SpirvType] = 
unifyPtrType(Ty); Register Reg = DT.find(Ty, &MIRBuilder.getMF()); // Do not add OpTypeForwardPointer to DT, a corresponding normal pointer type // will be added later. For special types it is already added to DT. @@ -1080,12 +1070,14 @@ bool SPIRVGlobalRegistry::isScalarOrVectorSigned(const SPIRVType *Type) const { return IntType && IntType->getOperand(2).getImm() != 0; } +SPIRVType *SPIRVGlobalRegistry::getPointeeType(SPIRVType *PtrType) { + return PtrType && PtrType->getOpcode() == SPIRV::OpTypePointer + ? getSPIRVTypeForVReg(PtrType->getOperand(2).getReg()) + : nullptr; +} + unsigned SPIRVGlobalRegistry::getPointeeTypeOp(Register PtrReg) { - SPIRVType *PtrType = getSPIRVTypeForVReg(PtrReg); - SPIRVType *ElemType = - PtrType && PtrType->getOpcode() == SPIRV::OpTypePointer - ? getSPIRVTypeForVReg(PtrType->getOperand(2).getReg()) - : nullptr; + SPIRVType *ElemType = getPointeeType(getSPIRVTypeForVReg(PtrReg)); return ElemType ? ElemType->getOpcode() : 0; } @@ -1122,9 +1114,9 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeImage( uint32_t Depth, uint32_t Arrayed, uint32_t Multisampled, uint32_t Sampled, SPIRV::ImageFormat::ImageFormat ImageFormat, SPIRV::AccessQualifier::AccessQualifier AccessQual) { - SPIRV::ImageTypeDescriptor TD(SPIRVToLLVMType.lookup(SampledType), Dim, Depth, - Arrayed, Multisampled, Sampled, ImageFormat, - AccessQual); + auto TD = SPIRV::make_descr_image(SPIRVToLLVMType.lookup(SampledType), Dim, + Depth, Arrayed, Multisampled, Sampled, + ImageFormat, AccessQual); if (auto *Res = checkSpecialInstr(TD, MIRBuilder)) return Res; Register ResVReg = createTypeVReg(MIRBuilder); @@ -1143,7 +1135,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeImage( SPIRVType * SPIRVGlobalRegistry::getOrCreateOpTypeSampler(MachineIRBuilder &MIRBuilder) { - SPIRV::SamplerTypeDescriptor TD; + auto TD = SPIRV::make_descr_sampler(); if (auto *Res = checkSpecialInstr(TD, MIRBuilder)) return Res; Register ResVReg = createTypeVReg(MIRBuilder); @@ -1154,7 +1146,7 @@ 
SPIRVGlobalRegistry::getOrCreateOpTypeSampler(MachineIRBuilder &MIRBuilder) { SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypePipe( MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier::AccessQualifier AccessQual) { - SPIRV::PipeTypeDescriptor TD(AccessQual); + auto TD = SPIRV::make_descr_pipe(AccessQual); if (auto *Res = checkSpecialInstr(TD, MIRBuilder)) return Res; Register ResVReg = createTypeVReg(MIRBuilder); @@ -1166,7 +1158,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypePipe( SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeDeviceEvent( MachineIRBuilder &MIRBuilder) { - SPIRV::DeviceEventTypeDescriptor TD; + auto TD = SPIRV::make_descr_event(); if (auto *Res = checkSpecialInstr(TD, MIRBuilder)) return Res; Register ResVReg = createTypeVReg(MIRBuilder); @@ -1176,7 +1168,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeDeviceEvent( SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeSampledImage( SPIRVType *ImageType, MachineIRBuilder &MIRBuilder) { - SPIRV::SampledImageTypeDescriptor TD( + auto TD = SPIRV::make_descr_sampled_image( SPIRVToLLVMType.lookup(MIRBuilder.getMF().getRegInfo().getVRegDef( ImageType->getOperand(1).getReg())), ImageType); @@ -1189,6 +1181,26 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeSampledImage( .addUse(getSPIRVTypeID(ImageType)); } +SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeCoopMatr( + MachineIRBuilder &MIRBuilder, const TargetExtType *ExtensionType, + const SPIRVType *ElemType, uint32_t Scope, uint32_t Rows, uint32_t Columns, + uint32_t Use) { + Register ResVReg = DT.find(ExtensionType, &MIRBuilder.getMF()); + if (ResVReg.isValid()) + return MIRBuilder.getMF().getRegInfo().getUniqueVRegDef(ResVReg); + ResVReg = createTypeVReg(MIRBuilder); + SPIRVType *SpirvTy = + MIRBuilder.buildInstr(SPIRV::OpTypeCooperativeMatrixKHR) + .addDef(ResVReg) + .addUse(getSPIRVTypeID(ElemType)) + .addUse(buildConstantInt(Scope, MIRBuilder, nullptr, true)) + .addUse(buildConstantInt(Rows, MIRBuilder, nullptr, true)) + 
.addUse(buildConstantInt(Columns, MIRBuilder, nullptr, true)) + .addUse(buildConstantInt(Use, MIRBuilder, nullptr, true)); + DT.add(ExtensionType, &MIRBuilder.getMF(), ResVReg); + return SpirvTy; +} + SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeByOpcode( const Type *Ty, MachineIRBuilder &MIRBuilder, unsigned Opcode) { Register ResVReg = DT.find(Ty, &MIRBuilder.getMF()); @@ -1268,7 +1280,7 @@ SPIRVType *SPIRVGlobalRegistry::finishCreatingSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType) { assert(CurMF == SpirvType->getMF()); VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType; - SPIRVToLLVMType[SpirvType] = LLVMTy; + SPIRVToLLVMType[SpirvType] = unifyPtrType(LLVMTy); return SpirvType; } diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index db01f68f48de..a45e1ccd0717 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -292,6 +292,8 @@ public: return Res->second; } + // Return a pointee's type, or nullptr otherwise. + SPIRVType *getPointeeType(SPIRVType *PtrType); // Return a pointee's type op code, or 0 otherwise. unsigned getPointeeTypeOp(Register PtrReg); @@ -327,6 +329,12 @@ public: return Ret; } + // Return true if the type is an aggregate type. + bool isAggregateType(SPIRVType *Type) const { + return Type && (Type->getOpcode() == SPIRV::OpTypeStruct && + Type->getOpcode() == SPIRV::OpTypeArray); + } + // Whether the given VReg has an OpTypeXXX instruction mapped to it with the // given opcode (e.g. OpTypeFloat). 
bool isScalarOfType(Register VReg, unsigned TypeOpcode) const; @@ -449,13 +457,11 @@ public: Register getOrCreateConstVector(APFloat Val, MachineInstr &I, SPIRVType *SpvType, const SPIRVInstrInfo &TII, bool ZeroAsNull = true); - Register getOrCreateConsIntArray(uint64_t Val, MachineInstr &I, - SPIRVType *SpvType, - const SPIRVInstrInfo &TII); + Register getOrCreateConstIntArray(uint64_t Val, size_t Num, MachineInstr &I, + SPIRVType *SpvType, + const SPIRVInstrInfo &TII); Register getOrCreateConsIntVector(uint64_t Val, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType, bool EmitIR = true); - Register getOrCreateConsIntArray(uint64_t Val, MachineIRBuilder &MIRBuilder, - SPIRVType *SpvType, bool EmitIR = true); Register getOrCreateConstNullPtr(MachineIRBuilder &MIRBuilder, SPIRVType *SpvType); Register buildConstantSampler(Register Res, unsigned AddrMode, unsigned Param, @@ -514,7 +520,11 @@ public: SPIRVType *getOrCreateOpTypeSampledImage(SPIRVType *ImageType, MachineIRBuilder &MIRBuilder); - + SPIRVType *getOrCreateOpTypeCoopMatr(MachineIRBuilder &MIRBuilder, + const TargetExtType *ExtensionType, + const SPIRVType *ElemType, + uint32_t Scope, uint32_t Rows, + uint32_t Columns, uint32_t Use); SPIRVType * getOrCreateOpTypePipe(MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier::AccessQualifier AccQual); diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp index 5ccbaf12ddee..4383d1c5c0e2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp @@ -339,6 +339,7 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const { GR.getSPIRVTypeForVReg(MI.getOperand(1).getReg())); break; case SPIRV::OpPtrCastToGeneric: + case SPIRV::OpGenericCastToPtr: validateAccessChain(STI, MRI, GR, MI); break; case SPIRV::OpInBoundsPtrAccessChain: diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h index 6fc200abf462..77356b7512a7 
100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h @@ -68,6 +68,11 @@ public: // extra instructions required to preserve validity of SPIR-V code imposed by // the standard. void finalizeLowering(MachineFunction &MF) const override; + + MVT getPreferredSwitchConditionType(LLVMContext &Context, + EVT ConditionVT) const override { + return ConditionVT.getSimpleVT(); + } }; } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index dedfd5e6e32d..63549b06e967 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -211,6 +211,9 @@ def OpTypeAccelerationStructureNV: Op<5341, (outs TYPE:$res), (ins), def OpTypeCooperativeMatrixNV: Op<5358, (outs TYPE:$res), (ins TYPE:$compType, ID:$scope, ID:$rows, ID:$cols), "$res = OpTypeCooperativeMatrixNV $compType $scope $rows $cols">; +def OpTypeCooperativeMatrixKHR: Op<4456, (outs TYPE:$res), + (ins TYPE:$compType, ID:$scope, ID:$rows, ID:$cols, ID:$use), + "$res = OpTypeCooperativeMatrixKHR $compType $scope $rows $cols $use">; // 3.42.7 Constant-Creation Instructions @@ -864,3 +867,16 @@ def OpAsmINTEL: Op<5610, (outs ID:$res), (ins TYPE:$type, TYPE:$asm_type, ID:$ta "$res = OpAsmINTEL $type $asm_type $target $asm">; def OpAsmCallINTEL: Op<5611, (outs ID:$res), (ins TYPE:$type, ID:$asm, variable_ops), "$res = OpAsmCallINTEL $type $asm">; + +// SPV_KHR_cooperative_matrix +def OpCooperativeMatrixLoadKHR: Op<4457, (outs ID:$res), + (ins TYPE:$resType, ID:$pointer, ID:$memory_layout, variable_ops), + "$res = OpCooperativeMatrixLoadKHR $resType $pointer $memory_layout">; +def OpCooperativeMatrixStoreKHR: Op<4458, (outs), + (ins ID:$pointer, ID:$objectToStore, ID:$memory_layout, variable_ops), + "OpCooperativeMatrixStoreKHR $pointer $objectToStore $memory_layout">; +def OpCooperativeMatrixMulAddKHR: Op<4459, (outs ID:$res), + (ins TYPE:$type, ID:$A, ID:$B, ID:$C, variable_ops), + 
"$res = OpCooperativeMatrixMulAddKHR $type $A $B $C">; +def OpCooperativeMatrixLengthKHR: Op<4460, (outs ID:$res), (ins TYPE:$type, ID:$coop_matr_type), + "$res = OpCooperativeMatrixLengthKHR $type $coop_matr_type">; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index db83172f7fa9..9be736ce88ce 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -173,6 +173,9 @@ private: bool selectFmix(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectRsqrt(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const; void renderFImm32(MachineInstrBuilder &MIB, const MachineInstr &I, @@ -469,6 +472,18 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::sin, GL::Sin); case TargetOpcode::G_FTAN: return selectExtInst(ResVReg, ResType, I, CL::tan, GL::Tan); + case TargetOpcode::G_FACOS: + return selectExtInst(ResVReg, ResType, I, CL::acos, GL::Acos); + case TargetOpcode::G_FASIN: + return selectExtInst(ResVReg, ResType, I, CL::asin, GL::Asin); + case TargetOpcode::G_FATAN: + return selectExtInst(ResVReg, ResType, I, CL::atan, GL::Atan); + case TargetOpcode::G_FCOSH: + return selectExtInst(ResVReg, ResType, I, CL::cosh, GL::Cosh); + case TargetOpcode::G_FSINH: + return selectExtInst(ResVReg, ResType, I, CL::sinh, GL::Sinh); + case TargetOpcode::G_FTANH: + return selectExtInst(ResVReg, ResType, I, CL::tanh, GL::Tanh); case TargetOpcode::G_FSQRT: return selectExtInst(ResVReg, ResType, I, CL::sqrt, GL::Sqrt); @@ -831,7 +846,7 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg, unsigned Num = getIConstVal(I.getOperand(2).getReg(), MRI); SPIRVType *ValTy = GR.getOrCreateSPIRVIntegerType(8, I, TII); SPIRVType *ArrTy = GR.getOrCreateSPIRVArrayType(ValTy, 
Num, I, TII); - Register Const = GR.getOrCreateConsIntArray(Val, I, ArrTy, TII); + Register Const = GR.getOrCreateConstIntArray(Val, Num, I, ArrTy, TII); SPIRVType *VarTy = GR.getOrCreateSPIRVPointerType( ArrTy, I, TII, SPIRV::StorageClass::UniformConstant); // TODO: check if we have such GV, add init, use buildGlobalVariable. @@ -1102,7 +1117,7 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) { Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass); SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType( - SrcPtrTy, I, TII, SPIRV::StorageClass::Generic); + GR.getPointeeType(SrcPtrTy), I, TII, SPIRV::StorageClass::Generic); MachineBasicBlock &BB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); bool Success = BuildMI(BB, I, DL, TII.get(SPIRV::OpPtrCastToGeneric)) @@ -1315,6 +1330,23 @@ bool SPIRVInstructionSelector::selectFmix(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectRsqrt(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + + assert(I.getNumOperands() == 3); + assert(I.getOperand(2).isReg()); + MachineBasicBlock &BB = *I.getParent(); + + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450)) + .addImm(GL::InverseSqrt) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -1413,20 +1445,50 @@ static unsigned getArrayComponentCount(MachineRegisterInfo *MRI, } // Return true if the type represents a constant register -static bool isConstReg(MachineRegisterInfo *MRI, SPIRVType *OpDef) { +static bool isConstReg(MachineRegisterInfo *MRI, SPIRVType *OpDef, + SmallPtrSet<SPIRVType *, 4> &Visited) { if (OpDef->getOpcode() == 
SPIRV::ASSIGN_TYPE && OpDef->getOperand(1).isReg()) { if (SPIRVType *RefDef = MRI->getVRegDef(OpDef->getOperand(1).getReg())) OpDef = RefDef; } - return OpDef->getOpcode() == TargetOpcode::G_CONSTANT || - OpDef->getOpcode() == TargetOpcode::G_FCONSTANT; + + if (Visited.contains(OpDef)) + return true; + Visited.insert(OpDef); + + unsigned Opcode = OpDef->getOpcode(); + switch (Opcode) { + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_FCONSTANT: + return true; + case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: + return cast<GIntrinsic>(*OpDef).getIntrinsicID() == + Intrinsic::spv_const_composite; + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_SPLAT_VECTOR: { + for (unsigned i = OpDef->getNumExplicitDefs(); i < OpDef->getNumOperands(); + i++) { + SPIRVType *OpNestedDef = + OpDef->getOperand(i).isReg() + ? MRI->getVRegDef(OpDef->getOperand(i).getReg()) + : nullptr; + if (OpNestedDef && !isConstReg(MRI, OpNestedDef, Visited)) + return false; + } + return true; + } + } + return false; } // Return true if the virtual register represents a constant static bool isConstReg(MachineRegisterInfo *MRI, Register OpReg) { + SmallPtrSet<SPIRVType *, 4> Visited; if (SPIRVType *OpDef = MRI->getVRegDef(OpReg)) - return isConstReg(MRI, OpDef); + return isConstReg(MRI, OpDef, Visited); return false; } @@ -1740,15 +1802,18 @@ bool SPIRVInstructionSelector::selectOpUndef(Register ResVReg, static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI) { assert(MO.isReg()); const SPIRVType *TypeInst = MRI->getVRegDef(MO.getReg()); - if (TypeInst->getOpcode() != SPIRV::ASSIGN_TYPE) - return false; - assert(TypeInst->getOperand(1).isReg()); - MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg()); - return ImmInst->getOpcode() == TargetOpcode::G_CONSTANT; + if (TypeInst->getOpcode() == SPIRV::ASSIGN_TYPE) { + assert(TypeInst->getOperand(1).isReg()); + 
MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg()); + return ImmInst->getOpcode() == TargetOpcode::G_CONSTANT; + } + return TypeInst->getOpcode() == SPIRV::OpConstantI; } static int64_t foldImm(const MachineOperand &MO, MachineRegisterInfo *MRI) { const SPIRVType *TypeInst = MRI->getVRegDef(MO.getReg()); + if (TypeInst->getOpcode() == SPIRV::OpConstantI) + return TypeInst->getOperand(2).getImm(); MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg()); assert(ImmInst->getOpcode() == TargetOpcode::G_CONSTANT); return ImmInst->getOperand(1).getCImm()->getZExtValue(); @@ -1850,8 +1915,10 @@ bool SPIRVInstructionSelector::wrapIntoSpecConstantOp( Register OpReg = I.getOperand(i).getReg(); SPIRVType *OpDefine = MRI->getVRegDef(OpReg); SPIRVType *OpType = GR.getSPIRVTypeForVReg(OpReg); - if (!OpDefine || !OpType || isConstReg(MRI, OpDefine) || - OpDefine->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) { + SmallPtrSet<SPIRVType *, 4> Visited; + if (!OpDefine || !OpType || isConstReg(MRI, OpDefine, Visited) || + OpDefine->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST || + GR.isAggregateType(OpType)) { // The case of G_ADDRSPACE_CAST inside spv_const_composite() is processed // by selectAddrSpaceCast() CompositeArgs.push_back(OpReg); @@ -1992,6 +2059,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectAny(ResVReg, ResType, I); case Intrinsic::spv_lerp: return selectFmix(ResVReg, ResType, I); + case Intrinsic::spv_rsqrt: + return selectRsqrt(ResVReg, ResType, I); case Intrinsic::spv_lifetime_start: case Intrinsic::spv_lifetime_end: { unsigned Op = IID == Intrinsic::spv_lifetime_start ? 
SPIRV::OpLifetimeStart diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 57fbf3b3f8f1..6c7c3af19965 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -278,6 +278,12 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { G_FCOS, G_FSIN, G_FTAN, + G_FACOS, + G_FASIN, + G_FATAN, + G_FCOSH, + G_FSINH, + G_FTANH, G_FSQRT, G_FFLOOR, G_FRINT, diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 2744c25d1bc7..0747dd1bbaf4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -17,6 +17,8 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/CFG.h" @@ -71,7 +73,7 @@ public: /// terminator will take. llvm::Value *createExitVariable( BasicBlock *BB, - const std::unordered_map<BasicBlock *, ConstantInt *> &TargetToValue) { + const DenseMap<BasicBlock *, ConstantInt *> &TargetToValue) { auto *T = BB->getTerminator(); if (isa<ReturnInst>(T)) return nullptr; @@ -98,12 +100,12 @@ public: } // TODO: add support for switch cases. - assert(false && "Unhandled terminator type."); + llvm_unreachable("Unhandled terminator type."); } /// Replaces |BB|'s branch targets present in |ToReplace| with |NewTarget|. void replaceBranchTargets(BasicBlock *BB, - const std::unordered_set<BasicBlock *> ToReplace, + const SmallPtrSet<BasicBlock *, 4> &ToReplace, BasicBlock *NewTarget) { auto *T = BB->getTerminator(); if (isa<ReturnInst>(T)) @@ -133,7 +135,7 @@ public: bool runOnConvergenceRegionNoRecurse(LoopInfo &LI, const SPIRV::ConvergenceRegion *CR) { // Gather all the exit targets for this region. 
- std::unordered_set<BasicBlock *> ExitTargets; + SmallPtrSet<BasicBlock *, 4> ExitTargets; for (BasicBlock *Exit : CR->Exits) { for (BasicBlock *Target : gatherSuccessors(Exit)) { if (CR->Blocks.count(Target) == 0) @@ -164,9 +166,10 @@ public: // Creating one constant per distinct exit target. This will be route to the // correct target. - std::unordered_map<BasicBlock *, ConstantInt *> TargetToValue; + DenseMap<BasicBlock *, ConstantInt *> TargetToValue; for (BasicBlock *Target : SortedExitTargets) - TargetToValue.emplace(Target, Builder.getInt32(TargetToValue.size())); + TargetToValue.insert( + std::make_pair(Target, Builder.getInt32(TargetToValue.size()))); // Creating one variable per exit node, set to the constant matching the // targeted external block. @@ -184,12 +187,12 @@ public: } // Creating the switch to jump to the correct exit target. - std::vector<std::pair<BasicBlock *, ConstantInt *>> CasesList( - TargetToValue.begin(), TargetToValue.end()); - llvm::SwitchInst *Sw = - Builder.CreateSwitch(node, CasesList[0].first, CasesList.size() - 1); - for (size_t i = 1; i < CasesList.size(); i++) - Sw->addCase(CasesList[i].second, CasesList[i].first); + llvm::SwitchInst *Sw = Builder.CreateSwitch(node, SortedExitTargets[0], + SortedExitTargets.size() - 1); + for (size_t i = 1; i < SortedExitTargets.size(); i++) { + BasicBlock *BB = SortedExitTargets[i]; + Sw->addCase(TargetToValue[BB], BB); + } // Fix exit branches to redirect to the new exit. 
for (auto Exit : CR->Exits) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 30a6c474f467..ac0aa682ea4b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1168,6 +1168,15 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::AsmINTEL); } break; + case SPIRV::OpTypeCooperativeMatrixKHR: + if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_cooperative_matrix)) + report_fatal_error( + "OpTypeCooperativeMatrixKHR type requires the " + "following SPIR-V extension: SPV_KHR_cooperative_matrix", + false); + Reqs.addExtension(SPIRV::Extension::SPV_KHR_cooperative_matrix); + Reqs.addCapability(SPIRV::Capability::CooperativeMatrixKHR); + break; default: break; } diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index adc5b36af6f1..0ea2f176565e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -41,7 +41,8 @@ public: static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR, const SPIRVSubtarget &STI, - DenseMap<MachineInstr *, Type *> &TargetExtConstTypes) { + DenseMap<MachineInstr *, Type *> &TargetExtConstTypes, + SmallSet<Register, 4> &TrackedConstRegs) { MachineRegisterInfo &MRI = MF.getRegInfo(); DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT; SmallVector<MachineInstr *, 10> ToErase, ToEraseComposites; @@ -80,6 +81,7 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR, } } GR->add(Const, &MF, SrcReg); + TrackedConstRegs.insert(SrcReg); if (Const->getType()->isTargetExtTy()) { // remember association so that we can restore it when assign types MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); @@ -121,7 +123,9 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR, MI->eraseFromParent(); } -static void foldConstantsIntoIntrinsics(MachineFunction &MF) 
{ +static void +foldConstantsIntoIntrinsics(MachineFunction &MF, + const SmallSet<Register, 4> &TrackedConstRegs) { SmallVector<MachineInstr *, 10> ToErase; MachineRegisterInfo &MRI = MF.getRegInfo(); const unsigned AssignNameOperandShift = 2; @@ -137,7 +141,8 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) { MI.removeOperand(NumOp); MI.addOperand(MachineOperand::CreateImm( ConstMI->getOperand(1).getCImm()->getZExtValue())); - if (MRI.use_empty(ConstMI->getOperand(0).getReg())) + Register DefReg = ConstMI->getOperand(0).getReg(); + if (MRI.use_empty(DefReg) && !TrackedConstRegs.contains(DefReg)) ToErase.push_back(ConstMI); } } @@ -271,6 +276,21 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, return SpirvTy; } +// To support current approach and limitations wrt. bit width here we widen a +// scalar register with a bit width greater than 1 to valid sizes and cap it to +// 64 width. +static void widenScalarLLTNextPow2(Register Reg, MachineRegisterInfo &MRI) { + LLT RegType = MRI.getType(Reg); + if (!RegType.isScalar()) + return; + unsigned Sz = RegType.getScalarSizeInBits(); + if (Sz == 1) + return; + unsigned NewSz = std::min(std::max(1u << Log2_32_Ceil(Sz), 8u), 64u); + if (NewSz != Sz) + MRI.setType(Reg, LLT::scalar(NewSz)); +} + static std::pair<Register, unsigned> createNewIdReg(SPIRVType *SpvType, Register SrcReg, MachineRegisterInfo &MRI, const SPIRVGlobalRegistry &GR) { @@ -395,6 +415,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<MachineInstr *, 10> ToErase; + DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT; for (MachineBasicBlock *MBB : post_order(&MF)) { if (MBB->empty()) @@ -406,6 +427,11 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineInstr &MI = *MII; unsigned MIOp = MI.getOpcode(); + // validate bit width of scalar registers + for (const auto &MOP : MI.operands()) + if (MOP.isReg()) + 
widenScalarLLTNextPow2(MOP.getReg(), MRI); + if (isSpvIntrinsic(MI, Intrinsic::spv_assign_ptr_type)) { Register Reg = MI.getOperand(1).getReg(); MIB.setInsertPt(*MI.getParent(), MI.getIterator()); @@ -441,6 +467,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, // %rctmp = G_CONSTANT ty Val // %rc = ASSIGN_TYPE %rctmp, %cty Register Reg = MI.getOperand(0).getReg(); + bool NeedAssignType = true; if (MRI.hasOneUse(Reg)) { MachineInstr &UseMI = *MRI.use_instr_begin(Reg); if (isSpvIntrinsic(UseMI, Intrinsic::spv_assign_type) || @@ -453,7 +480,20 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, Ty = TargetExtIt == TargetExtConstTypes.end() ? MI.getOperand(1).getCImm()->getType() : TargetExtIt->second; - GR->add(MI.getOperand(1).getCImm(), &MF, Reg); + const ConstantInt *OpCI = MI.getOperand(1).getCImm(); + Register PrimaryReg = GR->find(OpCI, &MF); + if (!PrimaryReg.isValid()) { + GR->add(OpCI, &MF, Reg); + } else if (PrimaryReg != Reg && + MRI.getType(Reg) == MRI.getType(PrimaryReg)) { + auto *RCReg = MRI.getRegClassOrNull(Reg); + auto *RCPrimary = MRI.getRegClassOrNull(PrimaryReg); + if (!RCReg || RCPrimary == RCReg) { + RegsAlreadyAddedToDT[&MI] = PrimaryReg; + ToErase.push_back(&MI); + NeedAssignType = false; + } + } } else if (MIOp == TargetOpcode::G_FCONSTANT) { Ty = MI.getOperand(1).getFPImm()->getType(); } else { @@ -472,14 +512,10 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MI.getNumExplicitOperands() - MI.getNumExplicitDefs(); Ty = VectorType::get(ElemTy, NumElts, false); } - insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI); + if (NeedAssignType) + insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI); } else if (MIOp == TargetOpcode::G_GLOBAL_VALUE) { propagateSPIRVType(&MI, GR, MRI, MIB); - } else if (MIOp == TargetOpcode::G_BITREVERSE) { - Register Reg = MI.getOperand(0).getReg(); - LLT RegType = MRI.getType(Reg); - if (RegType.getSizeInBits() < 32) - MRI.setType(Reg, LLT::scalar(32)); } if 
(MII == Begin) @@ -488,8 +524,12 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, --MII; } } - for (MachineInstr *MI : ToErase) + for (MachineInstr *MI : ToErase) { + auto It = RegsAlreadyAddedToDT.find(MI); + if (RegsAlreadyAddedToDT.contains(MI)) + MRI.replaceRegWith(MI->getOperand(0).getReg(), It->second); MI->eraseFromParent(); + } // Address the case when IRTranslator introduces instructions with new // registers without SPIRVType associated. @@ -821,8 +861,10 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { MachineIRBuilder MIB(MF); // a registry of target extension constants DenseMap<MachineInstr *, Type *> TargetExtConstTypes; - addConstantsToTrack(MF, GR, ST, TargetExtConstTypes); - foldConstantsIntoIntrinsics(MF); + // to keep record of tracked constants + SmallSet<Register, 4> TrackedConstRegs; + addConstantsToTrack(MF, GR, ST, TargetExtConstTypes, TrackedConstRegs); + foldConstantsIntoIntrinsics(MF, TrackedConstRegs); insertBitcasts(MF, GR, MIB); generateAssignInstrs(MF, GR, MIB, TargetExtConstTypes); processSwitches(MF, GR, MIB); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 318c5cebb7a4..96601dd8796c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -302,6 +302,7 @@ defm SPV_INTEL_inline_assembly : ExtensionOperand<107>; defm SPV_INTEL_cache_controls : ExtensionOperand<108>; defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>; defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>; +defm SPV_KHR_cooperative_matrix : ExtensionOperand<111>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -478,6 +479,7 @@ defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_gl defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, 
[SPV_INTEL_global_variable_host_access], []>; defm GlobalVariableFPGADecorationsINTEL : CapabilityOperand<6189, 0, 0, [SPV_INTEL_global_variable_fpga_decorations], []>; defm CacheControlsINTEL : CapabilityOperand<6441, 0, 0, [SPV_INTEL_cache_controls], []>; +defm CooperativeMatrixKHR : CapabilityOperand<6022, 0, 0, [SPV_KHR_cooperative_matrix], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index c1b90b0e9d88..927683ad7e32 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -253,7 +253,11 @@ SPIRV::MemorySemantics::MemorySemantics getMemSemantics(AtomicOrdering Ord) { MachineInstr *getDefInstrMaybeConstant(Register &ConstReg, const MachineRegisterInfo *MRI) { - MachineInstr *ConstInstr = MRI->getVRegDef(ConstReg); + MachineInstr *MI = MRI->getVRegDef(ConstReg); + MachineInstr *ConstInstr = + MI->getOpcode() == SPIRV::G_TRUNC || MI->getOpcode() == SPIRV::G_ZEXT + ? 
MRI->getVRegDef(MI->getOperand(1).getReg()) + : MI; if (auto *GI = dyn_cast<GIntrinsic>(ConstInstr)) { if (GI->is(Intrinsic::spv_track_constant)) { ConstReg = ConstInstr->getOperand(2).getReg(); diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index c131eecb1c13..12725d6bac14 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -160,5 +160,29 @@ inline Type *toTypedPointer(Type *Ty) { : Ty; } +inline Type *toTypedFunPointer(FunctionType *FTy) { + Type *OrigRetTy = FTy->getReturnType(); + Type *RetTy = toTypedPointer(OrigRetTy); + bool IsUntypedPtr = false; + for (Type *PTy : FTy->params()) { + if (isUntypedPointerTy(PTy)) { + IsUntypedPtr = true; + break; + } + } + if (!IsUntypedPtr && RetTy == OrigRetTy) + return FTy; + SmallVector<Type *> ParamTys; + for (Type *PTy : FTy->params()) + ParamTys.push_back(toTypedPointer(PTy)); + return FunctionType::get(RetTy, ParamTys, FTy->isVarArg()); +} + +inline const Type *unifyPtrType(const Type *Ty) { + if (auto FTy = dyn_cast<FunctionType>(Ty)) + return toTypedFunPointer(const_cast<FunctionType *>(FTy)); + return toTypedPointer(const_cast<Type *>(Ty)); +} + } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 3d8637bb8c35..af634a7da71c 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -107,10 +107,15 @@ class SparcAsmParser : public MCTargetAsmParser { ParseStatus parseBranchModifiers(OperandVector &Operands); + ParseStatus parseExpression(int64_t &Val); + // Helper function for dealing with %lo / %hi in PIC mode. const SparcMCExpr *adjustPICRelocation(SparcMCExpr::VariantKind VK, const MCExpr *subExpr); + // Helper function to see if current token can start an expression. 
+ bool isPossibleExpression(const AsmToken &Token); + // returns true if Tok is matched to a register and returns register in RegNo. MCRegister matchRegisterName(const AsmToken &Tok, unsigned &RegKind); @@ -1085,38 +1090,35 @@ ParseStatus SparcAsmParser::parseASITag(OperandVector &Operands) { SMLoc E = Parser.getTok().getEndLoc(); int64_t ASIVal = 0; - switch (getLexer().getKind()) { - case AsmToken::LParen: - case AsmToken::Integer: - case AsmToken::Identifier: - case AsmToken::Plus: - case AsmToken::Minus: - case AsmToken::Tilde: - if (getParser().parseAbsoluteExpression(ASIVal) || !isUInt<8>(ASIVal)) - return Error(S, "invalid ASI number, must be between 0 and 255"); - break; - case AsmToken::Hash: { - // For now we only support named tags for 64-bit/V9 systems. - // TODO: add support for 32-bit/V8 systems. - SMLoc TagStart = getLexer().peekTok(false).getLoc(); - Parser.Lex(); // Eat the '#'. - const StringRef ASIName = Parser.getTok().getString(); - const SparcASITag::ASITag *ASITag = - SparcASITag::lookupASITagByName(ASIName); - if (!ASITag) - ASITag = SparcASITag::lookupASITagByAltName(ASIName); - Parser.Lex(); // Eat the identifier token. + if (getLexer().getKind() != AsmToken::Hash) { + // If the ASI tag provided is not a named tag, then it + // must be a constant expression. + ParseStatus ParseExprStatus = parseExpression(ASIVal); + if (!ParseExprStatus.isSuccess()) + return ParseExprStatus; - if (!ASITag) - return Error(TagStart, "unknown ASI tag"); + if (!isUInt<8>(ASIVal)) + return Error(S, "invalid ASI number, must be between 0 and 255"); - ASIVal = ASITag->Encoding; - break; - } - default: - return ParseStatus::NoMatch; + Operands.push_back(SparcOperand::CreateASITag(ASIVal, S, E)); + return ParseStatus::Success; } + // For now we only support named tags for 64-bit/V9 systems. + // TODO: add support for 32-bit/V8 systems. + SMLoc TagStart = getLexer().peekTok(false).getLoc(); + Parser.Lex(); // Eat the '#'. 
+ const StringRef ASIName = Parser.getTok().getString(); + const SparcASITag::ASITag *ASITag = SparcASITag::lookupASITagByName(ASIName); + if (!ASITag) + ASITag = SparcASITag::lookupASITagByAltName(ASIName); + Parser.Lex(); // Eat the identifier token. + + if (!ASITag) + return Error(TagStart, "unknown ASI tag"); + + ASIVal = ASITag->Encoding; + Operands.push_back(SparcOperand::CreateASITag(ASIVal, S, E)); return ParseStatus::Success; } @@ -1126,35 +1128,32 @@ ParseStatus SparcAsmParser::parsePrefetchTag(OperandVector &Operands) { SMLoc E = Parser.getTok().getEndLoc(); int64_t PrefetchVal = 0; - switch (getLexer().getKind()) { - case AsmToken::LParen: - case AsmToken::Integer: - case AsmToken::Identifier: - case AsmToken::Plus: - case AsmToken::Minus: - case AsmToken::Tilde: - if (getParser().parseAbsoluteExpression(PrefetchVal) || - !isUInt<5>(PrefetchVal)) - return Error(S, "invalid prefetch number, must be between 0 and 31"); - break; - case AsmToken::Hash: { - SMLoc TagStart = getLexer().peekTok(false).getLoc(); - Parser.Lex(); // Eat the '#'. - const StringRef PrefetchName = Parser.getTok().getString(); - const SparcPrefetchTag::PrefetchTag *PrefetchTag = - SparcPrefetchTag::lookupPrefetchTagByName(PrefetchName); - Parser.Lex(); // Eat the identifier token. + if (getLexer().getKind() != AsmToken::Hash) { + // If the prefetch tag provided is not a named tag, then it + // must be a constant expression. 
+ ParseStatus ParseExprStatus = parseExpression(PrefetchVal); + if (!ParseExprStatus.isSuccess()) + return ParseExprStatus; - if (!PrefetchTag) - return Error(TagStart, "unknown prefetch tag"); + if (!isUInt<8>(PrefetchVal)) + return Error(S, "invalid prefetch number, must be between 0 and 31"); - PrefetchVal = PrefetchTag->Encoding; - break; - } - default: - return ParseStatus::NoMatch; + Operands.push_back(SparcOperand::CreatePrefetchTag(PrefetchVal, S, E)); + return ParseStatus::Success; } + SMLoc TagStart = getLexer().peekTok(false).getLoc(); + Parser.Lex(); // Eat the '#'. + const StringRef PrefetchName = Parser.getTok().getString(); + const SparcPrefetchTag::PrefetchTag *PrefetchTag = + SparcPrefetchTag::lookupPrefetchTagByName(PrefetchName); + Parser.Lex(); // Eat the identifier token. + + if (!PrefetchTag) + return Error(TagStart, "unknown prefetch tag"); + + PrefetchVal = PrefetchTag->Encoding; + Operands.push_back(SparcOperand::CreatePrefetchTag(PrefetchVal, S, E)); return ParseStatus::Success; } @@ -1370,6 +1369,15 @@ ParseStatus SparcAsmParser::parseBranchModifiers(OperandVector &Operands) { return ParseStatus::Success; } +ParseStatus SparcAsmParser::parseExpression(int64_t &Val) { + AsmToken Tok = getLexer().getTok(); + + if (!isPossibleExpression(Tok)) + return ParseStatus::NoMatch; + + return getParser().parseAbsoluteExpression(Val); +} + #define GET_REGISTER_MATCHER #include "SparcGenAsmMatcher.inc" @@ -1600,6 +1608,20 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, return true; } +bool SparcAsmParser::isPossibleExpression(const AsmToken &Token) { + switch (Token.getKind()) { + case AsmToken::LParen: + case AsmToken::Integer: + case AsmToken::Identifier: + case AsmToken::Plus: + case AsmToken::Minus: + case AsmToken::Tilde: + return true; + default: + return false; + } +} + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmParser() { RegisterMCAsmParser<SparcAsmParser> A(getTheSparcTarget()); 
RegisterMCAsmParser<SparcAsmParser> B(getTheSparcV9Target()); diff --git a/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/llvm/lib/Target/Sparc/SparcInstr64Bit.td index 93862414fb35..6b7813745165 100644 --- a/llvm/lib/Target/Sparc/SparcInstr64Bit.td +++ b/llvm/lib/Target/Sparc/SparcInstr64Bit.td @@ -478,7 +478,7 @@ def : Pat<(i64 (atomic_load_64 ADDRri:$src)), (LDXri ADDRri:$src)>; def : Pat<(atomic_store_64 i64:$val, ADDRrr:$dst), (STXrr ADDRrr:$dst, $val)>; def : Pat<(atomic_store_64 i64:$val, ADDRri:$dst), (STXri ADDRri:$dst, $val)>; -def : Pat<(atomic_cmp_swap_64 i64:$rs1, i64:$rs2, i64:$swap), +def : Pat<(atomic_cmp_swap_i64 i64:$rs1, i64:$rs2, i64:$swap), (CASXArr $rs1, $rs2, $swap, 0x80)>; } // Predicates = [Is64Bit] diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index cac96a139872..7b074231ec62 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -744,11 +744,11 @@ let Constraints = "$val = $rd" in { def SWAPrr : F3_1<3, 0b001111, (outs IntRegs:$rd), (ins (MEMrr $rs1, $rs2):$addr, IntRegs:$val), "swap [$addr], $rd", - [(set i32:$rd, (atomic_swap_32 ADDRrr:$addr, i32:$val))]>; + [(set i32:$rd, (atomic_swap_i32 ADDRrr:$addr, i32:$val))]>; def SWAPri : F3_2<3, 0b001111, (outs IntRegs:$rd), (ins (MEMri $rs1, $simm13):$addr, IntRegs:$val), "swap [$addr], $rd", - [(set i32:$rd, (atomic_swap_32 ADDRri:$addr, i32:$val))]>; + [(set i32:$rd, (atomic_swap_i32 ADDRri:$addr, i32:$val))]>; def SWAPArr : F3_1_asi<3, 0b011111, (outs IntRegs:$rd), (ins (MEMrr $rs1, $rs2):$addr, ASITag:$asi, IntRegs:$val), "swapa [$addr] $asi, $rd", @@ -1913,12 +1913,12 @@ def : Pat<(atomic_store_32 i32:$val, ADDRrr:$dst), (STrr ADDRrr:$dst, $val)>; def : Pat<(atomic_store_32 i32:$val, ADDRri:$dst), (STri ADDRri:$dst, $val)>; let Predicates = [HasV9] in -def : Pat<(atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap), +def : Pat<(atomic_cmp_swap_i32 iPTR:$rs1, i32:$rs2, i32:$swap), (CASArr $rs1, $rs2, 
$swap, 0x80)>; // Same pattern as CASArr above, but with a different ASI. let Predicates = [HasLeonCASA] in -def : Pat<(atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap), +def : Pat<(atomic_cmp_swap_i32 iPTR:$rs1, i32:$rs2, i32:$swap), (CASArr $rs1, $rs2, $swap, 0x0A)>; // A register pair with zero upper half. diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 7f3a143aad97..7c6ab3f9b1ab 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -1733,16 +1733,16 @@ let hasSideEffects = 1 in def Serialize : Alias<2, (outs), (ins), []>; let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { - def LAA : LoadAndOpRSY<"laa", 0xEBF8, atomic_load_add_32, GR32>; - def LAAG : LoadAndOpRSY<"laag", 0xEBE8, atomic_load_add_64, GR64>; + def LAA : LoadAndOpRSY<"laa", 0xEBF8, atomic_load_add_i32, GR32>; + def LAAG : LoadAndOpRSY<"laag", 0xEBE8, atomic_load_add_i64, GR64>; def LAAL : LoadAndOpRSY<"laal", 0xEBFA, null_frag, GR32>; def LAALG : LoadAndOpRSY<"laalg", 0xEBEA, null_frag, GR64>; - def LAN : LoadAndOpRSY<"lan", 0xEBF4, atomic_load_and_32, GR32>; - def LANG : LoadAndOpRSY<"lang", 0xEBE4, atomic_load_and_64, GR64>; - def LAO : LoadAndOpRSY<"lao", 0xEBF6, atomic_load_or_32, GR32>; - def LAOG : LoadAndOpRSY<"laog", 0xEBE6, atomic_load_or_64, GR64>; - def LAX : LoadAndOpRSY<"lax", 0xEBF7, atomic_load_xor_32, GR32>; - def LAXG : LoadAndOpRSY<"laxg", 0xEBE7, atomic_load_xor_64, GR64>; + def LAN : LoadAndOpRSY<"lan", 0xEBF4, atomic_load_and_i32, GR32>; + def LANG : LoadAndOpRSY<"lang", 0xEBE4, atomic_load_and_i64, GR64>; + def LAO : LoadAndOpRSY<"lao", 0xEBF6, atomic_load_or_i32, GR32>; + def LAOG : LoadAndOpRSY<"laog", 0xEBE6, atomic_load_or_i64, GR64>; + def LAX : LoadAndOpRSY<"lax", 0xEBF7, atomic_load_xor_i32, GR32>; + def LAXG : LoadAndOpRSY<"laxg", 0xEBE7, atomic_load_xor_i64, GR64>; } def ATOMIC_SWAPW : AtomicLoadWBinaryReg<z_atomic_swapw>; diff --git 
a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp index 8073ed0e2a3c..bf8d109ff71f 100644 --- a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp @@ -58,7 +58,7 @@ FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) { void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -75,7 +75,8 @@ bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) { return false; } - MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *DT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); return VisitNode(DT->getRootNode(), 0); } diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index cbad5a0eafb2..75ef3b7336db 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -1158,9 +1158,9 @@ defm ATMAM : RRCASm<"atmam", 0x53, I64, i64, uimm0to2>; // Section 8.2.20 - CAS (Compare and Swap) let DecoderMethod = "DecodeCASI64" in -defm CASL : RRCASm<"cas.l", 0x62, I64, i64, simm7, atomic_cmp_swap_64>; +defm CASL : RRCASm<"cas.l", 0x62, I64, i64, simm7, atomic_cmp_swap_i64>; let DecoderMethod = "DecodeCASI32", cx = 1 in -defm CASW : RRCASm<"cas.w", 0x62, I32, i32, simm7, atomic_cmp_swap_32>; +defm CASW : RRCASm<"cas.w", 0x62, I32, i32, simm7, atomic_cmp_swap_i32>; //----------------------------------------------------------------------------- // Section 8.3 - Transfer Control Instructions @@ -1896,9 +1896,9 @@ defm : TRATMSTm<atomic_store_32, STLrri, STLrii, STLzri, STLzii>; // Atomic swaps def : Pat<(i32 (ts1am i64:$src, i32:$flag, i32:$new)), (TS1AMWrir $src, 0, $flag, $new)>; -def : Pat<(i32 (atomic_swap_32 ADDRri:$src, i32:$new)), +def : Pat<(i32 (atomic_swap_i32 ADDRri:$src, i32:$new)), 
(TS1AMWrii MEMriRRM:$src, 15, $new)>; -def : Pat<(i64 (atomic_swap_64 ADDRri:$src, i64:$new)), +def : Pat<(i64 (atomic_swap_i64 ADDRri:$src, i64:$new)), (TS1AMLrir MEMriRRM:$src, (LEAzii 0, 0, 255), i64:$new)>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index f5bc584ac4e1..3cc2cc0e830f 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -757,7 +757,7 @@ public: bool CheckDataSection() { if (CurrentState != DataSection) { auto WS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first); - if (WS && WS->getKind().isText()) + if (WS && WS->isText()) return error("data directive must occur in a data segment: ", Lexer.getTok()); } @@ -1074,7 +1074,7 @@ public: void doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) override { // Code below only applies to labels in text sections. 
auto CWS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first); - if (!CWS || !CWS->getKind().isText()) + if (!CWS || !CWS->isText()) return; auto WasmSym = cast<MCSymbolWasm>(Symbol); diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 43c67b4b4749..b76179b1cf6e 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -122,7 +122,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType( return wasm::R_WASM_MEMORY_ADDR_LEB64; case FK_Data_4: if (SymA.isFunction()) { - if (FixupSection.getKind().isMetadata()) + if (FixupSection.isMetadata()) return wasm::R_WASM_FUNCTION_OFFSET_I32; assert(FixupSection.isWasmData()); return wasm::R_WASM_TABLE_INDEX_I32; @@ -131,7 +131,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType( return wasm::R_WASM_GLOBAL_INDEX_I32; if (auto Section = static_cast<const MCSectionWasm *>( getTargetSection(Fixup.getValue()))) { - if (Section->getKind().isText()) + if (Section->isText()) return wasm::R_WASM_FUNCTION_OFFSET_I32; else if (!Section->isWasmData()) return wasm::R_WASM_SECTION_OFFSET_I32; @@ -140,7 +140,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType( : wasm::R_WASM_MEMORY_ADDR_I32; case FK_Data_8: if (SymA.isFunction()) { - if (FixupSection.getKind().isMetadata()) + if (FixupSection.isMetadata()) return wasm::R_WASM_FUNCTION_OFFSET_I64; return wasm::R_WASM_TABLE_INDEX_I64; } @@ -148,7 +148,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType( llvm_unreachable("unimplemented R_WASM_GLOBAL_INDEX_I64"); if (auto Section = static_cast<const MCSectionWasm *>( getTargetSection(Fixup.getValue()))) { - if (Section->getKind().isText()) + if (Section->isText()) return wasm::R_WASM_FUNCTION_OFFSET_I64; else if (!Section->isWasmData()) llvm_unreachable("unimplemented R_WASM_SECTION_OFFSET_I64"); diff 
--git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 0b7ec6e74cab..b0a97c725c87 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -319,8 +319,8 @@ void WebAssemblyAsmPrinter::emitDecls(const Module &M) { // Emit .globaltype, .tagtype, or .tabletype declarations for extern // declarations, i.e. those that have only been declared (but not defined) // in the current module - auto Sym = cast<MCSymbolWasm>(It.getValue()); - if (!Sym->isDefined()) + auto Sym = cast_or_null<MCSymbolWasm>(It.getValue().Symbol); + if (Sym && !Sym->isDefined()) emitSymbolType(Sym); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index 06758e465197..f746bf4307a0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -53,8 +53,8 @@ class WebAssemblyCFGSort final : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); AU.addRequired<WebAssemblyExceptionInfo>(); @@ -387,7 +387,7 @@ bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) { const auto &MLI = getAnalysis<MachineLoopInfo>(); const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); // Liveness is not tracked for VALUE_STACK physreg. 
MF.getRegInfo().invalidateLiveness(); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index d8cbddf74545..77e82a32545f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -48,7 +48,7 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { StringRef getPassName() const override { return "WebAssembly CFG Stackify"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineLoopInfo>(); AU.addRequired<WebAssemblyExceptionInfo>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -252,7 +252,7 @@ void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) { void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { assert(!MBB.isEHPad()); MachineFunction &MF = *MBB.getParent(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); @@ -465,7 +465,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { assert(MBB.isEHPad()); MachineFunction &MF = *MBB.getParent(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); const auto &MLI = getAnalysis<MachineLoopInfo>(); const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>(); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp index 8deac76b2bc3..b312ca7f5346 100644 --- 
a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp @@ -31,7 +31,7 @@ char WebAssemblyExceptionInfo::ID = 0; INITIALIZE_PASS_BEGIN(WebAssemblyExceptionInfo, DEBUG_TYPE, "WebAssembly Exception Information", true, true) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(WebAssemblyExceptionInfo, DEBUG_TYPE, "WebAssembly Exception Information", true, true) @@ -45,7 +45,7 @@ bool WebAssemblyExceptionInfo::runOnMachineFunction(MachineFunction &MF) { ExceptionHandling::Wasm || !MF.getFunction().hasPersonalityFn()) return false; - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); auto &MDF = getAnalysis<MachineDominanceFrontier>(); recalculate(MF, MDT, MDF); LLVM_DEBUG(dump()); @@ -207,12 +207,12 @@ void WebAssemblyExceptionInfo::recalculate( auto *SrcWE = P.first; auto *DstWE = P.second; - for (auto *MBB : SrcWE->getBlocksSet()) { + SrcWE->getBlocksSet().remove_if([&](MachineBasicBlock *MBB){ if (MBB->isEHPad()) { assert(!isReachableAmongDominated(DstWE->getEHPad(), MBB, SrcWE->getEHPad(), MDT) && "We already handled EH pads above"); - continue; + return false; } if (isReachableAmongDominated(DstWE->getEHPad(), MBB, SrcWE->getEHPad(), MDT)) { @@ -227,15 +227,16 @@ void WebAssemblyExceptionInfo::recalculate( InnerWE->removeFromBlocksSet(MBB); InnerWE = InnerWE->getParentException(); } - SrcWE->removeFromBlocksSet(MBB); LLVM_DEBUG(dbgs() << " removed from " << SrcWE->getEHPad()->getNumber() << "." 
<< SrcWE->getEHPad()->getName() << "'s exception\n"); changeExceptionFor(MBB, SrcWE->getParentException()); if (SrcWE->getParentException()) SrcWE->getParentException()->addToBlocksSet(MBB); + return true; } - } + return false; + }); } // Add BBs to exceptions' block vector @@ -273,7 +274,7 @@ void WebAssemblyExceptionInfo::releaseMemory() { void WebAssemblyExceptionInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<MachineDominanceFrontier>(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index 4623ce9b5c38..46bd5e42a9d5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -351,17 +351,17 @@ multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, string inst_32, defm : BinRMWPat<i64, rmw_64, inst_64>; } -defm : BinRMWPattern<atomic_load_add_32, atomic_load_add_64, +defm : BinRMWPattern<atomic_load_add_i32, atomic_load_add_i64, "ATOMIC_RMW_ADD_I32", "ATOMIC_RMW_ADD_I64">; -defm : BinRMWPattern<atomic_load_sub_32, atomic_load_sub_64, +defm : BinRMWPattern<atomic_load_sub_i32, atomic_load_sub_i64, "ATOMIC_RMW_SUB_I32", "ATOMIC_RMW_SUB_I64">; -defm : BinRMWPattern<atomic_load_and_32, atomic_load_and_64, +defm : BinRMWPattern<atomic_load_and_i32, atomic_load_and_i64, "ATOMIC_RMW_AND_I32", "ATOMIC_RMW_AND_I64">; -defm : BinRMWPattern<atomic_load_or_32, atomic_load_or_64, +defm : BinRMWPattern<atomic_load_or_i32, atomic_load_or_i64, "ATOMIC_RMW_OR_I32", "ATOMIC_RMW_OR_I64">; -defm : BinRMWPattern<atomic_load_xor_32, atomic_load_xor_64, +defm : BinRMWPattern<atomic_load_xor_i32, atomic_load_xor_i64, "ATOMIC_RMW_XOR_I32", "ATOMIC_RMW_XOR_I64">; -defm : BinRMWPattern<atomic_swap_32, atomic_swap_64, +defm : BinRMWPattern<atomic_swap_i32, 
atomic_swap_i64, "ATOMIC_RMW_XCHG_I32", "ATOMIC_RMW_XCHG_I64">; // Truncating & zero-extending binary RMW patterns. @@ -408,27 +408,27 @@ multiclass BinRMWTruncExtPattern< } defm : BinRMWTruncExtPattern< - atomic_load_add_8, atomic_load_add_16, atomic_load_add_32, + atomic_load_add_i8, atomic_load_add_i16, atomic_load_add_i32, "ATOMIC_RMW8_U_ADD_I32", "ATOMIC_RMW16_U_ADD_I32", "ATOMIC_RMW8_U_ADD_I64", "ATOMIC_RMW16_U_ADD_I64", "ATOMIC_RMW32_U_ADD_I64">; defm : BinRMWTruncExtPattern< - atomic_load_sub_8, atomic_load_sub_16, atomic_load_sub_32, + atomic_load_sub_i8, atomic_load_sub_i16, atomic_load_sub_i32, "ATOMIC_RMW8_U_SUB_I32", "ATOMIC_RMW16_U_SUB_I32", "ATOMIC_RMW8_U_SUB_I64", "ATOMIC_RMW16_U_SUB_I64", "ATOMIC_RMW32_U_SUB_I64">; defm : BinRMWTruncExtPattern< - atomic_load_and_8, atomic_load_and_16, atomic_load_and_32, + atomic_load_and_i8, atomic_load_and_i16, atomic_load_and_i32, "ATOMIC_RMW8_U_AND_I32", "ATOMIC_RMW16_U_AND_I32", "ATOMIC_RMW8_U_AND_I64", "ATOMIC_RMW16_U_AND_I64", "ATOMIC_RMW32_U_AND_I64">; defm : BinRMWTruncExtPattern< - atomic_load_or_8, atomic_load_or_16, atomic_load_or_32, + atomic_load_or_i8, atomic_load_or_i16, atomic_load_or_i32, "ATOMIC_RMW8_U_OR_I32", "ATOMIC_RMW16_U_OR_I32", "ATOMIC_RMW8_U_OR_I64", "ATOMIC_RMW16_U_OR_I64", "ATOMIC_RMW32_U_OR_I64">; defm : BinRMWTruncExtPattern< - atomic_load_xor_8, atomic_load_xor_16, atomic_load_xor_32, + atomic_load_xor_i8, atomic_load_xor_i16, atomic_load_xor_i32, "ATOMIC_RMW8_U_XOR_I32", "ATOMIC_RMW16_U_XOR_I32", "ATOMIC_RMW8_U_XOR_I64", "ATOMIC_RMW16_U_XOR_I64", "ATOMIC_RMW32_U_XOR_I64">; defm : BinRMWTruncExtPattern< - atomic_swap_8, atomic_swap_16, atomic_swap_32, + atomic_swap_i8, atomic_swap_i16, atomic_swap_i32, "ATOMIC_RMW8_U_XCHG_I32", "ATOMIC_RMW16_U_XCHG_I32", "ATOMIC_RMW8_U_XCHG_I64", "ATOMIC_RMW16_U_XCHG_I64", "ATOMIC_RMW32_U_XCHG_I64">; @@ -485,8 +485,8 @@ multiclass TerRMWPat<ValueType ty, PatFrag kind, string inst> { Requires<[HasAddr64, HasAtomics]>; } -defm : TerRMWPat<i32, 
atomic_cmp_swap_32, "ATOMIC_RMW_CMPXCHG_I32">; -defm : TerRMWPat<i64, atomic_cmp_swap_64, "ATOMIC_RMW_CMPXCHG_I64">; +defm : TerRMWPat<i32, atomic_cmp_swap_i32, "ATOMIC_RMW_CMPXCHG_I32">; +defm : TerRMWPat<i64, atomic_cmp_swap_i64, "ATOMIC_RMW_CMPXCHG_I64">; // Truncating & zero-extending ternary RMW patterns. // DAG legalization & optimization before instruction selection may introduce @@ -524,13 +524,13 @@ class sext_ter_rmw_8_64<PatFrag kind> : class sext_ter_rmw_16_64<PatFrag kind> : sext_ter_rmw_8_64<kind>; // 32->64 sext RMW gets selected as i32.atomic.rmw.***, i64.extend_i32_s -defm : TerRMWPat<i32, zext_ter_rmw_8_32<atomic_cmp_swap_8>, "ATOMIC_RMW8_U_CMPXCHG_I32">; -defm : TerRMWPat<i32, zext_ter_rmw_16_32<atomic_cmp_swap_16>, "ATOMIC_RMW16_U_CMPXCHG_I32">; -defm : TerRMWPat<i64, zext_ter_rmw_8_64<atomic_cmp_swap_8>, "ATOMIC_RMW8_U_CMPXCHG_I64">; -defm : TerRMWPat<i64, zext_ter_rmw_16_64<atomic_cmp_swap_16>, "ATOMIC_RMW16_U_CMPXCHG_I64">; -defm : TerRMWPat<i64, zext_ter_rmw_32_64<atomic_cmp_swap_32>, "ATOMIC_RMW32_U_CMPXCHG_I64">; +defm : TerRMWPat<i32, zext_ter_rmw_8_32<atomic_cmp_swap_i8>, "ATOMIC_RMW8_U_CMPXCHG_I32">; +defm : TerRMWPat<i32, zext_ter_rmw_16_32<atomic_cmp_swap_i16>, "ATOMIC_RMW16_U_CMPXCHG_I32">; +defm : TerRMWPat<i64, zext_ter_rmw_8_64<atomic_cmp_swap_i8>, "ATOMIC_RMW8_U_CMPXCHG_I64">; +defm : TerRMWPat<i64, zext_ter_rmw_16_64<atomic_cmp_swap_i16>, "ATOMIC_RMW16_U_CMPXCHG_I64">; +defm : TerRMWPat<i64, zext_ter_rmw_32_64<atomic_cmp_swap_i32>, "ATOMIC_RMW32_U_CMPXCHG_I64">; -defm : TerRMWPat<i32, sext_ter_rmw_8_32<atomic_cmp_swap_8>, "ATOMIC_RMW8_U_CMPXCHG_I32">; -defm : TerRMWPat<i32, sext_ter_rmw_16_32<atomic_cmp_swap_16>, "ATOMIC_RMW16_U_CMPXCHG_I32">; -defm : TerRMWPat<i64, sext_ter_rmw_8_64<atomic_cmp_swap_8>, "ATOMIC_RMW8_U_CMPXCHG_I64">; -defm : TerRMWPat<i64, sext_ter_rmw_16_64<atomic_cmp_swap_16>, "ATOMIC_RMW16_U_CMPXCHG_I64">; +defm : TerRMWPat<i32, sext_ter_rmw_8_32<atomic_cmp_swap_i8>, "ATOMIC_RMW8_U_CMPXCHG_I32">; +defm : 
TerRMWPat<i32, sext_ter_rmw_16_32<atomic_cmp_swap_i16>, "ATOMIC_RMW16_U_CMPXCHG_I32">; +defm : TerRMWPat<i64, sext_ter_rmw_8_64<atomic_cmp_swap_i8>, "ATOMIC_RMW8_U_CMPXCHG_I64">; +defm : TerRMWPat<i64, sext_ter_rmw_16_64<atomic_cmp_swap_i16>, "ATOMIC_RMW16_U_CMPXCHG_I64">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 3c97befcea1a..2ee430c88169 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1320,16 +1320,23 @@ def : Pat<(v8f16 (int_wasm_pmax (v8f16 V128:$lhs), (v8f16 V128:$rhs))), //===----------------------------------------------------------------------===// multiclass SIMDConvert<Vec vec, Vec arg, SDPatternOperator op, string name, - bits<32> simdop> { + bits<32> simdop, list<Predicate> reqs = []> { defm op#_#vec : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), [(set (vec.vt V128:$dst), (vec.vt (op (arg.vt V128:$vec))))], - vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name, simdop>; + vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name, simdop, reqs>; +} + +multiclass HalfPrecisionConvert<Vec vec, Vec arg, SDPatternOperator op, + string name, bits<32> simdop> { + defm "" : SIMDConvert<vec, arg, op, name, simdop, [HasHalfPrecision]>; } // Floating point to integer with saturation: trunc_sat defm "" : SIMDConvert<I32x4, F32x4, fp_to_sint, "trunc_sat_f32x4_s", 248>; defm "" : SIMDConvert<I32x4, F32x4, fp_to_uint, "trunc_sat_f32x4_u", 249>; +defm "" : HalfPrecisionConvert<I16x8, F16x8, fp_to_sint, "trunc_sat_f16x8_s", 0x148>; +defm "" : HalfPrecisionConvert<I16x8, F16x8, fp_to_uint, "trunc_sat_f16x8_u", 0x149>; // Support the saturating variety as well. 
def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i32)>; @@ -1355,6 +1362,8 @@ defm "" : SIMDConvert<F32x4, I32x4, sint_to_fp, "convert_i32x4_s", 250>; defm "" : SIMDConvert<F32x4, I32x4, uint_to_fp, "convert_i32x4_u", 251>; defm "" : SIMDConvert<F64x2, I32x4, convert_low_s, "convert_low_i32x4_s", 0xfe>; defm "" : SIMDConvert<F64x2, I32x4, convert_low_u, "convert_low_i32x4_u", 0xff>; +defm "" : HalfPrecisionConvert<F16x8, I16x8, sint_to_fp, "convert_i16x8_s", 0x14a>; +defm "" : HalfPrecisionConvert<F16x8, I16x8, uint_to_fp, "convert_i16x8_u", 0x14b>; // Extending operations // TODO: refactor this to be uniform for i64x2 if the numbering is not changed. @@ -1480,23 +1489,24 @@ defm "" : RelaxedConvert<I32x4, F64x2, int_wasm_relaxed_trunc_unsigned_zero, // Relaxed (Negative) Multiply-Add (madd/nmadd) //===----------------------------------------------------------------------===// -multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS> { +multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> reqs> { defm MADD_#vec : - RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), - [(set (vec.vt V128:$dst), (int_wasm_relaxed_madd - (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], - vec.prefix#".relaxed_madd\t$dst, $a, $b, $c", - vec.prefix#".relaxed_madd", simdopA>; + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), + [(set (vec.vt V128:$dst), (int_wasm_relaxed_madd + (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], + vec.prefix#".relaxed_madd\t$dst, $a, $b, $c", + vec.prefix#".relaxed_madd", simdopA, reqs>; defm NMADD_#vec : - RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), - [(set (vec.vt V128:$dst), (int_wasm_relaxed_nmadd - (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], - vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c", - vec.prefix#".relaxed_nmadd", simdopS>; + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), 
(ins), + [(set (vec.vt V128:$dst), (int_wasm_relaxed_nmadd + (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], + vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c", + vec.prefix#".relaxed_nmadd", simdopS, reqs>; } -defm "" : SIMDMADD<F32x4, 0x105, 0x106>; -defm "" : SIMDMADD<F64x2, 0x107, 0x108>; +defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; +defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>; +defm "" : SIMDMADD<F16x8, 0x146, 0x147, [HasHalfPrecision]>; //===----------------------------------------------------------------------===// // Laneselect diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp index 2180f57c106a..2ab5bcdd838d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp @@ -56,8 +56,8 @@ public: AU.setPreservesCFG(); AU.addRequired<MachineBlockFrequencyInfo>(); AU.addPreserved<MachineBlockFrequencyInfo>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); @@ -180,7 +180,7 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) { }); MachineRegisterInfo &MRI = MF.getRegInfo(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const WebAssemblyTargetLowering &TLI = *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering(); const auto &LibInfo = diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index d4edb6bf18d9..e38905c20b83 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ 
b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -48,13 +48,13 @@ class WebAssemblyRegStackify final : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<MachineBlockFrequencyInfo>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); AU.addPreservedID(LiveVariablesID); - AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -813,7 +813,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); auto &LIS = getAnalysis<LiveIntervals>(); // Walk the instructions from the bottom up. 
Currently we don't look past diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index d9936557776b..20e50c8c9e1a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -201,6 +201,9 @@ struct RuntimeLibcallSignatureTable { Table[RTLIB::COS_F32] = f32_func_f32; Table[RTLIB::COS_F64] = f64_func_f64; Table[RTLIB::COS_F128] = i64_i64_func_i64_i64; + Table[RTLIB::TAN_F32] = f32_func_f32; + Table[RTLIB::TAN_F64] = f64_func_f64; + Table[RTLIB::TAN_F128] = i64_i64_func_i64_i64; Table[RTLIB::SINCOS_F32] = func_f32_iPTR_iPTR; Table[RTLIB::SINCOS_F64] = func_f64_iPTR_iPTR; Table[RTLIB::SINCOS_F128] = func_i64_i64_iPTR_iPTR; diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 662310610931..dbea42d55b5f 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -88,15 +88,17 @@ class X86AsmParser : public MCTargetAsmParser { bool Code16GCC; unsigned ForcedDataPrefix = 0; - enum VEXEncoding { - VEXEncoding_Default, - VEXEncoding_VEX, - VEXEncoding_VEX2, - VEXEncoding_VEX3, - VEXEncoding_EVEX, + enum OpcodePrefix { + OpcodePrefix_Default, + OpcodePrefix_REX, + OpcodePrefix_REX2, + OpcodePrefix_VEX, + OpcodePrefix_VEX2, + OpcodePrefix_VEX3, + OpcodePrefix_EVEX, }; - VEXEncoding ForcedVEXEncoding = VEXEncoding_Default; + OpcodePrefix ForcedOpcodePrefix = OpcodePrefix_Default; enum DispEncoding { DispEncoding_Default, @@ -1197,12 +1199,11 @@ private: bool ErrorMissingFeature(SMLoc IDLoc, const FeatureBitset &MissingFeatures, bool MatchingInlineAsm); - bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, - uint64_t &ErrorInfo, - bool 
MatchingInlineAsm); + uint64_t &ErrorInfo, bool MatchingInlineAsm); - bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm); @@ -2317,7 +2318,7 @@ bool X86AsmParser::parseCFlagsOp(OperandVector &Operands) { return Error(Tok.getLoc(), "Expected { at this point"); Parser.Lex(); // Eat "{" Tok = Parser.getTok(); - if (Tok.getIdentifier() != "dfv") + if (Tok.getIdentifier().lower() != "dfv") return Error(Tok.getLoc(), "Expected dfv at this point"); Parser.Lex(); // Eat "dfv" Tok = Parser.getTok(); @@ -2337,7 +2338,7 @@ bool X86AsmParser::parseCFlagsOp(OperandVector &Operands) { unsigned CFlags = 0; for (unsigned I = 0; I < 4; ++I) { Tok = Parser.getTok(); - unsigned CFlag = StringSwitch<unsigned>(Tok.getIdentifier()) + unsigned CFlag = StringSwitch<unsigned>(Tok.getIdentifier().lower()) .Case("of", 0x8) .Case("sf", 0x4) .Case("zf", 0x2) @@ -3186,7 +3187,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, InstInfo = &Info; // Reset the forced VEX encoding. - ForcedVEXEncoding = VEXEncoding_Default; + ForcedOpcodePrefix = OpcodePrefix_Default; ForcedDispEncoding = DispEncoding_Default; UseApxExtendedReg = false; ForcedNoFlag = false; @@ -3202,14 +3203,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return Error(Parser.getTok().getLoc(), "Expected '}'"); Parser.Lex(); // Eat curly. 
- if (Prefix == "vex") - ForcedVEXEncoding = VEXEncoding_VEX; + if (Prefix == "rex") + ForcedOpcodePrefix = OpcodePrefix_REX; + else if (Prefix == "rex2") + ForcedOpcodePrefix = OpcodePrefix_REX2; + else if (Prefix == "vex") + ForcedOpcodePrefix = OpcodePrefix_VEX; else if (Prefix == "vex2") - ForcedVEXEncoding = VEXEncoding_VEX2; + ForcedOpcodePrefix = OpcodePrefix_VEX2; else if (Prefix == "vex3") - ForcedVEXEncoding = VEXEncoding_VEX3; + ForcedOpcodePrefix = OpcodePrefix_VEX3; else if (Prefix == "evex") - ForcedVEXEncoding = VEXEncoding_EVEX; + ForcedOpcodePrefix = OpcodePrefix_EVEX; else if (Prefix == "disp8") ForcedDispEncoding = DispEncoding_Disp8; else if (Prefix == "disp32") @@ -3235,15 +3240,15 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Parse MASM style pseudo prefixes. if (isParsingMSInlineAsm()) { if (Name.equals_insensitive("vex")) - ForcedVEXEncoding = VEXEncoding_VEX; + ForcedOpcodePrefix = OpcodePrefix_VEX; else if (Name.equals_insensitive("vex2")) - ForcedVEXEncoding = VEXEncoding_VEX2; + ForcedOpcodePrefix = OpcodePrefix_VEX2; else if (Name.equals_insensitive("vex3")) - ForcedVEXEncoding = VEXEncoding_VEX3; + ForcedOpcodePrefix = OpcodePrefix_VEX3; else if (Name.equals_insensitive("evex")) - ForcedVEXEncoding = VEXEncoding_EVEX; + ForcedOpcodePrefix = OpcodePrefix_EVEX; - if (ForcedVEXEncoding != VEXEncoding_Default) { + if (ForcedOpcodePrefix != OpcodePrefix_Default) { if (getLexer().isNot(AsmToken::Identifier)) return Error(Parser.getTok().getLoc(), "Expected identifier"); // FIXME: The mnemonic won't match correctly if its not in lower case. 
@@ -3741,7 +3746,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { - if (ForcedVEXEncoding != VEXEncoding_VEX3 && + if (ForcedOpcodePrefix != OpcodePrefix_VEX3 && X86::optimizeInstFromVEX3ToVEX2(Inst, MII.get(Inst.getOpcode()))) return true; @@ -4002,15 +4007,59 @@ void X86AsmParser::emitInstruction(MCInst &Inst, OperandVector &Operands, applyLVILoadHardeningMitigation(Inst, Out); } +static unsigned getPrefixes(OperandVector &Operands) { + unsigned Result = 0; + X86Operand &Prefix = static_cast<X86Operand &>(*Operands.back()); + if (Prefix.isPrefix()) { + Result = Prefix.getPrefix(); + Operands.pop_back(); + } + return Result; +} + bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { - if (isParsingIntelSyntax()) - return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, - MatchingInlineAsm); - return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, - MatchingInlineAsm); + assert(!Operands.empty() && "Unexpect empty operand list!"); + assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!"); + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands, + Out, MatchingInlineAsm); + unsigned Prefixes = getPrefixes(Operands); + + MCInst Inst; + + // If REX/REX2/VEX/EVEX encoding is forced, we need to pass the USE_* flag to + // the encoder and printer. 
+ if (ForcedOpcodePrefix == OpcodePrefix_REX) + Prefixes |= X86::IP_USE_REX; + else if (ForcedOpcodePrefix == OpcodePrefix_REX2) + Prefixes |= X86::IP_USE_REX2; + else if (ForcedOpcodePrefix == OpcodePrefix_VEX) + Prefixes |= X86::IP_USE_VEX; + else if (ForcedOpcodePrefix == OpcodePrefix_VEX2) + Prefixes |= X86::IP_USE_VEX2; + else if (ForcedOpcodePrefix == OpcodePrefix_VEX3) + Prefixes |= X86::IP_USE_VEX3; + else if (ForcedOpcodePrefix == OpcodePrefix_EVEX) + Prefixes |= X86::IP_USE_EVEX; + + // Set encoded flags for {disp8} and {disp32}. + if (ForcedDispEncoding == DispEncoding_Disp8) + Prefixes |= X86::IP_USE_DISP8; + else if (ForcedDispEncoding == DispEncoding_Disp32) + Prefixes |= X86::IP_USE_DISP32; + + if (Prefixes) + Inst.setFlags(Prefixes); + + return isParsingIntelSyntax() + ? matchAndEmitIntelInstruction(IDLoc, Opcode, Inst, Operands, Out, + ErrorInfo, MatchingInlineAsm) + : matchAndEmitATTInstruction(IDLoc, Opcode, Inst, Operands, Out, + ErrorInfo, MatchingInlineAsm); } void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, @@ -4053,82 +4102,50 @@ bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm); } -static unsigned getPrefixes(OperandVector &Operands) { - unsigned Result = 0; - X86Operand &Prefix = static_cast<X86Operand &>(*Operands.back()); - if (Prefix.isPrefix()) { - Result = Prefix.getPrefix(); - Operands.pop_back(); - } - return Result; -} - unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) { unsigned Opc = Inst.getOpcode(); const MCInstrDesc &MCID = MII.get(Opc); + uint64_t TSFlags = MCID.TSFlags; if (UseApxExtendedReg && !X86II::canUseApxExtendedReg(MCID)) return Match_Unsupported; - if (ForcedNoFlag == !(MCID.TSFlags & X86II::EVEX_NF) && !X86::isCFCMOVCC(Opc)) + if (ForcedNoFlag == !(TSFlags & X86II::EVEX_NF) && !X86::isCFCMOVCC(Opc)) return Match_Unsupported; - if (ForcedVEXEncoding == VEXEncoding_EVEX && - (MCID.TSFlags & X86II::EncodingMask) != 
X86II::EVEX) - return Match_Unsupported; - - if ((ForcedVEXEncoding == VEXEncoding_VEX || - ForcedVEXEncoding == VEXEncoding_VEX2 || - ForcedVEXEncoding == VEXEncoding_VEX3) && - (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX) - return Match_Unsupported; + switch (ForcedOpcodePrefix) { + case OpcodePrefix_Default: + break; + case OpcodePrefix_REX: + case OpcodePrefix_REX2: + if (TSFlags & X86II::EncodingMask) + return Match_Unsupported; + break; + case OpcodePrefix_VEX: + case OpcodePrefix_VEX2: + case OpcodePrefix_VEX3: + if ((TSFlags & X86II::EncodingMask) != X86II::VEX) + return Match_Unsupported; + break; + case OpcodePrefix_EVEX: + if ((TSFlags & X86II::EncodingMask) != X86II::EVEX) + return Match_Unsupported; + break; + } - if ((MCID.TSFlags & X86II::ExplicitOpPrefixMask) == - X86II::ExplicitVEXPrefix && - (ForcedVEXEncoding != VEXEncoding_VEX && - ForcedVEXEncoding != VEXEncoding_VEX2 && - ForcedVEXEncoding != VEXEncoding_VEX3)) + if ((TSFlags & X86II::ExplicitOpPrefixMask) == X86II::ExplicitVEXPrefix && + (ForcedOpcodePrefix != OpcodePrefix_VEX && + ForcedOpcodePrefix != OpcodePrefix_VEX2 && + ForcedOpcodePrefix != OpcodePrefix_VEX3)) return Match_Unsupported; return Match_Success; } -bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, - MCStreamer &Out, - uint64_t &ErrorInfo, - bool MatchingInlineAsm) { - assert(!Operands.empty() && "Unexpect empty operand list!"); - assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!"); - SMRange EmptyRange = std::nullopt; - - // First, handle aliases that expand to multiple instructions. 
- MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands, - Out, MatchingInlineAsm); +bool X86AsmParser::matchAndEmitATTInstruction( + SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, + MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - unsigned Prefixes = getPrefixes(Operands); - - MCInst Inst; - - // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the - // encoder and printer. - if (ForcedVEXEncoding == VEXEncoding_VEX) - Prefixes |= X86::IP_USE_VEX; - else if (ForcedVEXEncoding == VEXEncoding_VEX2) - Prefixes |= X86::IP_USE_VEX2; - else if (ForcedVEXEncoding == VEXEncoding_VEX3) - Prefixes |= X86::IP_USE_VEX3; - else if (ForcedVEXEncoding == VEXEncoding_EVEX) - Prefixes |= X86::IP_USE_EVEX; - - // Set encoded flags for {disp8} and {disp32}. - if (ForcedDispEncoding == DispEncoding_Disp8) - Prefixes |= X86::IP_USE_DISP8; - else if (ForcedDispEncoding == DispEncoding_Disp32) - Prefixes |= X86::IP_USE_DISP32; - - if (Prefixes) - Inst.setFlags(Prefixes); - + SMRange EmptyRange = std::nullopt; // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode // when matching the instruction. 
if (ForcedDataPrefix == X86::Is32Bit) @@ -4350,44 +4367,11 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, return true; } -bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, - MCStreamer &Out, - uint64_t &ErrorInfo, - bool MatchingInlineAsm) { - assert(!Operands.empty() && "Unexpect empty operand list!"); - assert((*Operands[0]).isToken() && "Leading operand should always be a mnemonic!"); - StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken(); - SMRange EmptyRange = std::nullopt; - StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken(); - unsigned Prefixes = getPrefixes(Operands); - - // First, handle aliases that expand to multiple instructions. - MatchFPUWaitAlias(IDLoc, static_cast<X86Operand &>(*Operands[0]), Operands, Out, MatchingInlineAsm); +bool X86AsmParser::matchAndEmitIntelInstruction( + SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, + MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - - MCInst Inst; - - // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the - // encoder and printer. - if (ForcedVEXEncoding == VEXEncoding_VEX) - Prefixes |= X86::IP_USE_VEX; - else if (ForcedVEXEncoding == VEXEncoding_VEX2) - Prefixes |= X86::IP_USE_VEX2; - else if (ForcedVEXEncoding == VEXEncoding_VEX3) - Prefixes |= X86::IP_USE_VEX3; - else if (ForcedVEXEncoding == VEXEncoding_EVEX) - Prefixes |= X86::IP_USE_EVEX; - - // Set encoded flags for {disp8} and {disp32}. - if (ForcedDispEncoding == DispEncoding_Disp8) - Prefixes |= X86::IP_USE_DISP8; - else if (ForcedDispEncoding == DispEncoding_Disp32) - Prefixes |= X86::IP_USE_DISP32; - - if (Prefixes) - Inst.setFlags(Prefixes); - + SMRange EmptyRange = std::nullopt; // Find one unsized memory operand, if present. 
X86Operand *UnsizedMemOp = nullptr; for (const auto &Op : Operands) { @@ -4402,6 +4386,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // Allow some instructions to have implicitly pointer-sized operands. This is // compatible with gas. + StringRef Mnemonic = (static_cast<X86Operand &>(*Operands[0])).getToken(); if (UnsizedMemOp) { static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"}; for (const char *Instr : PtrSizedInstrs) { @@ -4415,6 +4400,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, SmallVector<unsigned, 8> Match; FeatureBitset ErrorInfoMissingFeatures; FeatureBitset MissingFeatures; + StringRef Base = (static_cast<X86Operand &>(*Operands[0])).getToken(); // If unsized push has immediate operand we should default the default pointer // size for the size. diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 0ff440bdbe0d..6272e2d270f2 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -284,7 +284,10 @@ static int readPrefixes(struct InternalInstruction *insn) { // it's not mandatory prefix // 3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need // 0x0f exactly after it to be mandatory prefix - if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66) + // 4. 
if (nextByte == 0xd5) it's REX2 and we need + // 0x0f exactly after it to be mandatory prefix + if (isREX(insn, nextByte) || isREX2(insn, nextByte) || nextByte == 0x0f || + nextByte == 0x66) // The last of 0xf2 /0xf3 is mandatory prefix insn->mandatoryPrefix = byte; insn->repeatPrefix = byte; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 472f34a4efdb..c2188d206b5f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -125,9 +125,10 @@ class X86AsmBackend : public MCAsmBackend { unsigned TargetPrefixMax = 0; MCInst PrevInst; + unsigned PrevInstOpcode = 0; MCBoundaryAlignFragment *PendingBA = nullptr; std::pair<MCFragment *, size_t> PrevInstPosition; - bool CanPadInst = false; + bool IsRightAfterData = false; uint8_t determinePaddingPrefix(const MCInst &Inst) const; bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; @@ -267,8 +268,8 @@ static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) { } /// Check if the instruction is a prefix. -static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) { - return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags); +static bool isPrefix(unsigned Opcode, const MCInstrInfo &MCII) { + return X86II::isPrefix(MCII.get(Opcode).TSFlags); } /// Check if the instruction is valid as the first instruction in macro fusion. @@ -382,9 +383,9 @@ bool X86AsmBackend::allowEnhancedRelaxation() const { /// X86 has certain instructions which enable interrupts exactly one /// instruction *after* the instruction which stores to SS. Return true if the -/// given instruction has such an interrupt delay slot. -static bool hasInterruptDelaySlot(const MCInst &Inst) { - switch (Inst.getOpcode()) { +/// given instruction may have such an interrupt delay slot. 
+static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) { + switch (InstOpcode) { case X86::POPSS16: case X86::POPSS32: case X86::STI: @@ -394,9 +395,9 @@ static bool hasInterruptDelaySlot(const MCInst &Inst) { case X86::MOV32sr: case X86::MOV64sr: case X86::MOV16sm: - if (Inst.getOperand(0).getReg() == X86::SS) - return true; - break; + // In fact, this is only the case if the first operand is SS. However, as + // segment moves occur extremely rarely, this is just a minor pessimization. + return true; } return false; } @@ -406,16 +407,10 @@ static bool isRightAfterData(MCFragment *CurrentFragment, const std::pair<MCFragment *, size_t> &PrevInstPosition) { MCFragment *F = CurrentFragment; - // Empty data fragments may be created to prevent further data being - // added into the previous fragment, we need to skip them since they - // have no contents. - for (; isa_and_nonnull<MCDataFragment>(F); F = F->getPrevNode()) - if (cast<MCDataFragment>(F)->getContents().size() != 0) - break; - // Since data is always emitted into a DataFragment, our check strategy is // simple here. // - If the fragment is a DataFragment + // - If it's empty (section start or data after align), return false. // - If it's not the fragment where the previous instruction is, // returns true. // - If it's the fragment holding the previous instruction but its @@ -424,8 +419,9 @@ isRightAfterData(MCFragment *CurrentFragment, // - Otherwise returns false. // - If the fragment is not a DataFragment, returns false. if (auto *DF = dyn_cast_or_null<MCDataFragment>(F)) - return DF != PrevInstPosition.first || - DF->getContents().size() != PrevInstPosition.second; + return DF->getContents().size() && + (DF != PrevInstPosition.first || + DF->getContents().size() != PrevInstPosition.second); return false; } @@ -455,22 +451,22 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { // TLSCALL). 
return false; - if (hasInterruptDelaySlot(PrevInst)) + if (mayHaveInterruptDelaySlot(PrevInstOpcode)) // If this instruction follows an interrupt enabling instruction with a one // instruction delay, inserting a nop would change behavior. return false; - if (isPrefix(PrevInst, *MCII)) + if (isPrefix(PrevInstOpcode, *MCII)) // If this instruction follows a prefix, inserting a nop/prefix would change // semantic. return false; - if (isPrefix(Inst, *MCII)) + if (isPrefix(Inst.getOpcode(), *MCII)) // If this instruction is a prefix, inserting a prefix would change // semantic. return false; - if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition)) + if (IsRightAfterData) // If this instruction follows any data, there is no clear // instruction boundary, inserting a nop/prefix would change semantic. return false; @@ -484,7 +480,7 @@ bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const { assert(allowAutoPadding() && "incorrect initialization!"); // We only pad in text section. - if (!OS.getCurrentSectionOnly()->getKind().isText()) + if (!OS.getCurrentSectionOnly()->isText()) return false; // To be Done: Currently don't deal with Bundle cases. @@ -514,19 +510,27 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const { /// Insert BoundaryAlignFragment before instructions to align branches. void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst, const MCSubtargetInfo &STI) { - CanPadInst = canPadInst(Inst, OS); + // Used by canPadInst. Done here, because in emitInstructionEnd, the current + // fragment will have changed. + IsRightAfterData = + isRightAfterData(OS.getCurrentFragment(), PrevInstPosition); if (!canPadBranches(OS)) return; + // NB: PrevInst only valid if canPadBranches is true. if (!isMacroFused(PrevInst, Inst)) // Macro fusion doesn't happen indeed, clear the pending. 
PendingBA = nullptr; - if (!CanPadInst) + // When branch padding is enabled (basically the skx102 erratum => unlikely), + // we call canPadInst (not cheap) twice. However, in the common case, we can + // avoid unnecessary calls to that, as this is otherwise only used for + // relaxable fragments. + if (!canPadInst(Inst, OS)) return; - if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) { + if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) { // Macro fusion actually happens and there is no other fragment inserted // after the previous instruction. // @@ -552,21 +556,29 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, isFirstMacroFusibleInst(Inst, *MCII))) { // If we meet a unfused branch or the first instuction in a fusiable pair, // insert a BoundaryAlign fragment. - OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary, STI)); + PendingBA = OS.getContext().allocFragment<MCBoundaryAlignFragment>( + AlignBoundary, STI); + OS.insert(PendingBA); } } /// Set the last fragment to be aligned for the BoundaryAlignFragment. -void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) { - PrevInst = Inst; +void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, + const MCInst &Inst) { MCFragment *CF = OS.getCurrentFragment(); - PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF)) - F->setAllowAutoPadding(CanPadInst); + F->setAllowAutoPadding(canPadInst(Inst, OS)); + + // Update PrevInstOpcode here, canPadInst() reads that. + PrevInstOpcode = Inst.getOpcode(); + PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); if (!canPadBranches(OS)) return; + // PrevInst is only needed if canPadBranches. Copying an MCInst isn't cheap. 
+ PrevInst = Inst; + if (!needAlign(Inst) || !PendingBA) return; @@ -579,7 +591,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty // DataFragment. if (isa_and_nonnull<MCDataFragment>(CF)) - OS.insert(new MCDataFragment()); + OS.insert(OS.getContext().allocFragment<MCDataFragment>()); // Update the maximum alignment on the current section if necessary. MCSection *Sec = OS.getCurrentSectionOnly(); @@ -866,7 +878,7 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm, LabeledFragments.insert(S.getFragment(false)); for (MCSection &Sec : Asm) { - if (!Sec.getKind().isText()) + if (!Sec.isText()) continue; SmallVector<MCRelaxableFragment *, 4> Relaxable; @@ -968,8 +980,8 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm, // The layout is done. Mark every fragment as valid. for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) { MCSection &Section = *Layout.getSectionOrder()[i]; - Layout.getFragmentOffset(&*Section.getFragmentList().rbegin()); - Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin()); + Layout.getFragmentOffset(&*Section.curFragList()->Tail); + Asm.computeFragmentSize(Layout, *Section.curFragList()->Tail); } } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 8e4015783641..a89408bb79b0 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -56,12 +56,14 @@ enum IPREFIXES { IP_HAS_REPEAT = 1U << 3, IP_HAS_LOCK = 1U << 4, IP_HAS_NOTRACK = 1U << 5, - IP_USE_VEX = 1U << 6, - IP_USE_VEX2 = 1U << 7, - IP_USE_VEX3 = 1U << 8, - IP_USE_EVEX = 1U << 9, - IP_USE_DISP8 = 1U << 10, - IP_USE_DISP32 = 1U << 11, + IP_USE_REX = 1U << 6, + IP_USE_REX2 = 1U << 7, + IP_USE_VEX = 1U << 8, + IP_USE_VEX2 = 1U << 9, + IP_USE_VEX3 = 1U << 10, + IP_USE_EVEX = 1U << 11, + IP_USE_DISP8 = 1U << 12, + 
IP_USE_DISP32 = 1U << 13, }; enum OperandType : unsigned { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index b4633b91bee3..72219c136c7e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1365,7 +1365,10 @@ PrefixKind X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI, } } } - if ((TSFlags & X86II::ExplicitOpPrefixMask) == X86II::ExplicitREX2Prefix) + if (MI.getFlags() & X86::IP_USE_REX) + Prefix.setLowerBound(REX); + if ((TSFlags & X86II::ExplicitOpPrefixMask) == X86II::ExplicitREX2Prefix || + MI.getFlags() & X86::IP_USE_REX2) Prefix.setLowerBound(REX2); switch (TSFlags & X86II::FormMask) { default: diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 628ff560017e..68b78c7c4477 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -346,6 +346,8 @@ def FeatureNF : SubtargetFeature<"nf", "HasNF", "true", "Support status flags update suppression">; def FeatureCF : SubtargetFeature<"cf", "HasCF", "true", "Support conditional faulting">; +def FeatureZU : SubtargetFeature<"zu", "HasZU", "true", + "Support zero-upper SETcc/IMUL">; def FeatureUseGPR32InInlineAsm : SubtargetFeature<"inline-asm-use-gpr32", "UseInlineAsmGPR32", "true", "Enable use of GPR32 in inline assembly for APX">; diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp index 9819bfd12985..d979517e12af 100644 --- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -50,8 +50,7 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const { } // namespace -void X86TargetMachine::registerPassBuilderCallbacks( - PassBuilder &PB, bool PopulateClassToPassNames) { +void X86TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #define GET_PASS_REGISTRY "X86PassRegistry.def" 
#include "llvm/Passes/TargetPassRegistry.inc" } diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index 11b2155e3f98..7343af1bdc9a 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -174,29 +174,6 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { return true; } -static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) { - // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx - // -> - // $rbx = ADD64rr $rbx, $rax - const MCInstrDesc &Desc = MI.getDesc(); - Register Reg0 = MI.getOperand(0).getReg(); - const MachineOperand &Op1 = MI.getOperand(1); - if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 || - X86::isCFCMOVCC(MI.getOpcode())) - return false; - Register Reg1 = Op1.getReg(); - if (Reg1 == Reg0) - return true; - - // Op1 and Op2 may be commutable for ND instructions. - if (!Desc.isCommutable() || Desc.getNumOperands() < 3 || - !MI.getOperand(2).isReg() || MI.getOperand(2).getReg() != Reg0) - return false; - // Opcode may change after commute, e.g. 
SHRD -> SHLD - ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2); - return true; -} - static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { uint64_t TSFlags = MI.getDesc().TSFlags; @@ -208,6 +185,30 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2)) return false; + auto IsRedundantNewDataDest = [&](unsigned &Opc) { + // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx + // -> + // $rbx = ADD64rr $rbx, $rax + const MCInstrDesc &Desc = MI.getDesc(); + Register Reg0 = MI.getOperand(0).getReg(); + const MachineOperand &Op1 = MI.getOperand(1); + if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 || + X86::isCFCMOVCC(MI.getOpcode())) + return false; + Register Reg1 = Op1.getReg(); + if (Reg1 == Reg0) + return true; + + // Op1 and Op2 may be commutable for ND instructions. + if (!Desc.isCommutable() || Desc.getNumOperands() < 3 || + !MI.getOperand(2).isReg() || MI.getOperand(2).getReg() != Reg0) + return false; + // Opcode may change after commute, e.g. SHRD -> SHLD + ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2); + Opc = MI.getOpcode(); + return true; + }; + // EVEX_B has several meanings. // AVX512: // register form: rounding control or SAE @@ -218,40 +219,36 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { // // For AVX512 cases, EVEX prefix is needed in order to carry this information // thus preventing the transformation to VEX encoding. - unsigned Opc = MI.getOpcode(); bool IsND = X86II::hasNewDataDest(TSFlags); if (TSFlags & X86II::EVEX_B && !IsND) return false; + unsigned Opc = MI.getOpcode(); // MOVBE*rr is special because it has semantic of NDD but not set EVEX_B. bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr; - bool IsRedundantNDD = IsNDLike ? isRedundantNewDataDest(MI, ST) : false; - // NonNF -> NF only if it's not a compressible NDD instruction and eflags is - // dead. 
- unsigned NFOpc = (ST.hasNF() && !IsRedundantNDD && - MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) - ? X86::getNFVariant(Opc) - : 0U; - if (IsNDLike && !IsRedundantNDD && !NFOpc) - return false; + bool IsRedundantNDD = IsNDLike ? IsRedundantNewDataDest(Opc) : false; - unsigned NewOpc = NFOpc; - if (!NewOpc) { + auto GetCompressedOpc = [&](unsigned Opc) -> unsigned { ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable); - - Opc = MI.getOpcode(); const auto I = llvm::lower_bound(Table, Opc); - if (I == Table.end() || I->OldOpc != Opc) { - assert(!IsNDLike && "Missing entry for ND-like instruction"); - return false; - } - - if (!IsNDLike) { - if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) || - !performCustomAdjustments(MI, I->NewOpc)) - return false; - } - NewOpc = I->NewOpc; - } + if (I == Table.end() || I->OldOpc != Opc) + return 0; + + if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) || + !performCustomAdjustments(MI, I->NewOpc)) + return 0; + return I->NewOpc; + }; + // NonNF -> NF only if it's not a compressible NDD instruction and eflags is + // dead. + unsigned NewOpc = IsRedundantNDD + ? X86::getNonNDVariant(Opc) + : ((IsNDLike && ST.hasNF() && + MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) + ? X86::getNFVariant(Opc) + : GetCompressedOpc(Opc)); + + if (!NewOpc) + return false; const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(NewOpc); MI.setDesc(NewDesc); diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index 7708089074c0..d50a4d3b23ae 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -653,28 +653,20 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { } bool X86FastPreTileConfig::runOnMachineFunction(MachineFunction &MFunc) { + X86FI = MFunc.getInfo<X86MachineFunctionInfo>(); + // Early exit in the common case of non-AMX code. 
+ if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA) + return false; + MF = &MFunc; MRI = &MFunc.getRegInfo(); ST = &MFunc.getSubtarget<X86Subtarget>(); TII = ST->getInstrInfo(); - X86FI = MFunc.getInfo<X86MachineFunctionInfo>(); MFI = &MFunc.getFrameInfo(); TRI = ST->getRegisterInfo(); CfgSS = -1; unsigned NumVirtRegs = MRI->getNumVirtRegs(); - // Abandon early if there is no tile register to config. - bool HasVirtTileReg = false; - for (unsigned I = 0, E = NumVirtRegs; I != E; ++I) { - Register VirtReg = Register::index2VirtReg(I); - if (!MRI->reg_nodbg_empty(VirtReg) && - MRI->getRegClass(VirtReg)->getID() == X86::TILERegClassID) { - HasVirtTileReg = true; - break; - } - } - if (!HasVirtTileReg) - return false; StackSlotForVirtReg.resize(NumVirtRegs); MayLiveAcrossBlocks.clear(); diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 2a20cd13791d..70bc11228be6 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -161,19 +161,20 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { } } - if (Change) - X86FI->setHasVirtualTileReg(true); - return Change; } bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) { + X86FI = MFunc.getInfo<X86MachineFunctionInfo>(); + // Early exit in the common case of non-AMX code. 
+ if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA) + return false; + MF = &MFunc; MRI = &MFunc.getRegInfo(); const TargetSubtargetInfo *ST = &MFunc.getSubtarget<X86Subtarget>(); TRI = ST->getRegisterInfo(); TII = MFunc.getSubtarget().getInstrInfo(); - X86FI = MFunc.getInfo<X86MachineFunctionInfo>(); bool Change = false; // Loop over all of the basic blocks, eliminating virtual register references diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp index 5c7105988070..2de89947c451 100644 --- a/llvm/lib/Target/X86/X86FixupSetCC.cpp +++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp @@ -1,4 +1,4 @@ -//===---- X86FixupSetCC.cpp - optimize usage of LEA instructions ----------===// +//===- X86FixupSetCC.cpp - fix zero-extension of setcc patterns -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -17,6 +17,11 @@ // performed by the setcc. Instead, we can use: // xor %eax, %eax; seta %al // This both avoids the stall, and encodes shorter. +// +// Furthurmore, we can use: +// setzua %al +// if feature zero-upper is available. It's faster than the xor+setcc sequence. +// When r16-r31 is used, it even encodes shorter. 
//===----------------------------------------------------------------------===// #include "X86.h" @@ -46,6 +51,7 @@ public: private: MachineRegisterInfo *MRI = nullptr; + const X86Subtarget *ST = nullptr; const X86InstrInfo *TII = nullptr; enum { SearchBound = 16 }; @@ -61,7 +67,8 @@ FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); } bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; MRI = &MF.getRegInfo(); - TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + ST = &MF.getSubtarget<X86Subtarget>(); + TII = ST->getInstrInfo(); SmallVector<MachineInstr*, 4> ToErase; @@ -79,7 +86,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { continue; MachineInstr *ZExt = nullptr; - for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg())) + Register Reg0 = MI.getOperand(0).getReg(); + for (auto &Use : MRI->use_instructions(Reg0)) if (Use.getOpcode() == X86::MOVZX32rr8) ZExt = &Use; @@ -98,9 +106,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { continue; // On 32-bit, we need to be careful to force an ABCD register. - const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit() - ? &X86::GR32RegClass - : &X86::GR32_ABCDRegClass; + const TargetRegisterClass *RC = + ST->is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass; if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) { // If we cannot constrain the register, we would need an additional copy // and are better off keeping the MOVZX32rr8 we have now. @@ -110,17 +117,24 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { ++NumSubstZexts; Changed = true; - // Initialize a register with 0. This must go before the eflags def + // X86 setcc/setzucc only takes an output GR8, so fake a GR32 input by + // inserting the setcc/setzucc result into the low byte of the zeroed + // register. 
Register ZeroReg = MRI->createVirtualRegister(RC); - BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0), - ZeroReg); + if (ST->hasZU()) { + MI.setDesc(TII->get(X86::SETZUCCr)); + BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), ZeroReg); + } else { + // Initialize a register with 0. This must go before the eflags def + BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0), + ZeroReg); + } - // X86 setcc only takes an output GR8, so fake a GR32 input by inserting - // the setcc result into the low byte of the zeroed register. BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(), TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg()) .addReg(ZeroReg) - .addReg(MI.getOperand(0).getReg()) + .addReg(Reg0) .addImm(X86::sub_8bit); ToErase.push_back(ZExt); } diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index d6d077363f6f..394947bc65c8 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -24,6 +24,7 @@ #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" @@ -127,7 +128,7 @@ FunctionPass *llvm::createX86FlagsCopyLoweringPass() { char X86FlagsCopyLoweringPass::ID = 0; void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -257,7 +258,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TII = Subtarget->getInstrInfo(); TRI = Subtarget->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); PromoteRC = &X86::GR8RegClass; if 
(MF.empty()) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 4521401d8741..76bcf875f00e 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2594,7 +2594,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } // Emit tilerelease for AMX kernel. - if (X86FI->hasVirtualTileReg()) + if (X86FI->getAMXProgModel() == AMXProgModelEnum::ManagedRA) BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2aec14e93d08..3bbf009a1def 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2500,6 +2500,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f32, Promote); // clang-format on + // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has + // it, but it's just a wrapper around ldexp. + if (Subtarget.isOSWindows()) { + for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) + if (isOperationExpand(Op, MVT::f32)) + setOperationAction(Op, MVT::f32, Promote); + } + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, ISD::SCALAR_TO_VECTOR, @@ -2516,6 +2524,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::SRL, ISD::OR, ISD::AND, + ISD::AVGCEILS, + ISD::AVGCEILU, + ISD::AVGFLOORS, + ISD::AVGFLOORU, ISD::BITREVERSE, ISD::ADD, ISD::FADD, @@ -4179,8 +4191,7 @@ static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) { // Make sure we only try to split 256/512-bit types to avoid creating // narrow vectors. 
- EVT VT = Op.getValueType(); - (void)VT; + [[maybe_unused]] EVT VT = Op.getValueType(); assert((Op.getOperand(0).getValueType().is256BitVector() || Op.getOperand(0).getValueType().is512BitVector()) && (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"); @@ -4195,8 +4206,7 @@ static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) { // Assert that all the types match. - EVT VT = Op.getValueType(); - (void)VT; + [[maybe_unused]] EVT VT = Op.getValueType(); assert(Op.getOperand(0).getValueType() == VT && Op.getOperand(1).getValueType() == VT && "Unexpected VTs!"); assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"); @@ -7326,9 +7336,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, unsigned SeqLen = Sequence.size(); bool UpperZeroOrUndef = SeqLen == 1 || - llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) { - return !V || V.isUndef() || isNullConstant(V); - }); + llvm::all_of(ArrayRef(Sequence).drop_front(), + [](SDValue V) { return !V || isNullConstantOrUndef(V); }); SDValue Op0 = Sequence[0]; if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) || (Op0.getOpcode() == ISD::ZERO_EXTEND && @@ -13350,11 +13359,11 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( SmallVector<int, 4> LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); - LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); + LoInputs.erase(llvm::unique(LoInputs), LoInputs.end()); SmallVector<int, 4> HiInputs; copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); - HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); + HiInputs.erase(llvm::unique(HiInputs), HiInputs.end()); int NumLToL = llvm::lower_bound(LoInputs, 4) - 
LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); @@ -14242,13 +14251,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, copy_if(Mask, std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); - LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), - LoInputs.end()); + LoInputs.erase(llvm::unique(LoInputs), LoInputs.end()); SmallVector<int, 4> HiInputs; copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); - HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), - HiInputs.end()); + HiInputs.erase(llvm::unique(HiInputs), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; @@ -23410,8 +23417,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, assert(!IsStrict && "Strict SETCC only handles FP operands."); - MVT VTOp0 = Op0.getSimpleValueType(); - (void)VTOp0; + [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType(); assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && @@ -23816,6 +23822,20 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, } } + // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for + // overflow. + if (isMinSignedConstant(Op1)) { + EVT VT = Op0.getValueType(); + if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) { + SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32); + X86::CondCode CondCode = CC == ISD::SETEQ ? 
X86::COND_O : X86::COND_NO; + X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); + SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs, + DAG.getConstant(0, dl, VT), Op0); + return SDValue(Neg.getNode(), 1); + } + } + // Try to use the carry flag from the add in place of an separate CMP for: // (seteq (add X, -1), -1). Similar for setne. if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD && @@ -28476,6 +28496,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // vector pairs, multiply and truncate. if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumEltsPerLane = NumElts / NumLanes; if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { @@ -28489,6 +28511,33 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + // For vXi8 mul, try PMADDUBSW to avoid the need for extension. + // Don't do this if we only need to unpack one half. 
+ if (Subtarget.hasSSSE3()) { + bool BIsBuildVector = isa<BuildVectorSDNode>(B); + bool IsLoLaneAllZeroOrUndef = BIsBuildVector; + bool IsHiLaneAllZeroOrUndef = BIsBuildVector; + if (BIsBuildVector) { + for (auto [Idx, Val] : enumerate(B->ops())) { + if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2)) + IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val); + else + IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val); + } + } + if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) { + SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT)); + SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B); + SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B); + SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo); + SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi); + RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask); + RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi, + DAG.getTargetConstant(8, dl, MVT::i8)); + return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi)); + } + } + // Extract the lo/hi parts to any extend to i16. 
// We're going to mask off the low byte of each result element of the // pmullw, so it doesn't matter what's in the high byte of each 16-bit @@ -30508,7 +30557,7 @@ static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) { bool Not = false; // Check if we have a NOT Value *PeekI; - if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) || + if (match(I, m_Not(m_Value(PeekI))) || match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) { Not = true; I = dyn_cast<Instruction>(PeekI); @@ -32266,6 +32315,54 @@ bool X86TargetLowering::isInlineAsmTargetBranch( return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp"); } +static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, + SDValue Mask) { + EVT Ty = MVT::i8; + auto V = DAG.getBitcast(MVT::i1, Mask); + auto VE = DAG.getZExtOrTrunc(V, DL, Ty); + auto Zero = DAG.getConstant(0, DL, Ty); + SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32); + auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE); + return SDValue(CmpZero.getNode(), 1); +} + +SDValue X86TargetLowering::visitMaskedLoad( + SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, + SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const { + // @llvm.masked.load.v1*(ptr, alignment, mask, passthru) + // -> + // _, flags = SUB 0, mask + // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags + // bit_cast_to_vector<res> + EVT VTy = PassThru.getValueType(); + EVT Ty = VTy.getVectorElementType(); + SDVTList Tys = DAG.getVTList(Ty, MVT::Other); + auto ScalarPassThru = PassThru.isUndef() ? 
DAG.getConstant(0, DL, Ty) + : DAG.getBitcast(Ty, PassThru); + auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask); + auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); + SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags}; + NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO); + return DAG.getBitcast(VTy, NewLoad); +} + +SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, + SDValue Chain, + MachineMemOperand *MMO, SDValue Ptr, + SDValue Val, SDValue Mask) const { + // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask) + // -> + // _, flags = SUB 0, mask + // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags + EVT Ty = Val.getValueType().getVectorElementType(); + SDVTList Tys = DAG.getVTList(MVT::Other); + auto ScalarVal = DAG.getBitcast(Ty, Val); + auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask); + auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); + SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags}; + return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO); +} + /// Provide custom lowering hooks for some operations. SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -33982,6 +34079,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STRICT_FP80_ADD) NODE_NAME_CASE(CCMP) NODE_NAME_CASE(CTEST) + NODE_NAME_CASE(CLOAD) + NODE_NAME_CASE(CSTORE) } return nullptr; #undef NODE_NAME_CASE @@ -37040,6 +37139,52 @@ static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, Known = Known.zext(64); } +static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth) { + unsigned NumSrcElts = LHS.getValueType().getVectorNumElements(); + + // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs. 
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); + APInt DemandedLoElts = + DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01)); + APInt DemandedHiElts = + DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10)); + KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1); + KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1); + KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1); + KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1); + KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32)); + KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32)); + Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false, + /*NUW=*/false, Lo, Hi); +} + +static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth) { + unsigned NumSrcElts = LHS.getValueType().getVectorNumElements(); + + // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi + // pairs. 
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); + APInt DemandedLoElts = + DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01)); + APInt DemandedHiElts = + DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10)); + KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1); + KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1); + KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1); + KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1); + KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16)); + KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16)); + Known = KnownBits::sadd_sat(Lo, Hi); +} + void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, @@ -37215,6 +37360,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } break; } + case X86ISD::VPMADDWD: { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + assert(VT.getVectorElementType() == MVT::i32 && + LHS.getValueType() == RHS.getValueType() && + LHS.getValueType().getVectorElementType() == MVT::i16 && + "Unexpected PMADDWD types"); + computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth); + break; + } + case X86ISD::VPMADDUBSW: { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + assert(VT.getVectorElementType() == MVT::i16 && + LHS.getValueType() == RHS.getValueType() && + LHS.getValueType().getVectorElementType() == MVT::i8 && + "Unexpected PMADDUBSW types"); + computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth); + break; + } case X86ISD::PMULUDQ: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); @@ -37351,6 +37516,30 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } case ISD::INTRINSIC_WO_CHAIN: { switch (Op->getConstantOperandVal(0)) { + case 
Intrinsic::x86_sse2_pmadd_wd: + case Intrinsic::x86_avx2_pmadd_wd: + case Intrinsic::x86_avx512_pmaddw_d_512: { + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + assert(VT.getScalarType() == MVT::i32 && + LHS.getValueType() == RHS.getValueType() && + LHS.getValueType().getScalarType() == MVT::i16 && + "Unexpected PMADDWD types"); + computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth); + break; + } + case Intrinsic::x86_ssse3_pmadd_ub_sw_128: + case Intrinsic::x86_avx2_pmadd_ub_sw: + case Intrinsic::x86_avx512_pmaddubs_w_512: { + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + assert(VT.getScalarType() == MVT::i16 && + LHS.getValueType() == RHS.getValueType() && + LHS.getValueType().getScalarType() == MVT::i8 && + "Unexpected PMADDUBSW types"); + computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth); + break; + } case Intrinsic::x86_sse2_psad_bw: case Intrinsic::x86_avx2_psad_bw: case Intrinsic::x86_avx512_psad_bw_512: { @@ -41635,6 +41824,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( KnownZero = LHSZero | RHSZero; break; } + case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: { APInt LHSUndef, LHSZero; APInt RHSUndef, RHSZero; @@ -41917,7 +42107,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::CVTPH2PS: case X86ISD::CVTPS2PH: { SDValue Src = Op.getOperand(0); - MVT SrcVT = Src.getSimpleValueType(); + EVT SrcVT = Src.getValueType(); APInt SrcUndef, SrcZero; APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, @@ -42793,6 +42983,19 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1, AssumeSingleUse); } + case X86ISD::CMOV: { + KnownBits Known2; + if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits, + OriginalDemandedElts, Known2, TLO, Depth + 1)) + return true; + if 
(SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits, + OriginalDemandedElts, Known, TLO, Depth + 1)) + return true; + + // Only known if known in both the LHS and RHS. + Known = Known.intersectWith(Known2); + break; + } case X86ISD::BEXTR: case X86ISD::BEXTRI: { SDValue Op0 = Op.getOperand(0); @@ -43710,7 +43913,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) { SDValue Op = N0.getOperand(i); LowUndef &= Op.isUndef() || (i >= e/2); - AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op)); + AllUndefOrZero &= isNullConstantOrUndef(Op); } if (AllUndefOrZero) { SDValue N00 = N0.getOperand(0); @@ -50184,12 +50387,12 @@ static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. -static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { +static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG) { bool IsSub = N->getOpcode() == ISD::SUB; SDValue X = N->getOperand(0); SDValue Y = N->getOperand(1); EVT VT = N->getValueType(0); - SDLoc DL(N); if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG)) return ADCOrSBB; @@ -50708,157 +50911,6 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, return SDValue(); } -/// This function detects the AVG pattern between vectors of unsigned i8/i16, -/// which is c = (a + b + 1) / 2, and replace this operation with the efficient -/// ISD::AVGCEILU (AVG) instruction. 
-static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const SDLoc &DL) { - if (!VT.isVector()) - return SDValue(); - EVT InVT = In.getValueType(); - unsigned NumElems = VT.getVectorNumElements(); - - EVT ScalarVT = VT.getVectorElementType(); - if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2)) - return SDValue(); - - // InScalarVT is the intermediate type in AVG pattern and it should be greater - // than the original input type (i8/i16). - EVT InScalarVT = InVT.getVectorElementType(); - if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits()) - return SDValue(); - - if (!Subtarget.hasSSE2()) - return SDValue(); - - // Detect the following pattern: - // - // %1 = zext <N x i8> %a to <N x i32> - // %2 = zext <N x i8> %b to <N x i32> - // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> - // %4 = add nuw nsw <N x i32> %3, %2 - // %5 = lshr <N x i32> %N, <i32 1 x N> - // %6 = trunc <N x i32> %5 to <N x i8> - // - // In AVX512, the last instruction can also be a trunc store. - if (In.getOpcode() != ISD::SRL) - return SDValue(); - - // A lambda checking the given SDValue is a constant vector and each element - // is in the range [Min, Max]. - auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { - return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) { - return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max)); - }); - }; - - auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) { - unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits(); - return MaxActiveBits <= ScalarVT.getSizeInBits(); - }; - - // Check if each element of the vector is right-shifted by one. - SDValue LHS = In.getOperand(0); - SDValue RHS = In.getOperand(1); - if (!IsConstVectorInRange(RHS, 1, 1)) - return SDValue(); - if (LHS.getOpcode() != ISD::ADD) - return SDValue(); - - // Detect a pattern of a + b + 1 where the order doesn't matter. 
- SDValue Operands[3]; - Operands[0] = LHS.getOperand(0); - Operands[1] = LHS.getOperand(1); - - auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef<SDValue> Ops) { - return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops); - }; - - auto AVGSplitter = [&](std::array<SDValue, 2> Ops) { - for (SDValue &Op : Ops) - if (Op.getValueType() != VT) - Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op); - // Pad to a power-of-2 vector, split+apply and extract the original vector. - unsigned NumElemsPow2 = PowerOf2Ceil(NumElems); - EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2); - if (NumElemsPow2 != NumElems) { - for (SDValue &Op : Ops) { - SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT)); - for (unsigned i = 0; i != NumElems; ++i) { - SDValue Idx = DAG.getIntPtrConstant(i, DL); - EltsOfOp[i] = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx); - } - Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp); - } - } - SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder); - if (NumElemsPow2 == NumElems) - return Res; - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); - }; - - // Take care of the case when one of the operands is a constant vector whose - // element is in the range [1, 256]. - if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && - IsZExtLike(Operands[0])) { - // The pattern is detected. Subtract one from the constant vector, then - // demote it and emit X86ISD::AVG instruction. - SDValue VecOnes = DAG.getConstant(1, DL, InVT); - Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); - return AVGSplitter({Operands[0], Operands[1]}); - } - - // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)). - // Match the or case only if its 'add-like' - can be replaced by an add. 
- auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) { - if (ISD::ADD == V.getOpcode()) { - Op0 = V.getOperand(0); - Op1 = V.getOperand(1); - return true; - } - if (ISD::ZERO_EXTEND != V.getOpcode()) - return false; - V = V.getOperand(0); - if (V.getValueType() != VT || ISD::OR != V.getOpcode() || - !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1))) - return false; - Op0 = V.getOperand(0); - Op1 = V.getOperand(1); - return true; - }; - - SDValue Op0, Op1; - if (FindAddLike(Operands[0], Op0, Op1)) - std::swap(Operands[0], Operands[1]); - else if (!FindAddLike(Operands[1], Op0, Op1)) - return SDValue(); - Operands[2] = Op0; - Operands[1] = Op1; - - // Now we have three operands of two additions. Check that one of them is a - // constant vector with ones, and the other two can be promoted from i8/i16. - for (SDValue &Op : Operands) { - if (!IsConstVectorInRange(Op, 1, 1)) - continue; - std::swap(Op, Operands[2]); - - // Check if Operands[0] and Operands[1] are results of type promotion. - for (int j = 0; j < 2; ++j) - if (Operands[j].getValueType() != VT) - if (!IsZExtLike(Operands[j])) - return SDValue(); - - // The pattern is detected, emit X86ISD::AVG instruction(s). 
- return AVGSplitter({Operands[0], Operands[1]}); - } - - return SDValue(); -} - static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -50875,6 +50927,10 @@ static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, if (!(RegVT.is128BitVector() || RegVT.is256BitVector())) return SDValue(); + const Constant *LdC = getTargetConstantFromBasePtr(Ptr); + if (!LdC) + return SDValue(); + auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs, ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) { for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) { @@ -50899,12 +50955,11 @@ static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, RegVT.getFixedSizeInBits()) { EVT UserVT = User->getValueType(0); SDValue UserPtr = UserLd->getBasePtr(); - const Constant *LdC = getTargetConstantFromBasePtr(Ptr); const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); // See if we are loading a constant that matches in the lower // bits of a longer constant (but from a different constant pool ptr). - if (LdC && UserC && UserPtr != Ptr) { + if (UserC && UserPtr != Ptr) { unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits(); unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits(); if (LdSize < UserSize || !ISD::isNormalLoad(User)) { @@ -51498,16 +51553,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { - // Check if we can detect an AVG pattern from the truncation. If yes, - // replace the trunc store by a normal store with the result of X86ISD::AVG - // instruction. 
- if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT())) - if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, - Subtarget, dl)) - return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), - St->getPointerInfo(), St->getOriginalAlign(), - St->getMemOperand()->getFlags()); - if (TLI.isTruncStoreLegal(VT, StVT)) { if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT())) return EmitTruncSStore(true /* Signed saturation */, St->getChain(), @@ -52363,10 +52408,6 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL)) return V; - // Try to detect AVG pattern first. - if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) - return Avg; - // Try to detect PMADD if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL)) return PMAdd; @@ -52718,7 +52759,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); } -static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, +static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) && "Invalid opcode for combing with CTLZ"); @@ -52758,7 +52799,6 @@ static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1)) return SDValue(); - SDLoc DL(N); EVT OpVT = VT; SDValue Op = OpCTLZ.getOperand(0); if (VT == MVT::i8) { @@ -52781,11 +52821,12 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc DL(N); // If this is SSE1 only convert to FXOR to avoid scalarization. 
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, - DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, + DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } @@ -52805,7 +52846,7 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; - if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget)) + if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget)) return R; if (DCI.isBeforeLegalizeOps()) @@ -52826,8 +52867,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, N0.getOperand(0).getValueType().isVector() && N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) { - return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0), - N0.getOperand(0).getValueType())); + return DAG.getBitcast( + VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType())); } // Handle AVX512 mask widening. 
@@ -52837,8 +52878,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() && TLI.isTypeLegal(N0.getOperand(1).getValueType())) { return DAG.getNode( - ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), - DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()), + ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0), + DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()), N0.getOperand(2)); } @@ -52851,7 +52892,6 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, auto *N1C = dyn_cast<ConstantSDNode>(N1); auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1)); if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) { - SDLoc DL(N); SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT); SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT); return DAG.getNode(ISD::XOR, DL, VT, LHS, @@ -52892,6 +52932,31 @@ static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Various combines to try to convert to avgceilu. +static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + SDLoc DL(N); + + // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y))) + // Only useful on vXi8 which doesn't have good SRA handling. 
+ if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) { + APInt SignBit = APInt::getSignMask(VT.getScalarSizeInBits()); + SDValue SignMask = DAG.getConstant(SignBit, DL, VT); + N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask); + N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask); + return DAG.getNode(ISD::XOR, DL, VT, + DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask); + } + + return SDValue(); +} + static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -55419,7 +55484,8 @@ static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, /// Try to fold those constants into an 'add' instruction to reduce instruction /// count. We do this with CMOV rather the generic 'select' because there are /// earlier folds that may be used to turn select-of-constants into logic hacks. -static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, +static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If an operand is zero, add-of-0 gets simplified away, so that's clearly // better because we eliminate 1-2 instructions. 
This transform is still @@ -55451,7 +55517,6 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = N->getValueType(0); - SDLoc DL(N); SDValue FalseOp = Cmov.getOperand(0); SDValue TrueOp = Cmov.getOperand(1); @@ -55492,7 +55557,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); SDLoc DL(N); - if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget)) + if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget)) return Select; if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget)) @@ -55550,7 +55615,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, Op0.getOperand(0), Op0.getOperand(2)); } - return combineAddOrSubToADCOrSBB(N, DAG); + return combineAddOrSubToADCOrSBB(N, DL, DAG); } // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov @@ -55621,11 +55686,38 @@ static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) { + // res, flags2 = sub 0, (setcc cc, flag) + // cload/cstore ..., cond_ne, flag2 + // -> + // cload/cstore cc, flag + if (N->getConstantOperandVal(3) != X86::COND_NE) + return SDValue(); + + SDValue Sub = N->getOperand(4); + if (Sub.getOpcode() != X86ISD::SUB) + return SDValue(); + + SDValue SetCC = Sub.getOperand(1); + + if (!X86::isZeroNode(Sub.getOperand(0)) || SetCC.getOpcode() != X86ISD::SETCC) + return SDValue(); + + SmallVector<SDValue, 5> Ops(N->op_values()); + Ops[3] = SetCC.getOperand(0); + Ops[4] = SetCC.getOperand(1); + + return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops, + cast<MemSDNode>(N)->getMemoryVT(), + cast<MemSDNode>(N)->getMemOperand()); +} + static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); + SDLoc DL(N); // TODO: Add NoOpaque 
handling to isConstantIntBuildVectorOrConstantInt. auto IsNonOpaqueConstant = [&](SDValue Op) { @@ -55645,7 +55737,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) && !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) { - SDLoc DL(N); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT)); @@ -55676,14 +55767,14 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0, Op1.getOperand(1), Op1.getOperand(2)); - return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0), + return DAG.getNode(ISD::SUB, DL, Op0.getValueType(), ADC.getValue(0), Op1.getOperand(0)); } - if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget)) + if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget)) return V; - if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG)) + if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG)) return V; return combineSubSetcc(N, DAG); @@ -55917,6 +56008,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, break; case X86ISD::PSHUFB: case X86ISD::PSADBW: + case X86ISD::VPMADDUBSW: + case X86ISD::VPMADDWD: if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) || (VT.is512BitVector() && Subtarget.useBWIRegs()))) { MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); @@ -57328,6 +57421,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget); case X86ISD::ADD: case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget); + case X86ISD::CLOAD: + case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, 
DCI, Subtarget); @@ -57338,6 +57433,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget); + case ISD::AVGCEILS: + case ISD::AVGCEILU: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget); case X86ISD::BEXTR: case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); @@ -57529,16 +57628,20 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: + case ISD::MUL: + return false; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::SUB: case ISD::ADD: - case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: - return false; + // NDD instruction never has "partial register write" issue b/c it has + // destination register's upper bits [63:OSIZE]) zeroed even when + // OSIZE=8/16. + return Subtarget.hasNDD(); } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 3c5c903bc0d9..362daa98e1f8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -903,6 +903,10 @@ namespace llvm { // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, + // Conditional load/store instructions + CLOAD, + CSTORE, + // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all // opcodes will be thought as target memory ops! 
@@ -1556,6 +1560,14 @@ namespace llvm { bool isInlineAsmTargetBranch(const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const override; + SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, + MachineMemOperand *MMO, SDValue &NewLoad, + SDValue Ptr, SDValue PassThru, + SDValue Mask) const override; + SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, + MachineMemOperand *MMO, SDValue Ptr, SDValue Val, + SDValue Mask) const override; + /// Lower interleaved load(s) into target specific /// instructions/intrinsics. bool lowerInterleavedLoad(LoadInst *LI, diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index b107d56f8cf9..8be64b6fff3f 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -417,7 +417,8 @@ unsigned X86TargetLowering::getJumpTableEncoding() const { if (isPositionIndependent() && Subtarget.isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; if (isPositionIndependent() && - getTargetMachine().getCodeModel() == CodeModel::Large) + getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget.isTargetCOFF()) return MachineJumpTableInfo::EK_LabelDifference64; // Otherwise, use the normal jump table encoding heuristics. 
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 8e75e185f0f6..8cf502d820e9 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -2701,14 +2701,14 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && BoolVec->getType()->isVectorTy() && BoolVec->getType()->getScalarSizeInBits() == 1) { - assert(Mask->getType()->getPrimitiveSizeInBits() == - II.getType()->getPrimitiveSizeInBits() && + auto *MaskTy = cast<FixedVectorType>(Mask->getType()); + auto *OpTy = cast<FixedVectorType>(II.getType()); + assert(MaskTy->getPrimitiveSizeInBits() == + OpTy->getPrimitiveSizeInBits() && "Not expecting mask and operands with different sizes"); + unsigned NumMaskElts = MaskTy->getNumElements(); + unsigned NumOperandElts = OpTy->getNumElements(); - unsigned NumMaskElts = - cast<FixedVectorType>(Mask->getType())->getNumElements(); - unsigned NumOperandElts = - cast<FixedVectorType>(II.getType())->getNumElements(); if (NumMaskElts == NumOperandElts) { return SelectInst::Create(BoolVec, Op1, Op0); } @@ -2716,8 +2716,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // If the mask has less elements than the operands, each mask bit maps to // multiple elements of the operands. Bitcast back and forth. 
if (NumMaskElts < NumOperandElts) { - Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); - Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); + Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy); + Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy); Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); return new BitCastInst(Sel, II.getType()); } diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index c45ec8981ab1..ffa8a105e2d1 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -14,34 +14,34 @@ //===----------------------------------------------------------------------===// // LEA - Load Effective Address let SchedRW = [WriteLEA] in { -let hasSideEffects = 0 in -def LEA16r : I<0x8D, MRMSrcMem, - (outs GR16:$dst), (ins anymem:$src), - "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize16; -let isReMaterializable = 1 in -def LEA32r : I<0x8D, MRMSrcMem, - (outs GR32:$dst), (ins anymem:$src), - "lea{l}\t{$src|$dst}, {$dst|$src}", - [(set GR32:$dst, lea32addr:$src)]>, - OpSize32, Requires<[Not64BitMode]>; - -def LEA64_32r : I<0x8D, MRMSrcMem, - (outs GR32:$dst), (ins lea64_32mem:$src), - "lea{l}\t{$src|$dst}, {$dst|$src}", - [(set GR32:$dst, lea64_32addr:$src)]>, - OpSize32, Requires<[In64BitMode]>; - -let isReMaterializable = 1 in -def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src), - "lea{q}\t{$src|$dst}, {$dst|$src}", - [(set GR64:$dst, lea64addr:$src)]>; + let hasSideEffects = 0 in + def LEA16r : I<0x8D, MRMSrcMem, + (outs GR16:$dst), (ins anymem:$src), + "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize16; + let isReMaterializable = 1 in + def LEA32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins anymem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea32addr:$src)]>, + OpSize32, Requires<[Not64BitMode]>; + + def LEA64_32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins lea64_32mem:$src), 
+ "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea64_32addr:$src)]>, + OpSize32, Requires<[In64BitMode]>; + + let isReMaterializable = 1 in + def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src), + "lea{q}\t{$src|$dst}, {$dst|$src}", + [(set GR64:$dst, lea64addr:$src)]>; } // SchedRW // Pseudo instruction for lea that prevent optimizer from eliminating // the instruction. let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in { -def PLEA32r : PseudoI<(outs GR32:$dst), (ins anymem:$src), []>; -def PLEA64r : PseudoI<(outs GR64:$dst), (ins anymem:$src), []>; + def PLEA32r : PseudoI<(outs GR32:$dst), (ins anymem:$src), []>; + def PLEA64r : PseudoI<(outs GR64:$dst), (ins anymem:$src), []>; } //===----------------------------------------------------------------------===// @@ -655,111 +655,112 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } - def 8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>; - def 16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; - def 32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; - def 64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>; - let Predicates = [In64BitMode] in { - def 8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; - def 16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; - def 32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; - def 64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; - def 8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; - def 16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; - def 32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; - def 64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; - def 8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF; - def 16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD; - def 32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF; - def 64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, 
Xi64>, NF; - def 8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; - def 16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; - def 32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; - def 64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; - } + def 8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>; + def 16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; + def 32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; + def 64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>; + let Predicates = [In64BitMode] in { + def 8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; + def 16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; + def 32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; + def 64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; + def 8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; + def 16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; + def 32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; + def 64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; + def 8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF; + def 16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD; + def 32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF; + def 64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF; + def 8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; + def 16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; + def 32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; + def 64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; + } + + let Predicates = [NoNDD] in { + def 8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def 16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16; + def 32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32; + def 64rm : BinOpRM_RF<BaseOpc2, 
mnemonic, Xi64, opnodeflag>; + } + let Predicates = [HasNDD, In64BitMode] in { + def 8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>; + def 16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD; + def 32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>; + def 64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>; + def 8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; + def 16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; + def 32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; + def 64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; + } + let Predicates = [In64BitMode] in { + def 8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF; + def 16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD; + def 32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF; + def 64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF; + def 8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL; + def 16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD; + def 32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL; + def 64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL; + } + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { let Predicates = [NoNDD] in { - def 8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; - def 16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16; - def 32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32; - def 64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+ def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; + def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; + def 8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + def 16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; + def 32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; + def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; } let Predicates = [HasNDD, In64BitMode] in { - def 8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>; - def 16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD; - def 32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>; - def 64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>; - def 8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; - def 16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; - def 32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; - def 64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; + def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; + def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>; + def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>; + def 8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>; + def 16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD; + def 32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>; + def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>; + def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; + def 32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; + def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; + def 8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF; + def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; + def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 
1>, EVEX_NF; + def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; } let Predicates = [In64BitMode] in { - def 8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF; - def 16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD; - def 32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF; - def 64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF; - def 8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL; - def 16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD; - def 32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL; - def 64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL; - } - - let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - let Predicates = [NoNDD] in { - // NOTE: These are order specific, we want the ri8 forms to be listed - // first so that they are slightly preferred to the ri forms. - def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; - def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; - def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; - def 8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; - def 16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; - def 32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; - def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; - } - let Predicates = [HasNDD, In64BitMode] in { - def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; - def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>; - def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>; - def 8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>; - def 16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD; - def 32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>; - def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>; - def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; - def 32ri8_NF_ND : 
BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; - def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; - def 8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF; - def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; - def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; - def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; - } - let Predicates = [In64BitMode] in { - def 16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD; - def 32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF; - def 64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF; - def 8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF; - def 16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD; - def 32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF; - def 64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF; - def 16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; - def 32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL; - def 64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL; - def 8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL; - def 16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD; - def 32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL; - def 64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL; - } + def 16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD; + def 32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF; + def 64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF; + def 8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF; + def 16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD; + def 32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF; + def 64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF; + def 16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; + def 32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL; + def 
64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL; + def 8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL; + def 16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD; + def 32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL; + def 64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL; } + } - def 8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>; - def 16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; - def 32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; - def 64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>; - let Predicates = [HasNDD, In64BitMode] in { - def 8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>; - def 16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; - def 32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>; - def 64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>; + def 8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>; + def 16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def 32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; + def 64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>; + let Predicates = [HasNDD, In64BitMode] in { + defvar node = !if(!eq(CommutableRR, 0), opnode, null_frag); + def 8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , node>; + def 16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, node>, PD; + def 32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, node>; + def 64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, node>; def 8mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF; def 16mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD; def 32mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF; @@ -823,18 +824,14 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. 
let Predicates = [Not64BitMode] in { - def 8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; - def 8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; + def 8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def 8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; } - def 8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def 16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">, OpSize16; - def 32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">, OpSize32; - def 64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def 8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; + def 16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; + def 32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; + def 64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } /// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is @@ -948,10 +945,11 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def 32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; def 64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>; let Predicates = [HasNDD, In64BitMode] in { - def 8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; - def 16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; - def 32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>; - def 64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>; + defvar node = !if(!eq(CommutableRR, 0), opnode, null_frag); + def 8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , node>; + def 16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, node>, PD; + def 32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, node>; + def 64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, node>; } let Predicates = [In64BitMode] in { def 8mr_EVEX : BinOpMRF_MF<BaseOpc, 
mnemonic, Xi8 , null_frag>, PL; @@ -995,17 +993,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, // not in 64-bit mode. let Predicates = [Not64BitMode] in { def 8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; - def 8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; + def 8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; } - def 8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def 16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">, OpSize16; - def 32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">, OpSize32; - def 64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def 8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; + def 16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; + def 32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; + def 64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1017,11 +1011,11 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress> { let isCommutable = CommutableRR in { - def 8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def 8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def 16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; - def 32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; - def 64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + def 16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def 32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; + def 64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; } // isConvertibleToThreeAddress } // isCommutable 
@@ -1038,15 +1032,15 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def 8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - // NOTE: These are order specific, we want the ri8 forms to be listed - // first so that they are slightly preferred to the ri forms. - def 16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; - def 32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; - def 64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; - - def 16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; - def 32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; - def 64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def 16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def 32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; + def 64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; + + def 16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; + def 32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; + def 64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; } def 8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; @@ -1065,24 +1059,20 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def 16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; def 32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def 64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. 
let Predicates = [Not64BitMode] in { - def 8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def 8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; let mayLoad = 1 in - def 8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>; + def 8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>; } - def 8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def 16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">, OpSize16; - def 32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">, OpSize32; - def 64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def 8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; + def 16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; + def 32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; + def 64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } @@ -1095,18 +1085,18 @@ defm XOR : ArithBinOp_RF<0x31, 0x33, 0x35, "xor", MRM6r, MRM6m, defm ADD : ArithBinOp_RF<0x01, 0x03, 0x05, "add", MRM0r, MRM0m, X86add_flag, add, 1, 1, 1>; let isCompare = 1 in { -defm SUB : ArithBinOp_RF<0x29, 0x2B, 0x2D, "sub", MRM5r, MRM5m, - X86sub_flag, sub, 0, 1, 0>; + defm SUB : ArithBinOp_RF<0x29, 0x2B, 0x2D, "sub", MRM5r, MRM5m, + X86sub_flag, sub, 0, 1, 0>; } // Version of XOR8rr_NOREX that use GR8_NOREX. This is used by the handling of // __builtin_parity where the last step xors an h-register with an l-register. 
let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst", Defs = [EFLAGS], isCommutable = 1 in -def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst), - (ins GR8_NOREX:$src1, GR8_NOREX:$src2), - "xor{b}\t{$src2, $dst|$dst, $src2}", []>, - Sched<[WriteALU]>; + def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst), + (ins GR8_NOREX:$src1, GR8_NOREX:$src2), + "xor{b}\t{$src2, $dst|$dst, $src2}", []>, + Sched<[WriteALU]>; // Arithmetic. defm ADC : ArithBinOp_RFF<0x11, 0x13, 0x15, "adc", MRM2r, MRM2m, X86adc_flag, @@ -1115,19 +1105,31 @@ defm SBB : ArithBinOp_RFF<0x19, 0x1B, 0x1D, "sbb", MRM3r, MRM3m, X86sbb_flag, 0, 0>; let isCompare = 1 in { -defm CMP : ArithBinOp_F<0x39, 0x3B, 0x3D, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; + defm CMP : ArithBinOp_F<0x39, 0x3B, 0x3D, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; } // Patterns to recognize loads on the LHS of an ADC. We can't make X86adc_flag // commutable since it has EFLAGs as an input. -def : Pat<(X86adc_flag (loadi8 addr:$src2), GR8:$src1, EFLAGS), - (ADC8rm GR8:$src1, addr:$src2)>; -def : Pat<(X86adc_flag (loadi16 addr:$src2), GR16:$src1, EFLAGS), - (ADC16rm GR16:$src1, addr:$src2)>; -def : Pat<(X86adc_flag (loadi32 addr:$src2), GR32:$src1, EFLAGS), - (ADC32rm GR32:$src1, addr:$src2)>; -def : Pat<(X86adc_flag (loadi64 addr:$src2), GR64:$src1, EFLAGS), - (ADC64rm GR64:$src1, addr:$src2)>; +let Predicates = [NoNDD] in { + def : Pat<(X86adc_flag (loadi8 addr:$src2), GR8:$src1, EFLAGS), + (ADC8rm GR8:$src1, addr:$src2)>; + def : Pat<(X86adc_flag (loadi16 addr:$src2), GR16:$src1, EFLAGS), + (ADC16rm GR16:$src1, addr:$src2)>; + def : Pat<(X86adc_flag (loadi32 addr:$src2), GR32:$src1, EFLAGS), + (ADC32rm GR32:$src1, addr:$src2)>; + def : Pat<(X86adc_flag (loadi64 addr:$src2), GR64:$src1, EFLAGS), + (ADC64rm GR64:$src1, addr:$src2)>; +} +let Predicates = [HasNDD] in { + def : Pat<(X86adc_flag (loadi8 addr:$src2), GR8:$src1, EFLAGS), + (ADC8rm_ND GR8:$src1, addr:$src2)>; + def : Pat<(X86adc_flag (loadi16 
addr:$src2), GR16:$src1, EFLAGS), + (ADC16rm_ND GR16:$src1, addr:$src2)>; + def : Pat<(X86adc_flag (loadi32 addr:$src2), GR32:$src1, EFLAGS), + (ADC32rm_ND GR32:$src1, addr:$src2)>; + def : Pat<(X86adc_flag (loadi64 addr:$src2), GR64:$src1, EFLAGS), + (ADC64rm_ND GR64:$src1, addr:$src2)>; +} // Patterns to recognize RMW ADC with loads in operand 1. def : Pat<(store (X86adc_flag GR8:$src, (loadi8 addr:$dst), EFLAGS), @@ -1299,33 +1301,33 @@ let isCompare = 1 in { // Avoid selecting these and instead use a test+and. Post processing will // combine them. This gives bunch of other patterns that start with // and a chance to match. - def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>; - def TEST16rr : BinOpRR_F<0x85, "test", Xi16, null_frag>, OpSize16; - def TEST32rr : BinOpRR_F<0x85, "test", Xi32, null_frag>, OpSize32; - def TEST64rr : BinOpRR_F<0x85, "test", Xi64, null_frag>; + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>; + def TEST16rr : BinOpRR_F<0x85, "test", Xi16, null_frag>, OpSize16; + def TEST32rr : BinOpRR_F<0x85, "test", Xi32, null_frag>, OpSize32; + def TEST64rr : BinOpRR_F<0x85, "test", Xi64, null_frag>; } // isCommutable -def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>; -def TEST16mr : BinOpMR_F<0x85, "test", Xi16, null_frag>, OpSize16; -def TEST32mr : BinOpMR_F<0x85, "test", Xi32, null_frag>, OpSize32; -def TEST64mr : BinOpMR_F<0x85, "test", Xi64, null_frag>; + def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>; + def TEST16mr : BinOpMR_F<0x85, "test", Xi16, null_frag>, OpSize16; + def TEST32mr : BinOpMR_F<0x85, "test", Xi32, null_frag>, OpSize32; + def TEST64mr : BinOpMR_F<0x85, "test", Xi64, null_frag>; -def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; -def TEST16ri : BinOpRI_F<0xF7, "test", Xi16, X86testpat, MRM0r>, OpSize16; -def TEST32ri : BinOpRI_F<0xF7, "test", Xi32, X86testpat, MRM0r>, OpSize32; -def TEST64ri32 : BinOpRI_F<0xF7, "test", Xi64, X86testpat, MRM0r>; + def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , 
X86testpat, MRM0r>; + def TEST16ri : BinOpRI_F<0xF7, "test", Xi16, X86testpat, MRM0r>, OpSize16; + def TEST32ri : BinOpRI_F<0xF7, "test", Xi32, X86testpat, MRM0r>, OpSize32; + def TEST64ri32 : BinOpRI_F<0xF7, "test", Xi64, X86testpat, MRM0r>; -def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; -def TEST16mi : BinOpMI_F<0xF7, "test", Xi16, X86testpat, MRM0m>, OpSize16; -def TEST32mi : BinOpMI_F<0xF7, "test", Xi32, X86testpat, MRM0m>, OpSize32; + def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; + def TEST16mi : BinOpMI_F<0xF7, "test", Xi16, X86testpat, MRM0m>, OpSize16; + def TEST32mi : BinOpMI_F<0xF7, "test", Xi32, X86testpat, MRM0m>, OpSize32; let Predicates = [In64BitMode] in - def TEST64mi32 : BinOpMI_F<0xF7, "test", Xi64, X86testpat, MRM0m>; + def TEST64mi32 : BinOpMI_F<0xF7, "test", Xi64, X86testpat, MRM0m>; -def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, "{$src, %al|al, $src}">; -def TEST16i16 : BinOpAI_F<0xA9, "test", Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; -def TEST32i32 : BinOpAI_F<0xA9, "test", Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; -def TEST64i32 : BinOpAI_F<0xA9, "test", Xi64, RAX, "{$src, %rax|rax, $src}">; + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, "{$src, %al|al, $src}">; + def TEST16i16 : BinOpAI_F<0xA9, "test", Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; + def TEST32i32 : BinOpAI_F<0xA9, "test", Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; + def TEST64i32 : BinOpAI_F<0xA9, "test", Xi64, RAX, "{$src, %rax|rax, $src}">; } // isCompare // Patterns to match a relocImm into the immediate field. 
@@ -1409,20 +1411,20 @@ multiclass MulX<X86TypeInfo t, X86FoldableSchedWrite sched> { (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD, VEX, VVVV, Sched<[WriteIMulH, sched]>; let mayLoad = 1 in - def rm : ITy<0xF6, MRMSrcMem, t, (outs t.RegClass:$dst1, t.RegClass:$dst2), - (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, VEX, - VVVV, Sched<mulx_rm_sched>; + def rm : ITy<0xF6, MRMSrcMem, t, (outs t.RegClass:$dst1, t.RegClass:$dst2), + (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, VEX, + VVVV, Sched<mulx_rm_sched>; let Predicates = [In64BitMode] in { - def rr_EVEX : ITy<0xF6, MRMSrcReg, t, - (outs t.RegClass:$dst1, t.RegClass:$dst2), - (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD, - EVEX, VVVV, Sched<[WriteIMulH, sched]>; - let mayLoad = 1 in - def rm_EVEX : ITy<0xF6, MRMSrcMem, t, + def rr_EVEX : ITy<0xF6, MRMSrcReg, t, (outs t.RegClass:$dst1, t.RegClass:$dst2), - (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, - EVEX, VVVV, Sched<mulx_rm_sched>; + (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD, + EVEX, VVVV, Sched<[WriteIMulH, sched]>; + let mayLoad = 1 in + def rm_EVEX : ITy<0xF6, MRMSrcMem, t, + (outs t.RegClass:$dst1, t.RegClass:$dst2), + (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, + EVEX, VVVV, Sched<mulx_rm_sched>; } // Pseudo instructions to be used when the low result isn't used. The // instruction is defined to keep the high if both destinations are the same. 
@@ -1434,10 +1436,10 @@ multiclass MulX<X86TypeInfo t, X86FoldableSchedWrite sched> { } let Uses = [EDX] in -defm MULX32 : MulX<Xi32, WriteMULX32>; + defm MULX32 : MulX<Xi32, WriteMULX32>; let Uses = [RDX] in -defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W; + defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W; //===----------------------------------------------------------------------===// // ADCX and ADOX Instructions diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index e27aa4115990..7d5d7cf4a83a 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -113,6 +113,27 @@ let Predicates = [HasCMOV, HasCF] in { (CFCMOV32rr GR32:$src1, (inv_cond_XFORM timm:$cond))>; def : Pat<(X86cmov GR64:$src1, 0, timm:$cond, EFLAGS), (CFCMOV64rr GR64:$src1, (inv_cond_XFORM timm:$cond))>; + + def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS), + (CFCMOV16rm addr:$src1, timm:$cond)>; + def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS), + (CFCMOV32rm addr:$src1, timm:$cond)>; + def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS), + (CFCMOV64rm addr:$src1, timm:$cond)>; + + def : Pat<(X86cload addr:$src2, GR16:$src1, timm:$cond, EFLAGS), + (CFCMOV16rm_ND GR16:$src1, addr:$src2, timm:$cond)>; + def : Pat<(X86cload addr:$src2, GR32:$src1, timm:$cond, EFLAGS), + (CFCMOV32rm_ND GR32:$src1, addr:$src2, timm:$cond)>; + def : Pat<(X86cload addr:$src2, GR64:$src1, timm:$cond, EFLAGS), + (CFCMOV64rm_ND GR64:$src1, addr:$src2, timm:$cond)>; + + def : Pat<(X86cstore GR16:$src2, addr:$src1, timm:$cond, EFLAGS), + (CFCMOV16mr addr:$src1, GR16:$src2, timm:$cond)>; + def : Pat<(X86cstore GR32:$src2, addr:$src1, timm:$cond, EFLAGS), + (CFCMOV32mr addr:$src1, GR32:$src2, timm:$cond)>; + def : Pat<(X86cstore GR64:$src2, addr:$src1, timm:$cond, EFLAGS), + (CFCMOV64mr addr:$src1, GR64:$src2, timm:$cond)>; } // SetCC instructions. 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 6fb6e1633b0c..5a8177e2b360 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1035,27 +1035,27 @@ multiclass ATOMIC_RMW_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, (ins GR8:$val, i8mem:$ptr), !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), [(set GR8:$dst, - (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>; + (!cast<PatFrag>(frag # "_i8") addr:$ptr, GR8:$val))]>; def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr), !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), [(set GR16:$dst, - (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>, + (!cast<PatFrag>(frag # "_i16") addr:$ptr, GR16:$val))]>, OpSize16; def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr), !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), [(set GR32:$dst, - (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>, + (!cast<PatFrag>(frag # "_i32") addr:$ptr, GR32:$val))]>, OpSize32; def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val, i64mem:$ptr), !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), [(set GR64:$dst, - (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>; + (!cast<PatFrag>(frag # "_i64") addr:$ptr, GR64:$val))]>; } } diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 162e322712a6..038100b8264d 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -15,6 +15,15 @@ def SDTX86FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>, def SDTX86Ccmp : SDTypeProfile<1, 5, [SDTCisVT<3, i8>, SDTCisVT<4, i8>, SDTCisVT<5, i32>]>; +// RES = op PTR, PASSTHRU, COND, EFLAGS +def SDTX86Cload : SDTypeProfile<1, 4, + [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisSameAs<0, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; +// op VAL, PTR, COND, EFLAGS +def SDTX86Cstore : 
SDTypeProfile<0, 4, + [SDTCisInt<0>, SDTCisPtrTy<1>, + SDTCisVT<2, i8>, SDTCisVT<3, i32>]>; + def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; @@ -144,6 +153,9 @@ def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; def X86ccmp : SDNode<"X86ISD::CCMP", SDTX86Ccmp>; def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>; +def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, [SDNPHasChain]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1f93d293bc2a..069a1ec9a598 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3224,18 +3224,18 @@ int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) { #define GET_X86_NF_TRANSFORM_TABLE #define GET_X86_ND2NONND_TABLE #include "X86GenInstrMapping.inc" -unsigned X86::getNFVariant(unsigned Opc) { - ArrayRef<X86TableEntry> Table = ArrayRef(X86NFTransformTable); + +static unsigned getNewOpcFromTable(ArrayRef<X86TableEntry> Table, + unsigned Opc) { const auto I = llvm::lower_bound(Table, Opc); return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc; } +unsigned X86::getNFVariant(unsigned Opc) { + return getNewOpcFromTable(X86NFTransformTable, Opc); +} -static unsigned getNonNDVariant(unsigned Opc, const X86Subtarget &STI) { - if (!STI.hasNDD()) - return 0U; - ArrayRef<X86TableEntry> Table = ArrayRef(X86ND2NonNDTable); - const auto I = llvm::lower_bound(Table, Opc); - return (I == Table.end() || I->OldOpc != Opc) ? 
0U : I->NewOpc; +unsigned X86::getNonNDVariant(unsigned Opc) { + return getNewOpcFromTable(X86ND2NonNDTable, Opc); } /// Return the inverse of the specified condition, @@ -7202,7 +7202,7 @@ static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode, return MIB; } -static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, +static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI) { @@ -7282,6 +7282,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( } } break; + case X86::MOV32r0: + if (auto *NewMI = + makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs, + InsertPt, MI)) + return NewMI; + break; } return nullptr; @@ -7382,16 +7388,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Size, Alignment)) return CustomMI; - if (Opc == X86::MOV32r0) - if (auto *NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI)) - return NewMI; - // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. // // Utilize the mapping NonNDD -> RMW for the NDD variant. - unsigned NonNDOpc = getNonNDVariant(Opc, Subtarget); + unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U; const X86FoldTableEntry *I = IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc) : lookupFoldTable(Opc, OpNum); @@ -7483,6 +7485,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( for (auto Op : Ops) { MachineOperand &MO = MI.getOperand(Op); auto SubReg = MO.getSubReg(); + // MOV32r0 is special b/c it's used to clear a 64-bit register too. + // (See patterns for MOV32r0 in TD files). 
+ if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit) + continue; if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi)) return nullptr; } @@ -7508,7 +7514,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( switch (Opc) { default: // NDD can be folded into RMW though its Op0 and Op1 are not tied. - return getNonNDVariant(Opc, Subtarget) ? Impl() : nullptr; + return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl() + : nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; @@ -8825,6 +8832,11 @@ bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI, Opcode == X86::PLDTILECFGV) return true; + // Frame setup and destroy can't be scheduled around. + if (MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy)) + return true; + return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); } @@ -10324,7 +10336,8 @@ struct LDTLSCleanup : public MachineFunctionPass { return false; } - MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *DT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); return VisitNode(DT->getRootNode(), 0); } @@ -10411,7 +10424,7 @@ struct LDTLSCleanup : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 9eb2bd56b2ab..eaa3dd089394 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -80,6 +80,9 @@ int getCCMPCondFlagsFromCondCode(CondCode CC); // Get the opcode of corresponding NF variant. unsigned getNFVariant(unsigned Opc); +// Get the opcode of corresponding NonND variant. +unsigned getNonNDVariant(unsigned Opc); + /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. 
turning COND_E to COND_NE. CondCode GetOppositeBranchCondition(CondCode CC); diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index c4da0e50a1dd..c9ff8abb02ef 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -823,27 +823,27 @@ multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag> !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), [(set GR8:$dst, - (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>; + (!cast<PatFrag>(frag # "_i8") addr:$ptr, GR8:$val))]>; def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr), !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), [(set GR16:$dst, - (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>, + (!cast<PatFrag>(frag # "_i16") addr:$ptr, GR16:$val))]>, OpSize16; def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr), !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), [(set GR32:$dst, - (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>, + (!cast<PatFrag>(frag # "_i32") addr:$ptr, GR32:$val))]>, OpSize32; def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val, i64mem:$ptr), !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), [(set GR64:$dst, - (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>; + (!cast<PatFrag>(frag # "_i64") addr:$ptr, GR64:$val))]>; } } diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 4dfe7556df00..d5c23295ee9a 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -237,7 +237,7 @@ void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired<MachineLoopInfo>(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); 
AU.addRequired<MachineDominanceFrontier>(); AU.setPreservesCFG(); } @@ -270,7 +270,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( TRI = STI->getRegisterInfo(); LLVM_DEBUG(dbgs() << "Building gadget graph...\n"); const auto &MLI = getAnalysis<MachineLoopInfo>(); - const auto &MDT = getAnalysis<MachineDominatorTree>(); + const auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const auto &MDF = getAnalysis<MachineDominanceFrontier>(); std::unique_ptr<MachineGadgetGraph> Graph = getGadgetGraph(MF, MLI, MDT, MDF); LLVM_DEBUG(dbgs() << "Building gadget graph... Done\n"); @@ -439,9 +439,8 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( // Remove duplicate transmitters llvm::sort(DefTransmitters); - DefTransmitters.erase( - std::unique(DefTransmitters.begin(), DefTransmitters.end()), - DefTransmitters.end()); + DefTransmitters.erase(llvm::unique(DefTransmitters), + DefTransmitters.end()); }; // Find all of the transmitters @@ -801,7 +800,7 @@ bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch( INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, "X86 LVI load hardening", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, "X86 LVI load hardening", false, false) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 8f6fba8ac22c..00f58f9432e4 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -67,8 +67,8 @@ class X86MCInstLower { public: X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter); - std::optional<MCOperand> LowerMachineOperand(const MachineInstr *MI, - const MachineOperand &MO) const; + MCOperand LowerMachineOperand(const 
MachineInstr *MI, + const MachineOperand &MO) const; void Lower(const MachineInstr *MI, MCInst &OutMI) const; MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const; @@ -326,9 +326,8 @@ static unsigned getRetOpcode(const X86Subtarget &Subtarget) { return Subtarget.is64Bit() ? X86::RET64 : X86::RET32; } -std::optional<MCOperand> -X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, - const MachineOperand &MO) const { +MCOperand X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, + const MachineOperand &MO) const { switch (MO.getType()) { default: MI->print(errs()); @@ -336,7 +335,7 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, case MachineOperand::MO_Register: // Ignore all implicit register operands. if (MO.isImplicit()) - return std::nullopt; + return MCOperand(); return MCOperand::createReg(MO.getReg()); case MachineOperand::MO_Immediate: return MCOperand::createImm(MO.getImm()); @@ -355,7 +354,7 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); case MachineOperand::MO_RegisterMask: // Ignore call clobbers. 
- return std::nullopt; + return MCOperand(); } } @@ -398,8 +397,8 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); for (const MachineOperand &MO : MI->operands()) - if (auto MaybeMCOp = LowerMachineOperand(MI, MO)) - OutMI.addOperand(*MaybeMCOp); + if (auto Op = LowerMachineOperand(MI, MO); Op.isValid()) + OutMI.addOperand(Op); bool In64BitMode = AsmPrinter.getSubtarget().is64Bit(); if (X86::optimizeInstFromVEX3ToVEX2(OutMI, MI->getDesc()) || @@ -867,8 +866,8 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, for (const MachineOperand &MO : llvm::drop_begin(FaultingMI.operands(), OperandsBeginIdx)) - if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, MO)) - MI.addOperand(*MaybeOperand); + if (auto Op = MCIL.LowerMachineOperand(&FaultingMI, MO); Op.isValid()) + MI.addOperand(Op); OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); OutStreamer->emitInstruction(MI, getSubtargetInfo()); @@ -1139,9 +1138,10 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, // emit nops appropriately sized to keep the sled the same size in every // situation. for (unsigned I = 0; I < MI.getNumOperands(); ++I) - if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) { - assert(Op->isReg() && "Only support arguments in registers"); - SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64); + if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I)); + Op.isValid()) { + assert(Op.isReg() && "Only support arguments in registers"); + SrcRegs[I] = getX86SubSuperRegister(Op.getReg(), 64); assert(SrcRegs[I].isValid() && "Invalid operand"); if (SrcRegs[I] != DestRegs[I]) { UsedMask[I] = true; @@ -1237,10 +1237,11 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, // In case the arguments are already in the correct register, we emit nops // appropriately sized to keep the sled the same size in every situation. 
for (unsigned I = 0; I < MI.getNumOperands(); ++I) - if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) { + if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I)); + Op.isValid()) { // TODO: Is register only support adequate? - assert(Op->isReg() && "Only supports arguments in registers"); - SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64); + assert(Op.isReg() && "Only supports arguments in registers"); + SrcRegs[I] = getX86SubSuperRegister(Op.getReg(), 64); assert(SrcRegs[I].isValid() && "Invalid operand"); if (SrcRegs[I] != DestRegs[I]) { UsedMask[I] = true; @@ -1354,8 +1355,8 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, MCInst Ret; Ret.setOpcode(OpCode); for (auto &MO : drop_begin(MI.operands())) - if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - Ret.addOperand(*MaybeOperand); + if (auto Op = MCIL.LowerMachineOperand(&MI, MO); Op.isValid()) + Ret.addOperand(Op); OutStreamer->emitInstruction(Ret, getSubtargetInfo()); emitX86Nops(*OutStreamer, 10, Subtarget); recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2); @@ -1417,8 +1418,8 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, // indeed a tail call. 
OutStreamer->AddComment("TAILCALL"); for (auto &MO : TCOperands) - if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - TC.addOperand(*MaybeOperand); + if (auto Op = MCIL.LowerMachineOperand(&MI, MO); Op.isValid()) + TC.addOperand(Op); OutStreamer->emitInstruction(TC, getSubtargetInfo()); if (IsConditional) @@ -1898,6 +1899,62 @@ static void addConstantComments(const MachineInstr *MI, break; } +#define INSTR_CASE(Prefix, Instr, Suffix, Postfix) \ + case X86::Prefix##Instr##Suffix##rm##Postfix: + +#define CASE_ARITH_RM(Instr) \ + INSTR_CASE(, Instr, , ) /* SSE */ \ + INSTR_CASE(V, Instr, , ) /* AVX-128 */ \ + INSTR_CASE(V, Instr, Y, ) /* AVX-256 */ \ + INSTR_CASE(V, Instr, Z128, ) \ + INSTR_CASE(V, Instr, Z128, k) \ + INSTR_CASE(V, Instr, Z128, kz) \ + INSTR_CASE(V, Instr, Z256, ) \ + INSTR_CASE(V, Instr, Z256, k) \ + INSTR_CASE(V, Instr, Z256, kz) \ + INSTR_CASE(V, Instr, Z, ) \ + INSTR_CASE(V, Instr, Z, k) \ + INSTR_CASE(V, Instr, Z, kz) + + // TODO: Add additional instructions when useful. 
+ CASE_ARITH_RM(PMADDUBSW) { + unsigned SrcIdx = getSrcIdx(MI, 1); + if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { + if (C->getType()->getScalarSizeInBits() == 8) { + std::string Comment; + raw_string_ostream CS(Comment); + unsigned VectorWidth = + X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); + CS << "["; + printConstant(C, VectorWidth, CS); + CS << "]"; + OutStreamer.AddComment(CS.str()); + } + } + break; + } + + CASE_ARITH_RM(PMADDWD) + CASE_ARITH_RM(PMULLW) + CASE_ARITH_RM(PMULHW) + CASE_ARITH_RM(PMULHUW) + CASE_ARITH_RM(PMULHRSW) { + unsigned SrcIdx = getSrcIdx(MI, 1); + if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { + if (C->getType()->getScalarSizeInBits() == 16) { + std::string Comment; + raw_string_ostream CS(Comment); + unsigned VectorWidth = + X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); + CS << "["; + printConstant(C, VectorWidth, CS); + CS << "]"; + OutStreamer.AddComment(CS.str()); + } + } + break; + } + #define MASK_AVX512_CASE(Instr) \ case Instr: \ case Instr##k: \ diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index 2e88e01ce7fd..7b57f7c23bf4 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -13,6 +13,14 @@ using namespace llvm; +yaml::X86MachineFunctionInfo::X86MachineFunctionInfo( + const llvm::X86MachineFunctionInfo &MFI) + : AMXProgModel(MFI.getAMXProgModel()) {} + +void yaml::X86MachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { + MappingTraits<X86MachineFunctionInfo>::mapping(YamlIO, *this); +} + MachineFunctionInfo *X86MachineFunctionInfo::clone( BumpPtrAllocator &Allocator, MachineFunction &DestMF, const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) @@ -20,6 +28,11 @@ MachineFunctionInfo *X86MachineFunctionInfo::clone( return DestMF.cloneInfo<X86MachineFunctionInfo>(*this); } +void X86MachineFunctionInfo::initializeBaseYamlFields( + const 
yaml::X86MachineFunctionInfo &YamlMFI) { + AMXProgModel = YamlMFI.AMXProgModel; +} + void X86MachineFunctionInfo::anchor() { } void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) { diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 8aaa49945f9d..315aeef65d28 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -16,13 +16,43 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Support/YAMLTraits.h" #include <set> namespace llvm { enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 }; +class X86MachineFunctionInfo; + +namespace yaml { +template <> struct ScalarEnumerationTraits<AMXProgModelEnum> { + static void enumeration(IO &YamlIO, AMXProgModelEnum &Value) { + YamlIO.enumCase(Value, "None", AMXProgModelEnum::None); + YamlIO.enumCase(Value, "DirectReg", AMXProgModelEnum::DirectReg); + YamlIO.enumCase(Value, "ManagedRA", AMXProgModelEnum::ManagedRA); + } +}; + +struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo { + AMXProgModelEnum AMXProgModel; + + X86MachineFunctionInfo() = default; + X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI); + + void mappingImpl(yaml::IO &YamlIO) override; + ~X86MachineFunctionInfo() = default; +}; + +template <> struct MappingTraits<X86MachineFunctionInfo> { + static void mapping(IO &YamlIO, X86MachineFunctionInfo &MFI) { + YamlIO.mapOptional("amxProgModel", MFI.AMXProgModel); + } +}; +} // end namespace yaml + /// X86MachineFunctionInfo - This class is derived from MachineFunction and /// contains private X86 target-specific information for each MachineFunction. 
class X86MachineFunctionInfo : public MachineFunctionInfo { @@ -119,10 +149,6 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// other tools to detect the extended record. bool HasSwiftAsyncContext = false; - /// True if this function has tile virtual register. This is used to - /// determine if we should insert tilerelease in frame lowering. - bool HasVirtualTileReg = false; - /// Ajust stack for push2/pop2 bool PadForPush2Pop2 = false; @@ -160,6 +186,8 @@ public: const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) const override; + void initializeBaseYamlFields(const yaml::X86MachineFunctionInfo &YamlMFI); + bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } @@ -250,9 +278,6 @@ public: bool hasSwiftAsyncContext() const { return HasSwiftAsyncContext; } void setHasSwiftAsyncContext(bool v) { HasSwiftAsyncContext = v; } - bool hasVirtualTileReg() const { return HasVirtualTileReg; } - void setHasVirtualTileReg(bool v) { HasVirtualTileReg = v; } - bool padForPush2Pop2() const { return PadForPush2Pop2; } void setPadForPush2Pop2(bool V) { PadForPush2Pop2 = V; } diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 75ad58e5cdcb..ecf923c0e2a2 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -237,11 +237,15 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) { } bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) { + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + // Early exit in the common case of non-AMX code. 
+ if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA) + return false; + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); const TargetInstrInfo *TII = ST.getInstrInfo(); const TargetRegisterInfo *TRI = ST.getRegisterInfo(); const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); BitVector AMXRegs(TRI->getNumRegs()); for (unsigned I = 0; I < RC->getNumRegs(); I++) @@ -301,7 +305,6 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) { // There's no AMX instruction if we didn't find a tile config live in point. if (CfgNeedInsert.empty()) return false; - X86FI->setHasVirtualTileReg(true); // Avoid to insert ldtilecfg before any shape defs. SmallVector<MachineBasicBlock *, 8> WorkList; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td index 7107dbc63e27..420c42928c1c 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver4.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td @@ -1631,17 +1631,25 @@ def : InstRW<[Zn4WriteFCmp64], (instregex )>; // MOV Instructions -def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> { +def Zn4MOVDUPZ: SchedWriteRes<[Zn4FPFMisc12]> { let Latency = 2; let ReleaseAtCycles = [2]; let NumMicroOps = 1; } +def : InstRW<[Zn4MOVDUPZ], (instregex + "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)" + )>; + +def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> { + let Latency = 2; + let ReleaseAtCycles = [1]; + let NumMicroOps = 1; +} def : InstRW<[Zn4MOVS], (instregex "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)", "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)", "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)", - "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)", - "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)" + "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?|Z256?)(rr|rrk|rrkz)" )>; def Zn4MOVSZ: 
SchedWriteRes<[Zn4FPFMisc12]> { diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 489c8f492524..46317cb33776 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -822,8 +822,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( // Sort and unique the codes to minimize them. llvm::sort(UncondCodeSeq); - UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()), - UncondCodeSeq.end()); + UncondCodeSeq.erase(llvm::unique(UncondCodeSeq), UncondCodeSeq.end()); // Build a checking version of the successor. BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1, diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 27542e54829b..d4e642c7df9c 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -31,6 +31,8 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" +#include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" @@ -344,6 +346,24 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } +yaml::MachineFunctionInfo *X86TargetMachine::createDefaultFuncInfoYAML() const { + return new yaml::X86MachineFunctionInfo(); +} + +yaml::MachineFunctionInfo * +X86TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo<X86MachineFunctionInfo>(); + return new yaml::X86MachineFunctionInfo(*MFI); +} + +bool X86TargetMachine::parseMachineFunctionInfo( + const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, SMRange &SourceRange) const { + const auto 
&YamlMFI = static_cast<const yaml::X86MachineFunctionInfo &>(MFI); + PFS.MF.getInfo<X86MachineFunctionInfo>()->initializeBaseYamlFields(YamlMFI); + return false; +} + bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index 4a5f20fcc017..ec4a93e9c9d4 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -58,8 +58,15 @@ public: createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; - void registerPassBuilderCallbacks(PassBuilder &PB, - bool PopulateClassToPassNames) override; + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; + yaml::MachineFunctionInfo * + convertFuncInfoToYAML(const MachineFunction &MF) const override; + bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, + PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, + SMRange &SourceRange) const override; + + void registerPassBuilderCallbacks(PassBuilder &PB) override; Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &, raw_pwrite_stream *, CodeGenFileType, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0a23bf251676..bb0270c018c9 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -176,6 +176,27 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { return 8; } +bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const { + if (!ST->hasCF()) + return false; + if (!Ty) + return true; + // Conditional faulting is supported by CFCMOV, which only accepts + // 16/32/64-bit operands. + // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's + // profitable. 
+ if (!Ty->isIntegerTy()) + return false; + switch (cast<IntegerType>(Ty)->getBitWidth()) { + default: + return false; + case 16: + case 32: + case 64: + return true; + } +} + TypeSize X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { unsigned PreferVectorWidth = ST->getPreferVectorWidth(); @@ -851,7 +872,9 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw - { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } }, + { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc + { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw + { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb @@ -1117,7 +1140,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack - { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw + { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld @@ -1168,7 +1191,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. 
- { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split + { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split + { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld @@ -1308,7 +1332,6 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence. - { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org) }; @@ -1317,6 +1340,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( if (auto KindCost = Entry->Cost[CostKind]) return LT.first * *KindCost; + static const CostKindTblEntry SSSE3CostTable[] = { + { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or + }; + + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second)) + if (auto KindCost = Entry->Cost[CostKind]) + return LT.first * *KindCost; + static const CostKindTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. 
@@ -1353,7 +1385,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq - { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack + { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add @@ -4061,7 +4093,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext()) }; static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV + { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV @@ -4082,9 +4114,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto }; static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets - { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV - { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV - { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA + { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV + { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV + { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } }, { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } }, { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } }, @@ -4259,6 +4291,37 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } if (ISD != ISD::DELETED_NODE) { + auto adjustTableCost = [&](int ISD, unsigned Cost, + std::pair<InstructionCost, MVT> LT, + FastMathFlags FMF) -> 
InstructionCost { + InstructionCost LegalizationCost = LT.first; + MVT MTy = LT.second; + + // If there are no NANs to deal with, then these are reduced to a + // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we + // assume is used in the non-fast case. + if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { + if (FMF.noNaNs()) + return LegalizationCost * 1; + } + + // For cases where some ops can be folded into a load/store, assume free. + if (MTy.isScalarInteger()) { + if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { + if (const Instruction *II = ICA.getInst()) { + if (II->hasOneUse() && isa<StoreInst>(II->user_back())) + return TTI::TCC_Free; + if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { + if (LI->hasOneUse()) + return TTI::TCC_Free; + } + } + } + } + + return LegalizationCost * (int)Cost; + }; + // Legalize the type. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); MVT MTy = LT.second; @@ -4277,180 +4340,132 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) return LT.first; - auto adjustTableCost = [](int ISD, unsigned Cost, - InstructionCost LegalizationCost, - FastMathFlags FMF) { - // If there are no NANs to deal with, then these are reduced to a - // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we - // assume is used in the non-fast case. 
- if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { - if (FMF.noNaNs()) - return LegalizationCost * 1; - } - return LegalizationCost * (int)Cost; - }; - if (ST->useGLMDivSqrtCosts()) if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->useSLMArithCosts()) if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasVBMI2()) if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBITALG()) if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasVPOPCNTDQ()) if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasGFNI()) if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasCDI()) if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return 
adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasXOP()) if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE41()) if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return 
adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSSE3()) if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE1()) if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBMI()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if (ST->hasLZCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if 
(ST->hasPOPCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); - } - - if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { - if (const Instruction *II = ICA.getInst()) { - if (II->hasOneUse() && isa<StoreInst>(II->user_back())) - return TTI::TCC_Free; - if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { - if (LI->hasOneUse()) - return TTI::TCC_Free; - } - } + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } return BaseT::getIntrinsicInstrCost(ICA, CostKind); @@ -5076,7 +5091,12 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy); auto VT = TLI->getValueType(DL, SrcVTy); InstructionCost Cost = 0; - if (VT.isSimple() && LT.second != VT.getSimpleVT() && + MVT Ty = LT.second; + if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64) + // APX masked load/store for scalar is cheap. 
+ return Cost + LT.first; + + if (VT.isSimple() && Ty != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires extend/truncate for data and a shuffle for mask. Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt, @@ -5084,9 +5104,9 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt, CostKind, 0, nullptr); - else if (LT.first * LT.second.getVectorNumElements() > NumElem) { + else if (LT.first * Ty.getVectorNumElements() > NumElem) { auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), - LT.second.getVectorNumElements()); + Ty.getVectorNumElements()); // Expanding requires fill mask with zeroes Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt, CostKind, 0, MaskTy); @@ -5905,14 +5925,14 @@ bool X86TTIImpl::canMacroFuseCmp() { } bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { - if (!ST->hasAVX()) - return false; + Type *ScalarTy = DataTy->getScalarType(); - // The backend can't handle a single element vector. - if (isa<VectorType>(DataTy) && - cast<FixedVectorType>(DataTy)->getNumElements() == 1) + // The backend can't handle a single element vector w/o CFCMOV. 
+ if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1) + return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy); + + if (!ST->hasAVX()) return false; - Type *ScalarTy = DataTy->getScalarType(); if (ScalarTy->isPointerTy()) return true; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index e14dc9fc0905..e6bb4720071d 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -132,6 +132,7 @@ public: /// @{ unsigned getNumberOfRegisters(unsigned ClassID) const; + bool hasConditionalLoadStoreForType(Type *Ty = nullptr) const; TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; unsigned getMaxInterleaveFactor(ElementCount VF); diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index 5cada924e006..ebe48910225f 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -77,6 +77,11 @@ INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, false) bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + // Early exit in the common case of non-AMX code. 
+ if (X86FI->getAMXProgModel() != AMXProgModelEnum::ManagedRA) + return false; + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); const TargetRegisterInfo *TRI = ST.getRegisterInfo(); const TargetInstrInfo *TII = ST.getInstrInfo(); diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 5e91cce1068b..7503bf1561cc 100644 --- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -79,7 +79,7 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) { do { SmallVector<WeakTrackingVH, 8> WUsers(CE->users()); llvm::sort(WUsers); - WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end()); + WUsers.erase(llvm::unique(WUsers), WUsers.end()); while (!WUsers.empty()) if (WeakTrackingVH WU = WUsers.pop_back_val()) { if (PHINode *PN = dyn_cast<PHINode>(WU)) { diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp index eaf046630299..b0ce624a495f 100644 --- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp +++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp @@ -572,7 +572,7 @@ ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands, case AsmToken::Integer: if (!SR) return ParseStatus::NoMatch; - RegName = StringRef(std::to_string(getLexer().getTok().getIntVal())); + RegName = getLexer().getTok().getString(); RegNo = MatchRegisterName(RegName); if (RegNo == 0) RegNo = MatchRegisterAltName(RegName); diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp index e222919b28dc..3f99387f759d 100644 --- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp +++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCSectionELF.h" 
#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -42,9 +43,20 @@ getModifierVariantKind(XtensaCP::XtensaCPModifier Modifier) { } void XtensaAsmPrinter::emitInstruction(const MachineInstr *MI) { - MCInst LoweredMI; - lowerToMCInst(MI, LoweredMI); - EmitToStreamer(*OutStreamer, LoweredMI); + unsigned Opc = MI->getOpcode(); + + switch (Opc) { + case Xtensa::BR_JT: + EmitToStreamer( + *OutStreamer, + MCInstBuilder(Xtensa::JX).addReg(MI->getOperand(0).getReg())); + return; + default: + MCInst LoweredMI; + lowerToMCInst(MI, LoweredMI); + EmitToStreamer(*OutStreamer, LoweredMI); + return; + } } void XtensaAsmPrinter::emitMachineConstantPoolValue( @@ -52,16 +64,27 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue( XtensaConstantPoolValue *ACPV = static_cast<XtensaConstantPoolValue *>(MCPV); MCSymbol *MCSym; - assert(ACPV->isExtSymbol() && "unrecognized constant pool value"); - - XtensaConstantPoolSymbol *XtensaSym = cast<XtensaConstantPoolSymbol>(ACPV); - const char *Sym = XtensaSym->getSymbol(); - std::string SymName(Sym); - - if (XtensaSym->isPrivateLinkage()) - SymName = ".L" + SymName; + if (ACPV->isBlockAddress()) { + const BlockAddress *BA = + cast<XtensaConstantPoolConstant>(ACPV)->getBlockAddress(); + MCSym = GetBlockAddressSymbol(BA); + } else if (ACPV->isJumpTable()) { + unsigned Idx = cast<XtensaConstantPoolJumpTable>(ACPV)->getIndex(); + MCSym = this->GetJTISymbol(Idx, false); + } else { + assert(ACPV->isExtSymbol() && "unrecognized constant pool value"); + XtensaConstantPoolSymbol *XtensaSym = cast<XtensaConstantPoolSymbol>(ACPV); + const char *SymName = XtensaSym->getSymbol(); + + if (XtensaSym->isPrivateLinkage()) { + const DataLayout &DL = getDataLayout(); + MCSym = OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + SymName); + } else { + MCSym = OutContext.getOrCreateSymbol(SymName); + } + } - MCSym = GetExternalSymbolSymbol(StringRef(SymName)); MCSymbol *LblSym = GetCPISymbol(ACPV->getLabelId()); auto *TS = 
static_cast<XtensaTargetStreamer *>(OutStreamer->getTargetStreamer()); @@ -71,7 +94,7 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue( std::string SymName(MCSym->getName()); StringRef Modifier = ACPV->getModifierText(); SymName += Modifier; - MCSym = GetExternalSymbolSymbol(StringRef(SymName)); + MCSym = OutContext.getOrCreateSymbol(SymName); } const MCExpr *Expr = MCSymbolRefExpr::create(MCSym, VK, OutContext); @@ -140,6 +163,10 @@ XtensaAsmPrinter::GetConstantPoolIndexSymbol(const MachineOperand &MO) const { return GetCPISymbol(MO.getIndex()); } +MCSymbol *XtensaAsmPrinter::GetJumpTableSymbol(const MachineOperand &MO) const { + return GetJTISymbol(MO.getIndex()); +} + MCOperand XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO, MachineOperand::MachineOperandType MOTy, @@ -152,6 +179,20 @@ XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO, Symbol = getSymbol(MO.getGlobal()); Offset += MO.getOffset(); break; + case MachineOperand::MO_MachineBasicBlock: + Symbol = MO.getMBB()->getSymbol(); + break; + case MachineOperand::MO_BlockAddress: + Symbol = GetBlockAddressSymbol(MO.getBlockAddress()); + Offset += MO.getOffset(); + break; + case MachineOperand::MO_ExternalSymbol: + Symbol = GetExternalSymbolSymbol(MO.getSymbolName()); + Offset += MO.getOffset(); + break; + case MachineOperand::MO_JumpTableIndex: + Symbol = GetJumpTableSymbol(MO); + break; case MachineOperand::MO_ConstantPoolIndex: Symbol = GetConstantPoolIndexSymbol(MO); Offset += MO.getOffset(); @@ -191,6 +232,10 @@ MCOperand XtensaAsmPrinter::lowerOperand(const MachineOperand &MO, case MachineOperand::MO_RegisterMask: break; case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_BlockAddress: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_JumpTableIndex: case MachineOperand::MO_ConstantPoolIndex: return LowerSymbolOperand(MO, MOTy, Offset); default: diff --git 
a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h index f3fec19724aa..f9cf5ae8c9f6 100644 --- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h +++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.h @@ -44,6 +44,8 @@ public: MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const; + MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MachineOperand::MachineOperandType MOTy, unsigned Offset) const; diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index 6c3258b4bb46..650979301250 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -77,6 +77,18 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM, } setOperationAction(ISD::ConstantPool, PtrVT, Custom); + setOperationAction(ISD::GlobalAddress, PtrVT, Custom); + setOperationAction(ISD::BlockAddress, PtrVT, Custom); + setOperationAction(ISD::JumpTable, PtrVT, Custom); + + // Expand jump table branches as address arithmetic followed by an + // indirect jump. + setOperationAction(ISD::BR_JT, MVT::Other, Custom); + + setOperationPromotedToType(ISD::BR_CC, MVT::i1, MVT::i32); + setOperationAction(ISD::BR_CC, MVT::i32, Legal); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); // Implement custom stack allocations setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); @@ -88,6 +100,12 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM, computeRegisterProperties(STI.getRegisterInfo()); } +bool XtensaTargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode *GA) const { + // The Xtensa target isn't yet aware of offsets. 
+ return false; +} + //===----------------------------------------------------------------------===// // Calling conventions //===----------------------------------------------------------------------===// @@ -519,6 +537,72 @@ SDValue XtensaTargetLowering::LowerImmediate(SDValue Op, return Op; } +SDValue XtensaTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + const GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); + SDLoc DL(Op); + auto PtrVT = Op.getValueType(); + const GlobalValue *GV = G->getGlobal(); + + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); + SDValue CPWrap = getAddrPCRel(CPAddr, DAG); + + return CPWrap; +} + +SDValue XtensaTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + BlockAddressSDNode *Node = cast<BlockAddressSDNode>(Op); + const BlockAddress *BA = Node->getBlockAddress(); + EVT PtrVT = Op.getValueType(); + + XtensaConstantPoolValue *CPV = + XtensaConstantPoolConstant::Create(BA, 0, XtensaCP::CPBlockAddress); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + SDValue CPWrap = getAddrPCRel(CPAddr, DAG); + + return CPWrap; +} + +SDValue XtensaTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Table = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + SDLoc DL(Op); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); + MachineFunction &MF = DAG.getMachineFunction(); + const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32); + const DataLayout &TD = DAG.getDataLayout(); + EVT PtrVT = Table.getValueType(); + unsigned EntrySize = MJTI->getEntrySize(TD); + + Index = DAG.getNode(ISD::MUL, DL, Index.getValueType(), Index, + DAG.getConstant(EntrySize, DL, Index.getValueType())); + SDValue Addr = DAG.getNode(ISD::ADD, DL, Index.getValueType(), Index, Table); + SDValue LD = + DAG.getLoad(PtrVT, DL, Chain, Addr, + 
MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); + + return DAG.getNode(XtensaISD::BR_JT, DL, MVT::Other, LD.getValue(1), LD, + TargetJT); +} + +SDValue XtensaTargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + EVT PtrVT = Op.getValueType(); + + // Create a constant pool entry for the callee address + XtensaConstantPoolValue *CPV = + XtensaConstantPoolJumpTable::Create(*DAG.getContext(), JT->getIndex()); + + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); + + return getAddrPCRel(CPAddr, DAG); +} + SDValue XtensaTargetLowering::getAddrPCRel(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -580,8 +664,16 @@ SDValue XtensaTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue XtensaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { + case ISD::BR_JT: + return LowerBR_JT(Op, DAG); case ISD::Constant: return LowerImmediate(Op, DAG); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::BlockAddress: + return LowerBlockAddress(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG); case ISD::STACKSAVE: @@ -597,6 +689,8 @@ SDValue XtensaTargetLowering::LowerOperation(SDValue Op, const char *XtensaTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { + case XtensaISD::BR_JT: + return "XtensaISD::BR_JT"; case XtensaISD::CALL: return "XtensaISD::CALL"; case XtensaISD::PCREL_WRAPPER: diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.h b/llvm/lib/Target/Xtensa/XtensaISelLowering.h index 6f6ec391430a..23a0217daaa9 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.h +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.h @@ -23,6 +23,7 @@ namespace llvm { namespace XtensaISD { enum { FIRST_NUMBER = ISD::BUILTIN_OP_END, + BR_JT, 
// Calls a function. Operand 0 is the chain operand and operand 1 // is the target address. The arguments start at operand 2. @@ -43,6 +44,8 @@ public: explicit XtensaTargetLowering(const TargetMachine &TM, const XtensaSubtarget &STI); + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -71,8 +74,16 @@ public: private: const XtensaSubtarget &Subtarget; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerImmediate(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index 6e9e75257ccf..f68d20dcdd54 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -416,6 +416,15 @@ def BBSI : RRI8_Inst<0x07, (outs), let imm8 = target; } +def : Pat<(brcc SETGT, AR:$s, AR:$t, bb:$target), + (BLT AR:$t, AR:$s, bb:$target)>; +def : Pat<(brcc SETUGT, AR:$s, AR:$t, bb:$target), + (BLTU AR:$t, AR:$s, bb:$target)>; +def : Pat<(brcc SETLE, AR:$s, AR:$t, bb:$target), + (BGE AR:$t, AR:$s, bb:$target)>; +def : Pat<(brcc SETULE, AR:$s, AR:$t, bb:$target), + (BGEU AR:$t, AR:$s, bb:$target)>; + //===----------------------------------------------------------------------===// // Call and jump instructions //===----------------------------------------------------------------------===// @@ -471,6 +480,12 @@ def : Pat<(Xtensa_call (i32 texternalsym:$dst)), def : Pat<(Xtensa_call AR:$dst), (CALLX0 AR:$dst)>; +let isBranch = 1, isTerminator = 1, 
isBarrier = 1, isIndirectBranch = 1, Size = 3 in { + def BR_JT: Pseudo<(outs), (ins AR:$s, i32imm:$jt), + "!br_jt_p, $s, $jt", + [(Xtensa_brjt AR:$s, tjumptable:$jt)]>; +} + //===----------------------------------------------------------------------===// // Mem barrier instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Xtensa/XtensaOperators.td b/llvm/lib/Target/Xtensa/XtensaOperators.td index cd4d831c85b5..88d3c9dfe7fd 100644 --- a/llvm/lib/Target/Xtensa/XtensaOperators.td +++ b/llvm/lib/Target/Xtensa/XtensaOperators.td @@ -17,6 +17,8 @@ def SDT_XtensaWrapPtr : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; +def SDT_XtensaBrJT : SDTypeProfile<0, 2, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; //===----------------------------------------------------------------------===// // Node definitions //===----------------------------------------------------------------------===// @@ -34,3 +36,5 @@ def Xtensa_callseq_start: SDNode<"ISD::CALLSEQ_START", SDT_XtensaCallSeqStart, def Xtensa_callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_XtensaCallSeqEnd, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; + +def Xtensa_brjt: SDNode<"XtensaISD::BR_JT", SDT_XtensaBrJT, [SDNPHasChain]>; |
