diff options
Diffstat (limited to 'llvm/lib')
616 files changed, 25151 insertions, 15170 deletions
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 86a2edbd8bd4..de37c391cf25 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -103,12 +103,15 @@ static std::optional<TypeSize> getObjectSize(const Value *V, const TargetLibraryInfo &TLI, bool NullIsValidLoc, bool RoundToAlign = false) { - uint64_t Size; ObjectSizeOpts Opts; Opts.RoundToAlign = RoundToAlign; Opts.NullIsUnknownSize = NullIsValidLoc; - if (getObjectSize(V, Size, DL, &TLI, Opts)) - return TypeSize::getFixed(Size); + if (std::optional<TypeSize> Size = getBaseObjectSize(V, DL, &TLI, Opts)) { + // FIXME: Remove this check, only exists to preserve previous behavior. + if (Size->isScalable()) + return std::nullopt; + return Size; + } return std::nullopt; } @@ -227,9 +230,9 @@ EarliestEscapeAnalysis::getCapturesBefore(const Value *Object, auto Iter = EarliestEscapes.try_emplace(Object); if (Iter.second) { std::pair<Instruction *, CaptureComponents> EarliestCapture = - FindEarliestCapture( - Object, *const_cast<Function *>(DT.getRoot()->getParent()), - /*ReturnCaptures=*/false, DT, CaptureComponents::Provenance); + FindEarliestCapture(Object, *DT.getRoot()->getParent(), + /*ReturnCaptures=*/false, DT, + CaptureComponents::Provenance); if (EarliestCapture.first) Inst2Obj[EarliestCapture.first].push_back(Object); Iter.first->second = EarliestCapture; diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 2148431c1acc..a136e8718435 100644..100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1652,6 +1652,13 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::amdgcn_perm: case Intrinsic::amdgcn_wave_reduce_umin: case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_max: + case Intrinsic::amdgcn_wave_reduce_min: + case Intrinsic::amdgcn_wave_reduce_add: + 
case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_and: + case Intrinsic::amdgcn_wave_reduce_or: + case Intrinsic::amdgcn_wave_reduce_xor: case Intrinsic::amdgcn_s_wqm: case Intrinsic::amdgcn_s_quadmask: case Intrinsic::amdgcn_s_bitreplicate: @@ -3672,6 +3679,13 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty, return ConstantInt::get(Ty, C0->abs()); case Intrinsic::amdgcn_wave_reduce_umin: case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_max: + case Intrinsic::amdgcn_wave_reduce_min: + case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_and: + case Intrinsic::amdgcn_wave_reduce_or: + case Intrinsic::amdgcn_wave_reduce_xor: return dyn_cast<Constant>(Operands[0]); } @@ -4608,4 +4622,55 @@ bool llvm::isMathLibCallNoop(const CallBase *Call, return false; } +Constant *llvm::getLosslessInvCast(Constant *C, Type *InvCastTo, + unsigned CastOp, const DataLayout &DL, + PreservedCastFlags *Flags) { + switch (CastOp) { + case Instruction::BitCast: + // Bitcast is always lossless. + return ConstantFoldCastOperand(Instruction::BitCast, C, InvCastTo, DL); + case Instruction::Trunc: { + auto *ZExtC = ConstantFoldCastOperand(Instruction::ZExt, C, InvCastTo, DL); + if (Flags) { + // Truncation back on ZExt value is always NUW. + Flags->NUW = true; + // Test positivity of C. + auto *SExtC = + ConstantFoldCastOperand(Instruction::SExt, C, InvCastTo, DL); + Flags->NSW = ZExtC == SExtC; + } + return ZExtC; + } + case Instruction::SExt: + case Instruction::ZExt: { + auto *InvC = ConstantExpr::getTrunc(C, InvCastTo); + auto *CastInvC = ConstantFoldCastOperand(CastOp, InvC, C->getType(), DL); + // Must satisfy CastOp(InvC) == C. 
+ if (!CastInvC || CastInvC != C) + return nullptr; + if (Flags && CastOp == Instruction::ZExt) { + auto *SExtInvC = + ConstantFoldCastOperand(Instruction::SExt, InvC, C->getType(), DL); + // Test positivity of InvC. + Flags->NNeg = CastInvC == SExtInvC; + } + return InvC; + } + default: + return nullptr; + } +} + +Constant *llvm::getLosslessUnsignedTrunc(Constant *C, Type *DestTy, + const DataLayout &DL, + PreservedCastFlags *Flags) { + return getLosslessInvCast(C, DestTy, Instruction::ZExt, DL, Flags); +} + +Constant *llvm::getLosslessSignedTrunc(Constant *C, Type *DestTy, + const DataLayout &DL, + PreservedCastFlags *Flags) { + return getLosslessInvCast(C, DestTy, Instruction::SExt, DL, Flags); +} + void TargetFolder::anchor() {} diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 3a7066602924..b78cc03e34db 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -786,7 +786,7 @@ StringRef dxil::getResourceNameFromBindingCall(CallInst *CI) { llvm_unreachable("unexpected handle creation intrinsic"); case Intrinsic::dx_resource_handlefrombinding: case Intrinsic::dx_resource_handlefromimplicitbinding: - Op = CI->getArgOperand(5); + Op = CI->getArgOperand(4); break; } @@ -1010,7 +1010,7 @@ void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) { cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); int32_t Size = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue(); - Value *Name = CI->getArgOperand(5); + Value *Name = CI->getArgOperand(4); // negative size means unbounded resource array; // upper bound register overflow should be detected in Sema diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp index 762d9191aab1..4064b25d9d4e 100644 --- a/llvm/lib/Analysis/Delinearization.cpp +++ b/llvm/lib/Analysis/Delinearization.cpp @@ -182,7 +182,7 @@ void llvm::collectParametricTerms(ScalarEvolution &SE, const SCEV *Expr, LLVM_DEBUG({ 
dbgs() << "Strides:\n"; for (const SCEV *S : Strides) - dbgs() << *S << "\n"; + dbgs().indent(2) << *S << "\n"; }); for (const SCEV *S : Strides) { @@ -193,7 +193,7 @@ void llvm::collectParametricTerms(ScalarEvolution &SE, const SCEV *Expr, LLVM_DEBUG({ dbgs() << "Terms:\n"; for (const SCEV *T : Terms) - dbgs() << *T << "\n"; + dbgs().indent(2) << *T << "\n"; }); SCEVCollectAddRecMultiplies MulCollector(Terms, SE); @@ -294,7 +294,7 @@ void llvm::findArrayDimensions(ScalarEvolution &SE, LLVM_DEBUG({ dbgs() << "Terms:\n"; for (const SCEV *T : Terms) - dbgs() << *T << "\n"; + dbgs().indent(2) << *T << "\n"; }); // Remove duplicates. @@ -325,7 +325,7 @@ void llvm::findArrayDimensions(ScalarEvolution &SE, LLVM_DEBUG({ dbgs() << "Terms after sorting:\n"; for (const SCEV *T : NewTerms) - dbgs() << *T << "\n"; + dbgs().indent(2) << *T << "\n"; }); if (NewTerms.empty() || !findArrayDimensionsRec(SE, NewTerms, Sizes)) { @@ -339,7 +339,7 @@ void llvm::findArrayDimensions(ScalarEvolution &SE, LLVM_DEBUG({ dbgs() << "Sizes:\n"; for (const SCEV *S : Sizes) - dbgs() << *S << "\n"; + dbgs().indent(2) << *S << "\n"; }); } @@ -354,18 +354,24 @@ void llvm::computeAccessFunctions(ScalarEvolution &SE, const SCEV *Expr, if (!AR->isAffine()) return; + LLVM_DEBUG(dbgs() << "\ncomputeAccessFunctions\n" + << "Memory Access Function: " << *Expr << "\n"); + const SCEV *Res = Expr; int Last = Sizes.size() - 1; + for (int i = Last; i >= 0; i--) { + const SCEV *Size = Sizes[i]; const SCEV *Q, *R; - SCEVDivision::divide(SE, Res, Sizes[i], &Q, &R); + + SCEVDivision::divide(SE, Res, Size, &Q, &R); LLVM_DEBUG({ - dbgs() << "Res: " << *Res << "\n"; - dbgs() << "Sizes[i]: " << *Sizes[i] << "\n"; - dbgs() << "Res divided by Sizes[i]:\n"; - dbgs() << "Quotient: " << *Q << "\n"; - dbgs() << "Remainder: " << *R << "\n"; + dbgs() << "Computing 'MemAccFn / Sizes[" << i << "]':\n"; + dbgs() << " MemAccFn: " << *Res << "\n"; + dbgs() << " Sizes[" << i << "]: " << *Size << "\n"; + dbgs() << " Quotient 
(Leftover): " << *Q << "\n"; + dbgs() << " Remainder (Subscript Access Function): " << *R << "\n"; }); Res = Q; @@ -397,7 +403,8 @@ void llvm::computeAccessFunctions(ScalarEvolution &SE, const SCEV *Expr, LLVM_DEBUG({ dbgs() << "Subscripts:\n"; for (const SCEV *S : Subscripts) - dbgs() << *S << "\n"; + dbgs().indent(2) << *S << "\n"; + dbgs() << "\n"; }); } @@ -469,21 +476,6 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr, // Third step: compute the access functions for each subscript. computeAccessFunctions(SE, Expr, Subscripts, Sizes); - - if (Subscripts.empty()) - return; - - LLVM_DEBUG({ - dbgs() << "succeeded to delinearize " << *Expr << "\n"; - dbgs() << "ArrayDecl[UnknownSize]"; - for (const SCEV *S : Sizes) - dbgs() << "[" << *S << "]"; - - dbgs() << "\nArrayRef"; - for (const SCEV *S : Subscripts) - dbgs() << "[" << *S << "]"; - dbgs() << "\n"; - }); } static std::optional<APInt> tryIntoAPInt(const SCEV *S) { @@ -671,6 +663,7 @@ bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE, assert(Subscripts.empty() && Sizes.empty() && "Expected output lists to be empty on entry to this function."); assert(GEP && "getIndexExpressionsFromGEP called with a null GEP"); + LLVM_DEBUG(dbgs() << "\nGEP to delinearize: " << *GEP << "\n"); Type *Ty = nullptr; bool DroppedFirstDim = false; for (unsigned i = 1; i < GEP->getNumOperands(); i++) { @@ -688,6 +681,8 @@ bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE, auto *ArrayTy = dyn_cast<ArrayType>(Ty); if (!ArrayTy) { + LLVM_DEBUG(dbgs() << "GEP delinearize failed: " << *Ty + << " is not an array type.\n"); Subscripts.clear(); Sizes.clear(); return false; @@ -699,6 +694,13 @@ bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE, Ty = ArrayTy->getElementType(); } + LLVM_DEBUG({ + dbgs() << "Subscripts:\n"; + for (const SCEV *S : Subscripts) + dbgs() << *S << "\n"; + dbgs() << "\n"; + }); + return !Subscripts.empty(); } @@ -769,7 +771,6 @@ void printDelinearization(raw_ostream &O, Function 
*F, LoopInfo *LI, O << "\n"; O << "Inst:" << Inst << "\n"; - O << "In Loop with Header: " << L->getHeader()->getName() << "\n"; O << "AccessFunction: " << *AccessFn << "\n"; SmallVector<const SCEV *, 3> Subscripts, Sizes; diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index f33e04e804e3..da86a8d2cc9c 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3419,13 +3419,24 @@ bool DependenceInfo::tryDelinearizeFixedSize( size_t SSize = Subscripts.size(); for (size_t I = 1; I < SSize; ++I) { const SCEV *S = Subscripts[I]; - if (!isKnownNonNegative(S, Ptr)) + if (!isKnownNonNegative(S, Ptr)) { + LLVM_DEBUG({ + dbgs() << "Check failed: !isKnownNonNegative(S, Ptr)\n"; + dbgs() << " S: " << *S << "\n" << " Ptr: " << *Ptr << "\n"; + }); return false; + } if (auto *SType = dyn_cast<IntegerType>(S->getType())) { const SCEV *Range = SE->getConstant( ConstantInt::get(SType, DimensionSizes[I - 1], false)); - if (!isKnownLessThan(S, Range)) + if (!isKnownLessThan(S, Range)) { + LLVM_DEBUG({ + dbgs() << "Check failed: !isKnownLessThan(S, Range)\n"; + dbgs() << " S: " << *S << "\n" + << " Range: " << *Range << "\n"; + }); return false; + } } } return true; @@ -3433,6 +3444,7 @@ bool DependenceInfo::tryDelinearizeFixedSize( if (!AllIndicesInRange(SrcSizes, SrcSubscripts, SrcPtr) || !AllIndicesInRange(DstSizes, DstSubscripts, DstPtr)) { + LLVM_DEBUG(dbgs() << "Check failed: AllIndicesInRange.\n"); SrcSubscripts.clear(); DstSubscripts.clear(); return false; @@ -3500,17 +3512,27 @@ bool DependenceInfo::tryDelinearizeParametricSize( // to the dependency checks. 
if (!DisableDelinearizationChecks) for (size_t I = 1; I < Size; ++I) { - if (!isKnownNonNegative(SrcSubscripts[I], SrcPtr)) - return false; - - if (!isKnownLessThan(SrcSubscripts[I], Sizes[I - 1])) - return false; - - if (!isKnownNonNegative(DstSubscripts[I], DstPtr)) - return false; + bool SNN = isKnownNonNegative(SrcSubscripts[I], SrcPtr); + bool DNN = isKnownNonNegative(DstSubscripts[I], DstPtr); + bool SLT = isKnownLessThan(SrcSubscripts[I], Sizes[I - 1]); + bool DLT = isKnownLessThan(DstSubscripts[I], Sizes[I - 1]); + if (SNN && DNN && SLT && DLT) + continue; - if (!isKnownLessThan(DstSubscripts[I], Sizes[I - 1])) - return false; + LLVM_DEBUG({ + dbgs() << "Delinearization checks failed: can't prove the following\n"; + if (!SNN) + dbgs() << " isKnownNonNegative(" << *SrcSubscripts[I] << ")\n"; + if (!DNN) + dbgs() << " isKnownNonNegative(" << *DstSubscripts[I] << ")\n"; + if (!SLT) + dbgs() << " isKnownLessThan(" << *SrcSubscripts[I] << ", " + << *Sizes[I - 1] << ")\n"; + if (!DLT) + dbgs() << " isKnownLessThan(" << *DstSubscripts[I] << ", " + << *Sizes[I - 1] << ")\n"; + }); + return false; } return true; diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp index 790e00e1b3b0..67e38ab8b35a 100644 --- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp +++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp @@ -97,7 +97,8 @@ struct InlineEvent { /// Collect data we may use for training a model. class TrainingLogger final { public: - TrainingLogger(StringRef LogFileName, const ModelUnderTrainingRunner *MUTR); + TrainingLogger(StringRef LogFileName, const ModelUnderTrainingRunner *MUTR, + const std::vector<TensorSpec> &FeatureMap); /// Log one inlining event. 
void logInlineEvent(const InlineEvent &Event, @@ -106,6 +107,8 @@ public: private: StringRef LogFileName; const ModelUnderTrainingRunner *const MUTR; + const std::vector<TensorSpec> &FeatureMap; + std::unique_ptr<Logger> L; BitVector Effects; /// Set these 2 clearly OOB, to make sure we set them later. @@ -142,9 +145,10 @@ class DevelopmentModeMLInlineAdvisor : public MLInlineAdvisor { public: DevelopmentModeMLInlineAdvisor( Module &M, ModuleAnalysisManager &MAM, - std::unique_ptr<MLModelRunner> ModelRunner, - std::function<bool(CallBase &)> GetDefaultAdvice, - std::unique_ptr<TrainingLogger> Logger); + std::function< + std::unique_ptr<MLModelRunner>(const std::vector<TensorSpec> &)> + GetModelRunner, + std::function<bool(CallBase &)> GetDefaultAdvice); size_t getTotalSizeEstimate(); @@ -258,9 +262,13 @@ static const std::vector<TensorSpec> TrainingOnlyFeatures{ TensorSpec::createSpec<float>(TFFeedPrefix + "reward", {1}), TensorSpec::createSpec<int32_t>(TFFeedPrefix + "step_type", {1})}; -static const std::vector<TensorSpec> getInputFeatures() { +// add TFFeedPrefix to the names and also add the "TrainingOnlyFeatures" which +// the model runner needs to see present. We don't set them ourselves or +// interact with them. 
+static const std::vector<TensorSpec> +convertInputFeatures(const std::vector<TensorSpec> &OriginalFeatures) { std::vector<TensorSpec> InputSpecs; - for (const auto &Feature : FeatureMap) + for (const auto &Feature : OriginalFeatures) InputSpecs.push_back(TensorSpec(TFFeedPrefix + Feature.name(), Feature)); append_range(InputSpecs, TrainingOnlyFeatures); return InputSpecs; @@ -269,8 +277,9 @@ static const std::vector<TensorSpec> getInputFeatures() { } // namespace TrainingLogger::TrainingLogger(StringRef LogFileName, - const ModelUnderTrainingRunner *MUTR) - : LogFileName(LogFileName), MUTR(MUTR) { + const ModelUnderTrainingRunner *MUTR, + const std::vector<TensorSpec> &FeatureMap) + : LogFileName(LogFileName), MUTR(MUTR), FeatureMap(FeatureMap) { // The first output is the inlining decision. std::vector<TensorSpec> FT(FeatureMap.begin(), FeatureMap.end()); @@ -298,8 +307,7 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event, const MLModelRunner &ModelRunner) { L->startObservation(); size_t CurrentFeature = 0; - size_t FeatureMapSize = FeatureMap.size(); - for (; CurrentFeature < FeatureMapSize; ++CurrentFeature) + for (; CurrentFeature < FeatureMap.size(); ++CurrentFeature) L->logTensorValue(CurrentFeature, reinterpret_cast<const char *>( ModelRunner.getTensorUntyped(CurrentFeature))); @@ -327,15 +335,19 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event, DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor( Module &M, ModuleAnalysisManager &MAM, - std::unique_ptr<MLModelRunner> ModelRunner, - std::function<bool(CallBase &)> GetDefaultAdvice, - std::unique_ptr<TrainingLogger> Logger) - : MLInlineAdvisor(M, MAM, std::move(ModelRunner), GetDefaultAdvice), + std::function< + std::unique_ptr<MLModelRunner>(const std::vector<TensorSpec> &)> + GetModelRunner, + std::function<bool(CallBase &)> GetDefaultAdvice) + : MLInlineAdvisor(M, MAM, GetModelRunner, GetDefaultAdvice), 
IsDoingInference(isa<ModelUnderTrainingRunner>(getModelRunner())), - Logger(std::move(Logger)), InitialNativeSize(isLogging() ? getTotalSizeEstimate() : 0), CurrentNativeSize(InitialNativeSize) { // We cannot have the case of neither inference nor logging. + if (!TrainingLog.empty()) + Logger = std::make_unique<TrainingLogger>( + TrainingLog, dyn_cast<ModelUnderTrainingRunner>(ModelRunner.get()), + getFeatureMap()); assert(IsDoingInference || isLogging()); } @@ -401,21 +413,22 @@ std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor( Module &M, ModuleAnalysisManager &MAM, std::function<bool(CallBase &)> GetDefaultAdvice) { auto &Ctx = M.getContext(); - std::unique_ptr<MLModelRunner> Runner; - if (TFModelUnderTrainingPath.empty()) - Runner.reset(new NoInferenceModelRunner(Ctx, getInputFeatures())); - else - Runner = ModelUnderTrainingRunner::createAndEnsureValid( - Ctx, TFModelUnderTrainingPath, DecisionName, getInputFeatures(), - TFOutputSpecOverride); - if (!Runner) - return nullptr; - std::unique_ptr<TrainingLogger> Logger; - if (!TrainingLog.empty()) - Logger = std::make_unique<TrainingLogger>( - TrainingLog, dyn_cast<ModelUnderTrainingRunner>(Runner.get())); - - return std::make_unique<DevelopmentModeMLInlineAdvisor>( - M, MAM, std::move(Runner), GetDefaultAdvice, std::move(Logger)); + auto RunnerFactory = [&](const std::vector<TensorSpec> &InputFeatures) + -> std::unique_ptr<MLModelRunner> { + std::unique_ptr<MLModelRunner> Runner; + const std::vector<TensorSpec> ConvertedFeatures = + convertInputFeatures(InputFeatures); + if (TFModelUnderTrainingPath.empty()) + Runner.reset(new NoInferenceModelRunner(Ctx, ConvertedFeatures)); + else + Runner = ModelUnderTrainingRunner::createAndEnsureValid( + Ctx, TFModelUnderTrainingPath, DecisionName, ConvertedFeatures, + TFOutputSpecOverride); + if (!Runner) + return nullptr; + return Runner; + }; + return std::make_unique<DevelopmentModeMLInlineAdvisor>(M, MAM, RunnerFactory, + GetDefaultAdvice); } #endif // 
defined(LLVM_HAVE_TFLITE) diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp index 92c9e37dbb48..5d7ee1fe8eb1 100644 --- a/llvm/lib/Analysis/HashRecognize.cpp +++ b/llvm/lib/Analysis/HashRecognize.cpp @@ -8,8 +8,10 @@ // // The HashRecognize analysis recognizes unoptimized polynomial hash functions // with operations over a Galois field of characteristic 2, also called binary -// fields, or GF(2^n): this class of hash functions can be optimized using a -// lookup-table-driven implementation, or with target-specific instructions. +// fields, or GF(2^n). 2^n is termed the order of the Galois field. This class +// of hash functions can be optimized using a lookup-table-driven +// implementation, or with target-specific instructions. +// // Examples: // // 1. Cyclic redundancy check (CRC), which is a polynomial division in GF(2). @@ -24,12 +26,10 @@ // // c_m * x^m + c_(m-1) * x^(m-1) + ... + c_0 * x^0 // -// where each coefficient c is can take values in GF(2^n), where 2^n is termed -// the order of the Galois field. For GF(2), each coefficient can take values -// either 0 or 1, and the polynomial is simply represented by m+1 bits, -// corresponding to the coefficients. The different variants of CRC are named by -// degree of generating polynomial used: so CRC-32 would use a polynomial of -// degree 32. +// where each coefficient c is can take values 0 or 1. The polynomial is simply +// represented by m+1 bits, corresponding to the coefficients. The different +// variants of CRC are named by degree of generating polynomial used: so CRC-32 +// would use a polynomial of degree 32. // // The reason algorithms on GF(2^n) can be optimized with a lookup-table is the // following: in such fields, polynomial addition and subtraction are identical @@ -73,202 +73,31 @@ using namespace SCEVPatternMatch; #define DEBUG_TYPE "hash-recognize" -// KnownBits for a PHI node. 
There are at most two PHI nodes, corresponding to -// the Simple Recurrence and Conditional Recurrence. The IndVar PHI is not -// relevant. -using KnownPhiMap = SmallDenseMap<const PHINode *, KnownBits, 2>; - -// A pair of a PHI node along with its incoming value from within a loop. -using PhiStepPair = std::pair<const PHINode *, const Instruction *>; - -/// A much simpler version of ValueTracking, in that it computes KnownBits of -/// values, except that it computes the evolution of KnownBits in a loop with a -/// given trip count, and predication is specialized for a significant-bit -/// check. -class ValueEvolution { - const unsigned TripCount; - const bool ByteOrderSwapped; - APInt GenPoly; - StringRef ErrStr; - - // Compute the KnownBits of a BinaryOperator. - KnownBits computeBinOp(const BinaryOperator *I); - - // Compute the KnownBits of an Instruction. - KnownBits computeInstr(const Instruction *I); - - // Compute the KnownBits of a Value. - KnownBits compute(const Value *V); - -public: - // ValueEvolution is meant to be constructed with the TripCount of the loop, - // and a boolean indicating whether the polynomial algorithm is big-endian - // (for the significant-bit check). - ValueEvolution(unsigned TripCount, bool ByteOrderSwapped); - - // Given a list of PHI nodes along with their incoming value from within the - // loop, computeEvolutions computes the KnownBits of each of the PHI nodes on - // the final iteration. Returns true on success and false on error. - bool computeEvolutions(ArrayRef<PhiStepPair> PhiEvolutions); - - // In case ValueEvolution encounters an error, this is meant to be used for a - // precise error message. - StringRef getError() const { return ErrStr; } - - // A set of Instructions visited by ValueEvolution. The only unvisited - // instructions will be ones not on the use-def chain of the PHIs' evolutions. 
+/// Checks if there's a stray instruction in the loop \p L outside of the +/// use-def chains from \p Roots, or if we escape the loop during the use-def +/// walk. +static bool containsUnreachable(const Loop &L, + ArrayRef<const Instruction *> Roots) { SmallPtrSet<const Instruction *, 16> Visited; + BasicBlock *Latch = L.getLoopLatch(); - // The computed KnownBits for each PHI node, which is populated after - // computeEvolutions is called. - KnownPhiMap KnownPhis; -}; - -ValueEvolution::ValueEvolution(unsigned TripCount, bool ByteOrderSwapped) - : TripCount(TripCount), ByteOrderSwapped(ByteOrderSwapped) {} - -KnownBits ValueEvolution::computeBinOp(const BinaryOperator *I) { - KnownBits KnownL(compute(I->getOperand(0))); - KnownBits KnownR(compute(I->getOperand(1))); - - switch (I->getOpcode()) { - case Instruction::BinaryOps::And: - return KnownL & KnownR; - case Instruction::BinaryOps::Or: - return KnownL | KnownR; - case Instruction::BinaryOps::Xor: - return KnownL ^ KnownR; - case Instruction::BinaryOps::Shl: { - auto *OBO = cast<OverflowingBinaryOperator>(I); - return KnownBits::shl(KnownL, KnownR, OBO->hasNoUnsignedWrap(), - OBO->hasNoSignedWrap()); - } - case Instruction::BinaryOps::LShr: - return KnownBits::lshr(KnownL, KnownR); - case Instruction::BinaryOps::AShr: - return KnownBits::ashr(KnownL, KnownR); - case Instruction::BinaryOps::Add: { - auto *OBO = cast<OverflowingBinaryOperator>(I); - return KnownBits::add(KnownL, KnownR, OBO->hasNoUnsignedWrap(), - OBO->hasNoSignedWrap()); - } - case Instruction::BinaryOps::Sub: { - auto *OBO = cast<OverflowingBinaryOperator>(I); - return KnownBits::sub(KnownL, KnownR, OBO->hasNoUnsignedWrap(), - OBO->hasNoSignedWrap()); - } - case Instruction::BinaryOps::Mul: { - Value *Op0 = I->getOperand(0); - Value *Op1 = I->getOperand(1); - bool SelfMultiply = Op0 == Op1 && isGuaranteedNotToBeUndef(Op0); - return KnownBits::mul(KnownL, KnownR, SelfMultiply); - } - case Instruction::BinaryOps::UDiv: - return 
KnownBits::udiv(KnownL, KnownR); - case Instruction::BinaryOps::SDiv: - return KnownBits::sdiv(KnownL, KnownR); - case Instruction::BinaryOps::URem: - return KnownBits::urem(KnownL, KnownR); - case Instruction::BinaryOps::SRem: - return KnownBits::srem(KnownL, KnownR); - default: - ErrStr = "Unknown BinaryOperator"; - unsigned BitWidth = I->getType()->getScalarSizeInBits(); - return {BitWidth}; - } -} - -KnownBits ValueEvolution::computeInstr(const Instruction *I) { - unsigned BitWidth = I->getType()->getScalarSizeInBits(); - - // computeInstr is the only entry-point that needs to update the Visited set. - Visited.insert(I); + SmallVector<const Instruction *, 16> Worklist(Roots); + while (!Worklist.empty()) { + const Instruction *I = Worklist.pop_back_val(); + Visited.insert(I); - // We look up in the map that contains the KnownBits of the PHI from the - // previous iteration. - if (const PHINode *P = dyn_cast<PHINode>(I)) - return KnownPhis.lookup_or(P, BitWidth); + if (isa<PHINode>(I)) + continue; - // Compute the KnownBits for a Select(Cmp()), forcing it to take the branch - // that is predicated on the (least|most)-significant-bit check. - CmpPredicate Pred; - Value *L, *R; - Instruction *TV, *FV; - if (match(I, m_Select(m_ICmp(Pred, m_Value(L), m_Value(R)), m_Instruction(TV), - m_Instruction(FV)))) { - Visited.insert(cast<Instruction>(I->getOperand(0))); - - // We need to check LCR against [0, 2) in the little-endian case, because - // the RCR check is insufficient: it is simply [0, 1). 
- if (!ByteOrderSwapped) { - KnownBits KnownL = compute(L); - unsigned ICmpBW = KnownL.getBitWidth(); - auto LCR = ConstantRange::fromKnownBits(KnownL, false); - auto CheckLCR = ConstantRange(APInt::getZero(ICmpBW), APInt(ICmpBW, 2)); - if (LCR != CheckLCR) { - ErrStr = "Bad LHS of significant-bit-check"; - return {BitWidth}; + for (const Use &U : I->operands()) { + if (auto *UI = dyn_cast<Instruction>(U)) { + if (!L.contains(UI)) + return true; + Worklist.push_back(UI); } } - - // Check that the predication is on (most|least) significant bit. - KnownBits KnownR = compute(R); - unsigned ICmpBW = KnownR.getBitWidth(); - auto RCR = ConstantRange::fromKnownBits(KnownR, false); - auto AllowedR = ConstantRange::makeAllowedICmpRegion(Pred, RCR); - ConstantRange CheckRCR(APInt::getZero(ICmpBW), - ByteOrderSwapped ? APInt::getSignedMinValue(ICmpBW) - : APInt(ICmpBW, 1)); - - // We only compute KnownBits of either TV or FV, as the other value would - // just be a bit-shift as checked by isBigEndianBitShift. 
- if (AllowedR == CheckRCR) { - Visited.insert(FV); - return compute(TV); - } - if (AllowedR.inverse() == CheckRCR) { - Visited.insert(TV); - return compute(FV); - } - - ErrStr = "Bad RHS of significant-bit-check"; - return {BitWidth}; - } - - if (auto *BO = dyn_cast<BinaryOperator>(I)) - return computeBinOp(BO); - - switch (I->getOpcode()) { - case Instruction::CastOps::Trunc: - return compute(I->getOperand(0)).trunc(BitWidth); - case Instruction::CastOps::ZExt: - return compute(I->getOperand(0)).zext(BitWidth); - case Instruction::CastOps::SExt: - return compute(I->getOperand(0)).sext(BitWidth); - default: - ErrStr = "Unknown Instruction"; - return {BitWidth}; } -} - -KnownBits ValueEvolution::compute(const Value *V) { - if (auto *CI = dyn_cast<ConstantInt>(V)) - return KnownBits::makeConstant(CI->getValue()); - - if (auto *I = dyn_cast<Instruction>(V)) - return computeInstr(I); - - ErrStr = "Unknown Value"; - unsigned BitWidth = V->getType()->getScalarSizeInBits(); - return {BitWidth}; -} - -bool ValueEvolution::computeEvolutions(ArrayRef<PhiStepPair> PhiEvolutions) { - for (unsigned I = 0; I < TripCount; ++I) - for (auto [Phi, Step] : PhiEvolutions) - KnownPhis.emplace_or_assign(Phi, computeInstr(Step)); - - return ErrStr.empty(); + return std::distance(Latch->begin(), Latch->end()) != Visited.size(); } /// A structure that can hold either a Simple Recurrence or a Conditional @@ -320,6 +149,62 @@ private: Instruction::BinaryOps BOWithConstOpToMatch = Instruction::BinaryOpsEnd); }; +/// Check the well-formedness of the (most|least) significant bit check given \p +/// ConditionalRecurrence, \p SimpleRecurrence, depending on \p +/// ByteOrderSwapped. We check that ConditionalRecurrence.Step is a +/// Select(Cmp()) where the compare is `>= 0` in the big-endian case, and `== 0` +/// in the little-endian case (or the inverse, in which case the branches of the +/// compare are swapped). 
We check that the LHS is (ConditionalRecurrence.Phi +/// [xor SimpleRecurrence.Phi]) in the big-endian case, and additionally check +/// for an AND with one in the little-endian case. We then check AllowedByR +/// against CheckAllowedByR, which is [0, smin) in the big-endian case, and is +/// [0, 1) in the little-endian case. CheckAllowedByR checks for +/// significant-bit-clear, and we match the corresponding arms of the select +/// against bit-shift and bit-shift-and-xor-gen-poly. +static bool +isSignificantBitCheckWellFormed(const RecurrenceInfo &ConditionalRecurrence, + const RecurrenceInfo &SimpleRecurrence, + bool ByteOrderSwapped) { + auto *SI = cast<SelectInst>(ConditionalRecurrence.Step); + CmpPredicate Pred; + const Value *L; + const APInt *R; + Instruction *TV, *FV; + if (!match(SI, m_Select(m_ICmp(Pred, m_Value(L), m_APInt(R)), + m_Instruction(TV), m_Instruction(FV)))) + return false; + + // Match predicate with or without a SimpleRecurrence (the corresponding data + // is LHSAux). + auto MatchPred = m_CombineOr( + m_Specific(ConditionalRecurrence.Phi), + m_c_Xor(m_ZExtOrTruncOrSelf(m_Specific(ConditionalRecurrence.Phi)), + m_ZExtOrTruncOrSelf(m_Specific(SimpleRecurrence.Phi)))); + bool LWellFormed = ByteOrderSwapped ? match(L, MatchPred) + : match(L, m_c_And(MatchPred, m_One())); + if (!LWellFormed) + return false; + + KnownBits KnownR = KnownBits::makeConstant(*R); + unsigned BW = KnownR.getBitWidth(); + auto RCR = ConstantRange::fromKnownBits(KnownR, false); + auto AllowedByR = ConstantRange::makeAllowedICmpRegion(Pred, RCR); + ConstantRange CheckAllowedByR(APInt::getZero(BW), + ByteOrderSwapped ? 
APInt::getSignedMinValue(BW) + : APInt(BW, 1)); + + BinaryOperator *BitShift = ConditionalRecurrence.BO; + if (AllowedByR == CheckAllowedByR) + return TV == BitShift && + match(FV, m_c_Xor(m_Specific(BitShift), + m_SpecificInt(*ConditionalRecurrence.ExtraConst))); + if (AllowedByR.inverse() == CheckAllowedByR) + return FV == BitShift && + match(TV, m_c_Xor(m_Specific(BitShift), + m_SpecificInt(*ConditionalRecurrence.ExtraConst))); + return false; +} + /// Wraps llvm::matchSimpleRecurrence. Match a simple first order recurrence /// cycle of the form: /// @@ -336,8 +221,11 @@ private: /// %BO = binop %step, %rec /// bool RecurrenceInfo::matchSimpleRecurrence(const PHINode *P) { - Phi = P; - return llvm::matchSimpleRecurrence(Phi, BO, Start, Step); + if (llvm::matchSimpleRecurrence(P, BO, Start, Step)) { + Phi = P; + return true; + } + return false; } /// Digs for a recurrence starting with \p V hitting the PHI node in a use-def @@ -459,26 +347,6 @@ PolynomialInfo::PolynomialInfo(unsigned TripCount, Value *LHS, const APInt &RHS, : TripCount(TripCount), LHS(LHS), RHS(RHS), ComputedValue(ComputedValue), ByteOrderSwapped(ByteOrderSwapped), LHSAux(LHSAux) {} -/// In the big-endian case, checks the bottom N bits against CheckFn, and that -/// the rest are unknown. In the little-endian case, checks the top N bits -/// against CheckFn, and that the rest are unknown. Callers usually call this -/// function with N = TripCount, and CheckFn checking that the remainder bits of -/// the CRC polynomial division are zero. -static bool checkExtractBits(const KnownBits &Known, unsigned N, - function_ref<bool(const KnownBits &)> CheckFn, - bool ByteOrderSwapped) { - // Check that the entire thing is a constant. - if (N == Known.getBitWidth()) - return CheckFn(Known.extractBits(N, 0)); - - // Check that the {top, bottom} N bits are not unknown and that the {bottom, - // top} N bits are known. - unsigned BitPos = ByteOrderSwapped ? 
0 : Known.getBitWidth() - N; - unsigned SwappedBitPos = ByteOrderSwapped ? N : 0; - return CheckFn(Known.extractBits(N, BitPos)) && - Known.extractBits(Known.getBitWidth() - N, SwappedBitPos).isUnknown(); -} - /// Generate a lookup table of 256 entries by interleaving the generating /// polynomial. The optimization technique of table-lookup for CRC is also /// called the Sarwate algorithm. @@ -511,8 +379,6 @@ CRCTable HashRecognize::genSarwateTable(const APInt &GenPoly, /// Checks that \p P1 and \p P2 are used together in an XOR in the use-def chain /// of \p SI's condition, ignoring any casts. The purpose of this function is to /// ensure that LHSAux from the SimpleRecurrence is used correctly in the CRC -/// computation. We cannot check the correctness of casts at this point, and -/// rely on the KnownBits propagation to check correctness of the CRC /// computation. /// /// In other words, it checks for the following pattern: @@ -540,8 +406,8 @@ static bool isConditionalOnXorOfPHIs(const SelectInst *SI, const PHINode *P1, continue; // If we match an XOR of the two PHIs ignoring casts, we're done. - if (match(I, m_c_Xor(m_CastOrSelf(m_Specific(P1)), - m_CastOrSelf(m_Specific(P2))))) + if (match(I, m_c_Xor(m_ZExtOrTruncOrSelf(m_Specific(P1)), + m_ZExtOrTruncOrSelf(m_Specific(P2))))) return true; // Continue along the use-def chain. @@ -570,10 +436,8 @@ static std::optional<bool> isBigEndianBitShift(Value *V, ScalarEvolution &SE) { } /// The main entry point for analyzing a loop and recognizing the CRC algorithm. -/// Returns a PolynomialInfo on success, and either an ErrBits or a StringRef on -/// failure. -std::variant<PolynomialInfo, ErrBits, StringRef> -HashRecognize::recognizeCRC() const { +/// Returns a PolynomialInfo on success, and a StringRef on failure. 
+std::variant<PolynomialInfo, StringRef> HashRecognize::recognizeCRC() const { if (!L.isInnermost()) return "Loop is not innermost"; BasicBlock *Latch = L.getLoopLatch(); @@ -582,7 +446,7 @@ HashRecognize::recognizeCRC() const { if (!Latch || !Exit || !IndVar || L.getNumBlocks() != 1) return "Loop not in canonical form"; unsigned TC = SE.getSmallConstantTripCount(&L); - if (!TC || TC > 256 || TC % 8) + if (!TC || TC % 8) return "Unable to find a small constant byte-multiple trip count"; auto R = getRecurrences(Latch, IndVar, L); @@ -637,36 +501,19 @@ HashRecognize::recognizeCRC() const { "Expected ExtraConst in conditional recurrence"); const APInt &GenPoly = *ConditionalRecurrence.ExtraConst; - // PhiEvolutions are pairs of PHINodes along with their incoming value from - // within the loop, which we term as their step. Note that in the case of a - // Simple Recurrence, Step is an operand of the BO, while in a Conditional - // Recurrence, it is a SelectInst. - SmallVector<PhiStepPair, 2> PhiEvolutions; - PhiEvolutions.emplace_back(ConditionalRecurrence.Phi, ComputedValue); + if (!isSignificantBitCheckWellFormed(ConditionalRecurrence, SimpleRecurrence, + *ByteOrderSwapped)) + return "Malformed significant-bit check"; + + SmallVector<const Instruction *> Roots( + {ComputedValue, + cast<Instruction>(IndVar->getIncomingValueForBlock(Latch)), + L.getLatchCmpInst(), Latch->getTerminator()}); if (SimpleRecurrence) - PhiEvolutions.emplace_back(SimpleRecurrence.Phi, SimpleRecurrence.BO); - - ValueEvolution VE(TC, *ByteOrderSwapped); - if (!VE.computeEvolutions(PhiEvolutions)) - return VE.getError(); - KnownBits ResultBits = VE.KnownPhis.at(ConditionalRecurrence.Phi); - - // There must be exactly four unvisited instructions, corresponding to the - // IndVar PHI. Any other unvisited instructions from the KnownBits propagation - // can complicate the optimization, which replaces the entire loop with the - // table-lookup version of the hash algorithm. 
- std::initializer_list<const Instruction *> AugmentVisited = { - IndVar, Latch->getTerminator(), L.getLatchCmpInst(), - cast<Instruction>(IndVar->getIncomingValueForBlock(Latch))}; - VE.Visited.insert_range(AugmentVisited); - if (std::distance(Latch->begin(), Latch->end()) != VE.Visited.size()) + Roots.push_back(SimpleRecurrence.BO); + if (containsUnreachable(L, Roots)) return "Found stray unvisited instructions"; - unsigned N = std::min(TC, ResultBits.getBitWidth()); - auto IsZero = [](const KnownBits &K) { return K.isZero(); }; - if (!checkExtractBits(ResultBits, N, IsZero, *ByteOrderSwapped)) - return ErrBits(ResultBits, TC, *ByteOrderSwapped); - return PolynomialInfo(TC, LHS, GenPoly, ComputedValue, *ByteOrderSwapped, LHSAux); } @@ -693,13 +540,6 @@ void HashRecognize::print(raw_ostream &OS) const { OS << "Did not find a hash algorithm\n"; if (std::holds_alternative<StringRef>(Ret)) OS << "Reason: " << std::get<StringRef>(Ret) << "\n"; - if (std::holds_alternative<ErrBits>(Ret)) { - auto [Actual, Iter, ByteOrderSwapped] = std::get<ErrBits>(Ret); - OS << "Reason: Expected " << (ByteOrderSwapped ? 
"bottom " : "top ") - << Iter << " bits zero ("; - Actual.print(OS); - OS << ")\n"; - } return; } diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 95f30fd3f427..99afc0601d52 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -32,11 +32,11 @@ using namespace ir2vec; #define DEBUG_TYPE "ir2vec" STATISTIC(VocabMissCounter, - "Number of lookups to entites not present in the vocabulary"); + "Number of lookups to entities not present in the vocabulary"); namespace llvm { namespace ir2vec { -static cl::OptionCategory IR2VecCategory("IR2Vec Options"); +cl::OptionCategory IR2VecCategory("IR2Vec Options"); // FIXME: Use a default vocab when not specified static cl::opt<std::string> @@ -52,6 +52,15 @@ cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional, cl::init(0.5), cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional, cl::init(0.2), cl::desc("Weight for argument embeddings"), cl::cat(IR2VecCategory)); +cl::opt<IR2VecKind> IR2VecEmbeddingKind( + "ir2vec-kind", cl::Optional, + cl::values(clEnumValN(IR2VecKind::Symbolic, "symbolic", + "Generate symbolic embeddings"), + clEnumValN(IR2VecKind::FlowAware, "flow-aware", + "Generate flow-aware embeddings")), + cl::init(IR2VecKind::Symbolic), cl::desc("IR2Vec embedding kind"), + cl::cat(IR2VecCategory)); + } // namespace ir2vec } // namespace llvm @@ -123,8 +132,12 @@ bool Embedding::approximatelyEquals(const Embedding &RHS, double Tolerance) const { assert(this->size() == RHS.size() && "Vectors must have the same dimension"); for (size_t Itr = 0; Itr < this->size(); ++Itr) - if (std::abs((*this)[Itr] - RHS[Itr]) > Tolerance) + if (std::abs((*this)[Itr] - RHS[Itr]) > Tolerance) { + LLVM_DEBUG(errs() << "Embedding mismatch at index " << Itr << ": " + << (*this)[Itr] << " vs " << RHS[Itr] + << "; Tolerance: " << Tolerance << "\n"); return false; + } return true; } @@ -141,14 +154,16 @@ void Embedding::print(raw_ostream &OS) const { Embedder::Embedder(const 
Function &F, const Vocabulary &Vocab) : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()), - OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) { -} + OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight), + FuncVector(Embedding(Dimension)) {} std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F, const Vocabulary &Vocab) { switch (Mode) { case IR2VecKind::Symbolic: return std::make_unique<SymbolicEmbedder>(F, Vocab); + case IR2VecKind::FlowAware: + return std::make_unique<FlowAwareEmbedder>(F, Vocab); } return nullptr; } @@ -180,6 +195,17 @@ const Embedding &Embedder::getFunctionVector() const { return FuncVector; } +void Embedder::computeEmbeddings() const { + if (F.isDeclaration()) + return; + + // Consider only the basic blocks that are reachable from entry + for (const BasicBlock *BB : depth_first(&F)) { + computeEmbeddings(*BB); + FuncVector += BBVecMap[BB]; + } +} + void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { Embedding BBVector(Dimension, 0); @@ -187,7 +213,7 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { for (const auto &I : BB.instructionsWithoutDebug()) { Embedding ArgEmb(Dimension, 0); for (const auto &Op : I.operands()) - ArgEmb += Vocab[Op]; + ArgEmb += Vocab[*Op]; auto InstVector = Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; InstVecMap[&I] = InstVector; @@ -196,51 +222,75 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { BBVecMap[&BB] = BBVector; } -void SymbolicEmbedder::computeEmbeddings() const { - if (F.isDeclaration()) - return; +void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { + Embedding BBVector(Dimension, 0); - // Consider only the basic blocks that are reachable from entry - for (const BasicBlock *BB : depth_first(&F)) { - computeEmbeddings(*BB); - FuncVector += BBVecMap[BB]; + // We consider only the non-debug and non-pseudo instructions + for (const auto 
&I : BB.instructionsWithoutDebug()) { + // TODO: Handle call instructions differently. + // For now, we treat them like other instructions + Embedding ArgEmb(Dimension, 0); + for (const auto &Op : I.operands()) { + // If the operand is defined elsewhere, we use its embedding + if (const auto *DefInst = dyn_cast<Instruction>(Op)) { + auto DefIt = InstVecMap.find(DefInst); + assert(DefIt != InstVecMap.end() && + "Instruction should have been processed before its operands"); + ArgEmb += DefIt->second; + continue; + } + // If the operand is not defined by an instruction, we use the vocabulary + else { + LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: " + << *Op << "=" << Vocab[*Op][0] << "\n"); + ArgEmb += Vocab[*Op]; + } + } + // Create the instruction vector by combining opcode, type, and arguments + // embeddings + auto InstVector = + Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + InstVecMap[&I] = InstVector; + BBVector += InstVector; } + BBVecMap[&BB] = BBVector; } // ==----------------------------------------------------------------------===// // Vocabulary //===----------------------------------------------------------------------===// -Vocabulary::Vocabulary(VocabVector &&Vocab) - : Vocab(std::move(Vocab)), Valid(true) {} +unsigned Vocabulary::getDimension() const { + assert(isValid() && "IR2Vec Vocabulary is invalid"); + return Vocab[0].size(); +} -bool Vocabulary::isValid() const { - return Vocab.size() == Vocabulary::expectedSize() && Valid; +unsigned Vocabulary::getSlotIndex(unsigned Opcode) { + assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); + return Opcode - 1; // Convert to zero-based index } -size_t Vocabulary::size() const { - assert(Valid && "IR2Vec Vocabulary is invalid"); - return Vocab.size(); +unsigned Vocabulary::getSlotIndex(Type::TypeID TypeID) { + assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID"); + return MaxOpcodes + 
static_cast<unsigned>(getCanonicalTypeID(TypeID)); } -unsigned Vocabulary::getDimension() const { - assert(Valid && "IR2Vec Vocabulary is invalid"); - return Vocab[0].size(); +unsigned Vocabulary::getSlotIndex(const Value &Op) { + unsigned Index = static_cast<unsigned>(getOperandKind(&Op)); + assert(Index < MaxOperandKinds && "Invalid OperandKind"); + return MaxOpcodes + MaxCanonicalTypeIDs + Index; } const Embedding &Vocabulary::operator[](unsigned Opcode) const { - assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); - return Vocab[Opcode - 1]; + return Vocab[getSlotIndex(Opcode)]; } -const Embedding &Vocabulary::operator[](Type::TypeID TypeId) const { - assert(static_cast<unsigned>(TypeId) < MaxTypeIDs && "Invalid type ID"); - return Vocab[MaxOpcodes + static_cast<unsigned>(TypeId)]; +const Embedding &Vocabulary::operator[](Type::TypeID TypeID) const { + return Vocab[getSlotIndex(TypeID)]; } -const ir2vec::Embedding &Vocabulary::operator[](const Value *Arg) const { - OperandKind ArgKind = getOperandKind(Arg); - return Vocab[MaxOpcodes + MaxTypeIDs + static_cast<unsigned>(ArgKind)]; +const ir2vec::Embedding &Vocabulary::operator[](const Value &Arg) const { + return Vocab[getSlotIndex(Arg)]; } StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) { @@ -254,43 +304,21 @@ StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) { return "UnknownOpcode"; } +StringRef Vocabulary::getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) { + unsigned Index = static_cast<unsigned>(CType); + assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID"); + return CanonicalTypeNames[Index]; +} + +Vocabulary::CanonicalTypeID +Vocabulary::getCanonicalTypeID(Type::TypeID TypeID) { + unsigned Index = static_cast<unsigned>(TypeID); + assert(Index < MaxTypeIDs && "Invalid TypeID"); + return TypeIDMapping[Index]; +} + StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) { - switch (TypeID) { - case Type::VoidTyID: - return "VoidTy"; - case 
Type::HalfTyID: - case Type::BFloatTyID: - case Type::FloatTyID: - case Type::DoubleTyID: - case Type::X86_FP80TyID: - case Type::FP128TyID: - case Type::PPC_FP128TyID: - return "FloatTy"; - case Type::IntegerTyID: - return "IntegerTy"; - case Type::FunctionTyID: - return "FunctionTy"; - case Type::StructTyID: - return "StructTy"; - case Type::ArrayTyID: - return "ArrayTy"; - case Type::PointerTyID: - case Type::TypedPointerTyID: - return "PointerTy"; - case Type::FixedVectorTyID: - case Type::ScalableVectorTyID: - return "VectorTy"; - case Type::LabelTyID: - return "LabelTy"; - case Type::TokenTyID: - return "TokenTy"; - case Type::MetadataTyID: - return "MetadataTy"; - case Type::X86_AMXTyID: - case Type::TargetExtTyID: - return "UnknownTy"; - } - return "UnknownTy"; + return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID)); } StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) { @@ -299,20 +327,6 @@ StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) { return OperandKindNames[Index]; } -Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) { - VocabVector DummyVocab; - float DummyVal = 0.1f; - // Create a dummy vocabulary with entries for all opcodes, types, and - // operand - for ([[maybe_unused]] unsigned _ : - seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxTypeIDs + - Vocabulary::MaxOperandKinds)) { - DummyVocab.push_back(Embedding(Dim, DummyVal)); - DummyVal += 0.1f; - } - return DummyVocab; -} - // Helper function to classify an operand into OperandKind Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { if (isa<Function>(Op)) @@ -324,34 +338,18 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { return OperandKind::VariableID; } -unsigned Vocabulary::getNumericID(unsigned Opcode) { - assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); - return Opcode - 1; // Convert to zero-based index -} - -unsigned 
Vocabulary::getNumericID(Type::TypeID TypeID) { - assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID"); - return MaxOpcodes + static_cast<unsigned>(TypeID); -} - -unsigned Vocabulary::getNumericID(const Value *Op) { - unsigned Index = static_cast<unsigned>(getOperandKind(Op)); - assert(Index < MaxOperandKinds && "Invalid OperandKind"); - return MaxOpcodes + MaxTypeIDs + Index; -} - StringRef Vocabulary::getStringKey(unsigned Pos) { - assert(Pos < Vocabulary::expectedSize() && - "Position out of bounds in vocabulary"); + assert(Pos < NumCanonicalEntries && "Position out of bounds in vocabulary"); // Opcode if (Pos < MaxOpcodes) return getVocabKeyForOpcode(Pos + 1); // Type - if (Pos < MaxOpcodes + MaxTypeIDs) - return getVocabKeyForTypeID(static_cast<Type::TypeID>(Pos - MaxOpcodes)); + if (Pos < MaxOpcodes + MaxCanonicalTypeIDs) + return getVocabKeyForCanonicalTypeID( + static_cast<CanonicalTypeID>(Pos - MaxOpcodes)); // Operand return getVocabKeyForOperandKind( - static_cast<OperandKind>(Pos - MaxOpcodes - MaxTypeIDs)); + static_cast<OperandKind>(Pos - MaxOpcodes - MaxCanonicalTypeIDs)); } // For now, assume vocabulary is stable unless explicitly invalidated. 
@@ -361,6 +359,21 @@ bool Vocabulary::invalidate(Module &M, const PreservedAnalyses &PA, return !(PAC.preservedWhenStateless()); } +Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) { + VocabVector DummyVocab; + DummyVocab.reserve(NumCanonicalEntries); + float DummyVal = 0.1f; + // Create a dummy vocabulary with entries for all opcodes, types, and + // operands + for ([[maybe_unused]] unsigned _ : + seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxCanonicalTypeIDs + + Vocabulary::MaxOperandKinds)) { + DummyVocab.push_back(Embedding(Dim, DummyVal)); + DummyVal += 0.1f; + } + return DummyVocab; +} + // ==----------------------------------------------------------------------===// // IR2VecVocabAnalysis //===----------------------------------------------------------------------===// @@ -452,7 +465,8 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { // Handle Opcodes std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes, - Embedding(Dim, 0)); + Embedding(Dim)); + NumericOpcodeEmbeddings.reserve(Vocabulary::MaxOpcodes); for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) { StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1); auto It = OpcVocab.find(VocabKey.str()); @@ -464,14 +478,15 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(), NumericOpcodeEmbeddings.end()); - // Handle Types - std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxTypeIDs, - Embedding(Dim, 0)); - for (unsigned TypeID : seq(0u, Vocabulary::MaxTypeIDs)) { - StringRef VocabKey = - Vocabulary::getVocabKeyForTypeID(static_cast<Type::TypeID>(TypeID)); + // Handle Types - only canonical types are present in vocabulary + std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxCanonicalTypeIDs, + Embedding(Dim)); + NumericTypeEmbeddings.reserve(Vocabulary::MaxCanonicalTypeIDs); + for (unsigned CTypeID : seq(0u, Vocabulary::MaxCanonicalTypeIDs)) { + StringRef VocabKey = 
Vocabulary::getVocabKeyForCanonicalTypeID( + static_cast<Vocabulary::CanonicalTypeID>(CTypeID)); if (auto It = TypeVocab.find(VocabKey.str()); It != TypeVocab.end()) { - NumericTypeEmbeddings[TypeID] = It->second; + NumericTypeEmbeddings[CTypeID] = It->second; continue; } handleMissingEntity(VocabKey.str()); @@ -481,7 +496,8 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { // Handle Arguments/Operands std::vector<Embedding> NumericArgEmbeddings(Vocabulary::MaxOperandKinds, - Embedding(Dim, 0)); + Embedding(Dim)); + NumericArgEmbeddings.reserve(Vocabulary::MaxOperandKinds); for (unsigned OpKind : seq(0u, Vocabulary::MaxOperandKinds)) { Vocabulary::OperandKind Kind = static_cast<Vocabulary::OperandKind>(OpKind); StringRef VocabKey = Vocabulary::getVocabKeyForOperandKind(Kind); @@ -552,8 +568,7 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M, assert(Vocabulary.isValid() && "IR2Vec Vocabulary is invalid"); for (Function &F : M) { - std::unique_ptr<Embedder> Emb = - Embedder::create(IR2VecKind::Symbolic, F, Vocabulary); + auto Emb = Embedder::create(IR2VecEmbeddingKind, F, Vocabulary); if (!Emb) { OS << "Error creating IR2Vec embeddings \n"; continue; diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 5907e2106533..ebe329aa1d5f 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -5242,6 +5242,19 @@ static Value *simplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs, } } + // Simplify umul_with_overflow where one operand is 1. 
+ Value *V; + if (Idxs.size() == 1 && + (match(Agg, + m_Intrinsic<Intrinsic::umul_with_overflow>(m_Value(V), m_One())) || + match(Agg, m_Intrinsic<Intrinsic::umul_with_overflow>(m_One(), + m_Value(V))))) { + if (Idxs[0] == 0) + return V; + assert(Idxs[0] == 1 && "invalid index"); + return getFalse(CmpInst::makeCmpResultType(V->getType())); + } + return nullptr; } diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index c7b0ca97a8e4..90bae77bcf70 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1493,6 +1493,24 @@ LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, // br %Condition, label %then, label %else APInt ConditionVal(1, isTrueDest ? 1 : 0); Result = constantFoldUser(Usr, Condition, ConditionVal, DL); + } else if (isa<TruncInst, ZExtInst, SExtInst>(Usr)) { + ValueLatticeElement OpLatticeVal = + *getValueFromCondition(Usr->getOperand(0), Condition, + isTrueDest, /*UseBlockValue*/ false); + + if (!OpLatticeVal.isConstantRange()) + return OpLatticeVal; + + const unsigned ResultBitWidth = + Usr->getType()->getScalarSizeInBits(); + if (auto *Trunc = dyn_cast<TruncInst>(Usr)) + return ValueLatticeElement::getRange( + OpLatticeVal.getConstantRange().truncate( + ResultBitWidth, Trunc->getNoWrapKind())); + + return ValueLatticeElement::getRange( + OpLatticeVal.getConstantRange().castOp( + cast<CastInst>(Usr)->getOpcode(), ResultBitWidth)); } else { // If one of Val's operand has an inferred value, we may be able to // infer the value of Val. 
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 1168005f48c0..32a4264c0343 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -374,13 +374,6 @@ void Lint::visitCallBase(CallBase &I) { visitMemoryReference(I, MemoryLocation::getForArgument(&I, 0, TLI), std::nullopt, nullptr, MemRef::Read | MemRef::Write); break; - case Intrinsic::get_active_lane_mask: - if (auto *TripCount = dyn_cast<ConstantInt>(I.getArgOperand(1))) - Check(!TripCount->isZero(), - "get_active_lane_mask: operand #2 " - "must be greater than 0", - &I); - break; } } diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 9a2c9ba63ec7..0c4e3a2e3b23 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" @@ -331,17 +332,10 @@ bool llvm::isDereferenceableAndAlignedInLoop( : SE.getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(MaxBECount)) return false; - - if (isa<SCEVCouldNotCompute>(BECount)) { - // TODO: Support symbolic max backedge taken counts for loops without - // computable backedge taken counts. - MaxBECount = - Predicates - ? 
SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates) - : SE.getConstantMaxBackedgeTakenCount(L); - } - const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess( - L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, &DT, AC); + std::optional<ScalarEvolution::LoopGuards> LoopGuards; + const auto &[AccessStart, AccessEnd] = + getStartAndEndForAccess(L, PtrScev, LI->getType(), BECount, MaxBECount, + &SE, nullptr, &DT, AC, LoopGuards); if (isa<SCEVCouldNotCompute>(AccessStart) || isa<SCEVCouldNotCompute>(AccessEnd)) return false; @@ -350,7 +344,13 @@ bool llvm::isDereferenceableAndAlignedInLoop( const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart); if (isa<SCEVCouldNotCompute>(PtrDiff)) return false; - APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff); + + if (!LoopGuards) + LoopGuards.emplace( + ScalarEvolution::LoopGuards::collect(AddRec->getLoop(), SE)); + + APInt MaxPtrDiff = + SE.getUnsignedRangeMax(SE.applyLoopGuards(PtrDiff, *LoopGuards)); Value *Base = nullptr; APInt AccessSize; @@ -381,7 +381,10 @@ bool llvm::isDereferenceableAndAlignedInLoop( if (Offset->getAPInt().urem(Alignment.value()) != 0) return false; - AccessSize = MaxPtrDiff + Offset->getAPInt(); + bool Overflow = false; + AccessSize = MaxPtrDiff.uadd_ov(Offset->getAPInt(), Overflow); + if (Overflow) + return false; AccessSizeSCEV = SE.getAddExpr(PtrDiff, Offset); Base = NewBase->getValue(); } else @@ -390,9 +393,11 @@ bool llvm::isDereferenceableAndAlignedInLoop( Instruction *HeaderFirstNonPHI = &*L->getHeader()->getFirstNonPHIIt(); return isDereferenceableAndAlignedPointerViaAssumption( Base, Alignment, - [&SE, AccessSizeSCEV](const RetainedKnowledge &RK) { - return SE.isKnownPredicate(CmpInst::ICMP_ULE, AccessSizeSCEV, - SE.getSCEV(RK.IRArgValue)); + [&SE, AccessSizeSCEV, &LoopGuards](const RetainedKnowledge &RK) { + return SE.isKnownPredicate( + CmpInst::ICMP_ULE, + SE.applyLoopGuards(AccessSizeSCEV, *LoopGuards), + SE.applyLoopGuards(SE.getSCEV(RK.IRArgValue), 
*LoopGuards)); }, DL, HeaderFirstNonPHI, AC, &DT) || isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL, @@ -855,17 +860,83 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To, return isPointerAlwaysReplaceable(From, To, DL); } -bool llvm::isDereferenceableReadOnlyLoop( +bool llvm::isReadOnlyLoop( Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, + SmallVectorImpl<LoadInst *> &NonDereferenceableAndAlignedLoads, SmallVectorImpl<const SCEVPredicate *> *Predicates) { for (BasicBlock *BB : L->blocks()) { for (Instruction &I : *BB) { if (auto *LI = dyn_cast<LoadInst>(&I)) { if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates)) - return false; - } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow()) + NonDereferenceableAndAlignedLoads.push_back(LI); + } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || + I.mayThrow()) { return false; + } } } return true; } + +LinearExpression llvm::decomposeLinearExpression(const DataLayout &DL, + Value *Ptr) { + assert(Ptr->getType()->isPointerTy() && "Must be called with pointer arg"); + + unsigned BitWidth = DL.getIndexTypeSizeInBits(Ptr->getType()); + LinearExpression Expr(Ptr, BitWidth); + + while (true) { + auto *GEP = dyn_cast<GEPOperator>(Expr.BasePtr); + if (!GEP || GEP->getSourceElementType()->isScalableTy()) + return Expr; + + Value *VarIndex = nullptr; + for (Value *Index : GEP->indices()) { + if (isa<ConstantInt>(Index)) + continue; + // Only allow a single variable index. We do not bother to handle the + // case of the same variable index appearing multiple times. + if (Expr.Index || VarIndex) + return Expr; + VarIndex = Index; + } + + // Don't return non-canonical indexes. + if (VarIndex && !VarIndex->getType()->isIntegerTy(BitWidth)) + return Expr; + + // We have verified that we can fully handle this GEP, so we can update Expr + // members past this point. 
+ Expr.BasePtr = GEP->getPointerOperand(); + Expr.Flags = Expr.Flags.intersectForOffsetAdd(GEP->getNoWrapFlags()); + for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); + GTI != GTE; ++GTI) { + Value *Index = GTI.getOperand(); + if (auto *ConstOffset = dyn_cast<ConstantInt>(Index)) { + if (ConstOffset->isZero()) + continue; + if (StructType *STy = GTI.getStructTypeOrNull()) { + unsigned ElementIdx = ConstOffset->getZExtValue(); + const StructLayout *SL = DL.getStructLayout(STy); + Expr.Offset += SL->getElementOffset(ElementIdx); + continue; + } + // Truncate if type size exceeds index space. + APInt IndexedSize(BitWidth, GTI.getSequentialElementStride(DL), + /*isSigned=*/false, + /*implcitTrunc=*/true); + Expr.Offset += ConstOffset->getValue() * IndexedSize; + continue; + } + + // FIXME: Also look through a mul/shl in the index. + assert(Expr.Index == nullptr && "Shouldn't have index yet"); + Expr.Index = Index; + // Truncate if type size exceeds index space. + Expr.Scale = APInt(BitWidth, GTI.getSequentialElementStride(DL), + /*isSigned=*/false, /*implicitTrunc=*/true); + } + } + + return Expr; +} diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index bceddd032527..87fae92977cd 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -210,11 +210,11 @@ static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B, /// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at /// \p MaxBTC is guaranteed inbounds of the accessed object. 
-static bool -evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, - const SCEV *MaxBTC, const SCEV *EltSize, - ScalarEvolution &SE, const DataLayout &DL, - DominatorTree *DT, AssumptionCache *AC) { +static bool evaluatePtrAddRecAtMaxBTCWillNotWrap( + const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize, + ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT, + AssumptionCache *AC, + std::optional<ScalarEvolution::LoopGuards> &LoopGuards) { auto *PointerBase = SE.getPointerBase(AR->getStart()); auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase); if (!StartPtr) @@ -238,8 +238,8 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, StartPtrV, {Attribute::Dereferenceable}, *AC, L->getLoopPredecessor()->getTerminator(), DT); if (DerefRK) { - DerefBytesSCEV = SE.getUMaxExpr( - DerefBytesSCEV, SE.getConstant(WiderTy, DerefRK.ArgValue)); + DerefBytesSCEV = + SE.getUMaxExpr(DerefBytesSCEV, SE.getSCEV(DerefRK.IRArgValue)); } } @@ -259,10 +259,25 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, const SCEV *StartOffset = SE.getNoopOrZeroExtend( SE.getMinusSCEV(AR->getStart(), StartPtr), WiderTy); + if (!LoopGuards) + LoopGuards.emplace(ScalarEvolution::LoopGuards::collect(AR->getLoop(), SE)); + MaxBTC = SE.applyLoopGuards(MaxBTC, *LoopGuards); + const SCEV *OffsetAtLastIter = mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE); - if (!OffsetAtLastIter) - return false; + if (!OffsetAtLastIter) { + // Re-try with constant max backedge-taken count if using the symbolic one + // failed. 
+ MaxBTC = SE.getConstantMaxBackedgeTakenCount(AR->getLoop()); + if (isa<SCEVCouldNotCompute>(MaxBTC)) + return false; + MaxBTC = SE.getNoopOrZeroExtend( + MaxBTC, WiderTy); + OffsetAtLastIter = + mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE); + if (!OffsetAtLastIter) + return false; + } const SCEV *OffsetEndBytes = addSCEVNoOverflow( OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE); @@ -276,6 +291,8 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE); if (!EndBytes) return false; + + DerefBytesSCEV = SE.applyLoopGuards(DerefBytesSCEV, *LoopGuards); return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV); } @@ -292,7 +309,8 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess( const SCEV *MaxBTC, ScalarEvolution *SE, DenseMap<std::pair<const SCEV *, Type *>, std::pair<const SCEV *, const SCEV *>> *PointerBounds, - DominatorTree *DT, AssumptionCache *AC) { + DominatorTree *DT, AssumptionCache *AC, + std::optional<ScalarEvolution::LoopGuards> &LoopGuards) { std::pair<const SCEV *, const SCEV *> *PtrBoundsPair; if (PointerBounds) { auto [Iter, Ins] = PointerBounds->insert( @@ -328,7 +346,7 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess( // separately checks that accesses cannot not wrap, so unsigned max // represents an upper bound. 
if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL, - DT, AC)) { + DT, AC, LoopGuards)) { ScEnd = AR->evaluateAtIteration(MaxBTC, *SE); } else { ScEnd = SE->getAddExpr( @@ -377,7 +395,7 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, const SCEV *BTC = PSE.getBackedgeTakenCount(); const auto &[ScStart, ScEnd] = getStartAndEndForAccess( Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(), - &DC.getPointerBounds(), DC.getDT(), DC.getAC()); + &DC.getPointerBounds(), DC.getDT(), DC.getAC(), LoopGuards); assert(!isa<SCEVCouldNotCompute>(ScStart) && !isa<SCEVCouldNotCompute>(ScEnd) && "must be able to compute both start and end expressions"); @@ -1213,48 +1231,40 @@ static void findForkedSCEVs( } } -static SmallVector<PointerIntPair<const SCEV *, 1, bool>> -findForkedPointer(PredicatedScalarEvolution &PSE, - const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr, - const Loop *L) { - ScalarEvolution *SE = PSE.getSE(); - assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!"); - SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs; - findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth); - - // For now, we will only accept a forked pointer with two possible SCEVs - // that are either SCEVAddRecExprs or loop invariant. 
- if (Scevs.size() == 2 && - (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) || - SE->isLoopInvariant(get<0>(Scevs[0]), L)) && - (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) || - SE->isLoopInvariant(get<0>(Scevs[1]), L))) { - LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n"); - LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n"); - LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n"); - return Scevs; - } - - return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}}; -} - bool AccessAnalysis::createCheckForAccess( RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy, const DenseMap<Value *, const SCEV *> &StridesMap, DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop, unsigned &RunningDepId, unsigned ASId, bool Assume) { Value *Ptr = Access.getPointer(); + ScalarEvolution *SE = PSE.getSE(); + assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!"); - SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs = - findForkedPointer(PSE, StridesMap, Ptr, TheLoop); - assert(!TranslatedPtrs.empty() && "must have some translated pointers"); + SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs; + findForkedSCEVs(SE, TheLoop, Ptr, RTCheckPtrs, MaxForkedSCEVDepth); + assert(!RTCheckPtrs.empty() && + "Must have some runtime-check pointer candidates"); + + // RTCheckPtrs must have size 2 if there are forked pointers. Otherwise, there + // are no forked pointers; replaceSymbolicStridesSCEV in this case. 
+ auto IsLoopInvariantOrAR = + [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) { + return SE->isLoopInvariant(P.getPointer(), TheLoop) || + isa<SCEVAddRecExpr>(P.getPointer()); + }; + if (RTCheckPtrs.size() == 2 && all_of(RTCheckPtrs, IsLoopInvariantOrAR)) { + LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n"; + for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs() + << "\t(" << Idx << ") " << *Q.getPointer() << "\n"); + } else { + RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}}; + } /// Check whether all pointers can participate in a runtime bounds check. They - /// must either be invariant or AddRecs. If ShouldCheckWrap is true, they also - /// must not wrap. - for (auto &P : TranslatedPtrs) { + /// must either be invariant or non-wrapping affine AddRecs. + for (auto &P : RTCheckPtrs) { // The bounds for loop-invariant pointer is trivial. - if (PSE.getSE()->isLoopInvariant(P.getPointer(), TheLoop)) + if (SE->isLoopInvariant(P.getPointer(), TheLoop)) continue; const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(P.getPointer()); @@ -1265,21 +1275,18 @@ bool AccessAnalysis::createCheckForAccess( // If there's only one option for Ptr, look it up after bounds and wrap // checking, because assumptions might have been added to PSE. - if (TranslatedPtrs.size() == 1) { + if (RTCheckPtrs.size() == 1) { AR = cast<SCEVAddRecExpr>(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr)); P.setPointer(AR); } - // When we run after a failing dependency check we have to make sure - // we don't have wrapping pointers. - if (!isNoWrap(PSE, AR, TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) { + if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, + TheLoop, Assume)) return false; - } } - for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) { + for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) { // The id of the dependence set. 
unsigned DepId; @@ -1983,13 +1990,13 @@ bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src, ScalarEvolution &SE = *PSE.getSE(); const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC, - &SE, &PointerBounds, DT, AC); + &SE, &PointerBounds, DT, AC, LoopGuards); if (isa<SCEVCouldNotCompute>(SrcStart_) || isa<SCEVCouldNotCompute>(SrcEnd_)) return false; const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC, - &SE, &PointerBounds, DT, AC); + &SE, &PointerBounds, DT, AC, LoopGuards); if (isa<SCEVCouldNotCompute>(SinkStart_) || isa<SCEVCouldNotCompute>(SinkEnd_)) return false; @@ -3036,8 +3043,9 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2; DepChecker = std::make_unique<MemoryDepChecker>( - *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits); - PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE); + *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits, LoopGuards); + PtrRtChecking = + std::make_unique<RuntimePointerChecking>(*DepChecker, SE, LoopGuards); if (canAnalyzeLoop()) CanVecMem = analyzeLoop(AA, LI, TLI, DT); } diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index 8853a13972be..f90717d3085e 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ReleaseModeModelRunner.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/TensorSpec.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" @@ -74,21 +75,22 @@ llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM, if (!llvm::isEmbeddedModelEvaluatorValid<CompiledModelType>() && InteractiveChannelBaseName.empty()) return 
nullptr; - std::unique_ptr<MLModelRunner> AOTRunner; - if (InteractiveChannelBaseName.empty()) - AOTRunner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>( - M.getContext(), FeatureMap, DecisionName, - EmbeddedModelRunnerOptions().setModelSelector(ModelSelector)); - else { - auto Features = FeatureMap; - if (InteractiveIncludeDefault) - Features.push_back(DefaultDecisionSpec); - AOTRunner = std::make_unique<InteractiveModelRunner>( - M.getContext(), Features, InlineDecisionSpec, - InteractiveChannelBaseName + ".out", - InteractiveChannelBaseName + ".in"); - } - return std::make_unique<MLInlineAdvisor>(M, MAM, std::move(AOTRunner), + auto RunnerFactory = [&](const std::vector<TensorSpec> &InputFeatures) + -> std::unique_ptr<MLModelRunner> { + std::unique_ptr<MLModelRunner> AOTRunner; + if (InteractiveChannelBaseName.empty()) + AOTRunner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>( + M.getContext(), InputFeatures, DecisionName, + EmbeddedModelRunnerOptions().setModelSelector(ModelSelector)); + else { + AOTRunner = std::make_unique<InteractiveModelRunner>( + M.getContext(), InputFeatures, InlineDecisionSpec, + InteractiveChannelBaseName + ".out", + InteractiveChannelBaseName + ".in"); + } + return AOTRunner; + }; + return std::make_unique<MLInlineAdvisor>(M, MAM, RunnerFactory, GetDefaultAdvice); } @@ -106,8 +108,9 @@ static cl::opt<bool> KeepFPICache( "For test - keep the ML Inline advisor's FunctionPropertiesInfo cache"), cl::init(false)); -// clang-format off -std::vector<TensorSpec> llvm::FeatureMap{ +const std::vector<TensorSpec> &MLInlineAdvisor::getInitialFeatureMap() { + // clang-format off +static std::vector<TensorSpec> FeatureMap{ #define POPULATE_NAMES(DTYPE, SHAPE, NAME, __) TensorSpec::createSpec<DTYPE>(#NAME, SHAPE), // InlineCost features - these must come first INLINE_COST_FEATURE_ITERATOR(POPULATE_NAMES) @@ -116,7 +119,9 @@ std::vector<TensorSpec> llvm::FeatureMap{ INLINE_FEATURE_ITERATOR(POPULATE_NAMES) #undef 
POPULATE_NAMES }; -// clang-format on + // clang-format on + return FeatureMap; +} const char *const llvm::DecisionName = "inlining_decision"; const TensorSpec llvm::InlineDecisionSpec = @@ -138,17 +143,17 @@ CallBase *getInlinableCS(Instruction &I) { MLInlineAdvisor::MLInlineAdvisor( Module &M, ModuleAnalysisManager &MAM, - std::unique_ptr<MLModelRunner> Runner, + std::function< + std::unique_ptr<MLModelRunner>(const std::vector<TensorSpec> &)> + GetModelRunner, std::function<bool(CallBase &)> GetDefaultAdvice) : InlineAdvisor( M, MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()), - ModelRunner(std::move(Runner)), GetDefaultAdvice(GetDefaultAdvice), + GetDefaultAdvice(GetDefaultAdvice), FeatureMap(getInitialFeatureMap()), CG(MAM.getResult<LazyCallGraphAnalysis>(M)), UseIR2Vec(MAM.getCachedResult<IR2VecVocabAnalysis>(M) != nullptr), InitialIRSize(getModuleIRSize()), CurrentIRSize(InitialIRSize), PSI(MAM.getResult<ProfileSummaryAnalysis>(M)) { - assert(ModelRunner); - ModelRunner->switchContext(""); // Extract the 'call site height' feature - the position of a call site // relative to the farthest statically reachable SCC node. We don't mutate // this value while inlining happens. 
Empirically, this feature proved @@ -188,7 +193,7 @@ MLInlineAdvisor::MLInlineAdvisor( } NodeCount = AllNodes.size(); - if (auto IR2VecVocabResult = MAM.getCachedResult<IR2VecVocabAnalysis>(M)) { + if (auto *IR2VecVocabResult = MAM.getCachedResult<IR2VecVocabAnalysis>(M)) { if (!IR2VecVocabResult->isValid()) { M.getContext().emitError("IR2VecVocabAnalysis is not valid"); return; @@ -200,6 +205,15 @@ MLInlineAdvisor::MLInlineAdvisor( FeatureMap.push_back( TensorSpec::createSpec<float>("caller_embedding", {IR2VecDim})); } + if (InteractiveIncludeDefault) + FeatureMap.push_back(DefaultDecisionSpec); + + ModelRunner = GetModelRunner(getFeatureMap()); + if (!ModelRunner) { + M.getContext().emitError("Could not create model runner"); + return; + } + ModelRunner->switchContext(""); } unsigned MLInlineAdvisor::getInitialFunctionLevel(const Function &F) const { @@ -471,7 +485,8 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) { } // This one would have been set up to be right at the end. 
if (!InteractiveChannelBaseName.empty() && InteractiveIncludeDefault) - *ModelRunner->getTensor<int64_t>(FeatureMap.size()) = GetDefaultAdvice(CB); + *ModelRunner->getTensor<int64_t>(getFeatureMap().size() - 1) = + GetDefaultAdvice(CB); return getAdviceFromModel(CB, ORE); } @@ -549,8 +564,8 @@ void MLInlineAdvice::reportContextForRemark( DiagnosticInfoOptimizationBase &OR) { using namespace ore; OR << NV("Callee", Callee->getName()); - for (size_t I = 0; I < FeatureMap.size(); ++I) - OR << NV(FeatureMap[I].name(), + for (size_t I = 0; I < getAdvisor()->getFeatureMap().size(); ++I) + OR << NV(getAdvisor()->getFeatureMap()[I].name(), *getAdvisor()->getModelRunner().getTensor<int64_t>(I)); OR << NV("ShouldInline", isInliningRecommended()); } diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index e0b7f65d18a3..1df4eda2580d 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -589,6 +589,59 @@ bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL, return true; } +std::optional<TypeSize> llvm::getBaseObjectSize(const Value *Ptr, + const DataLayout &DL, + const TargetLibraryInfo *TLI, + ObjectSizeOpts Opts) { + assert(Opts.EvalMode == ObjectSizeOpts::Mode::ExactSizeFromOffset && + "Other modes are currently not supported"); + + auto Align = [&](TypeSize Size, MaybeAlign Alignment) { + if (Opts.RoundToAlign && Alignment && !Size.isScalable()) + return TypeSize::getFixed(alignTo(Size.getFixedValue(), *Alignment)); + return Size; + }; + + if (isa<UndefValue>(Ptr)) + return TypeSize::getZero(); + + if (isa<ConstantPointerNull>(Ptr)) { + if (Opts.NullIsUnknownSize || Ptr->getType()->getPointerAddressSpace()) + return std::nullopt; + return TypeSize::getZero(); + } + + if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) { + if (!GV->getValueType()->isSized() || GV->hasExternalWeakLinkage() || + !GV->hasInitializer() || GV->isInterposable()) + return std::nullopt; + return 
Align(DL.getTypeAllocSize(GV->getValueType()), GV->getAlign()); + } + + if (auto *A = dyn_cast<Argument>(Ptr)) { + Type *MemoryTy = A->getPointeeInMemoryValueType(); + if (!MemoryTy || !MemoryTy->isSized()) + return std::nullopt; + return Align(DL.getTypeAllocSize(MemoryTy), A->getParamAlign()); + } + + if (auto *AI = dyn_cast<AllocaInst>(Ptr)) { + if (std::optional<TypeSize> Size = AI->getAllocationSize(DL)) + return Align(*Size, AI->getAlign()); + return std::nullopt; + } + + if (auto *CB = dyn_cast<CallBase>(Ptr)) { + if (std::optional<APInt> Size = getAllocSize(CB, TLI)) { + if (std::optional<uint64_t> ZExtSize = Size->tryZExtValue()) + return TypeSize::getFixed(*ZExtSize); + } + return std::nullopt; + } + + return std::nullopt; +} + Value *llvm::lowerObjectSizeCall(IntrinsicInst *ObjectSize, const DataLayout &DL, const TargetLibraryInfo *TLI, diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 72b643c56a99..dcc51178b975 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -12,6 +12,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include <optional> using namespace llvm; @@ -150,6 +151,33 @@ MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) { return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata()); } +// If the mask for a memory op is a get active lane mask intrinsic +// we can possibly infer the size of memory written or read +static std::optional<FixedVectorType *> +getKnownTypeFromMaskedOp(Value *Mask, VectorType *Ty) { + using namespace llvm::PatternMatch; + ConstantInt *Op0, *Op1; + if (!match(Mask, m_Intrinsic<Intrinsic::get_active_lane_mask>( + m_ConstantInt(Op0), m_ConstantInt(Op1)))) + return std::nullopt; + + APInt LaneMaskLo = Op0->getValue(); + APInt LaneMaskHi = Op1->getValue(); + if 
(LaneMaskHi.ule(LaneMaskLo)) + return std::nullopt; + + APInt NumElts = LaneMaskHi - LaneMaskLo; + if (NumElts.ugt(Ty->getElementCount().getKnownMinValue())) { + if (isa<ScalableVectorType>(Ty)) + return std::nullopt; + // Unlike scalable vectors, fixed vector types are guaranteed to handle the + // KnownMinValue and can be clamped + NumElts = Ty->getElementCount().getKnownMinValue(); + } + + return FixedVectorType::get(Ty->getElementType(), NumElts.getZExtValue()); +} + MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, unsigned ArgIdx, const TargetLibraryInfo *TLI) { @@ -213,20 +241,26 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, cast<ConstantInt>(II->getArgOperand(0))->getZExtValue()), AATags); - case Intrinsic::masked_load: + case Intrinsic::masked_load: { assert(ArgIdx == 0 && "Invalid argument index"); - return MemoryLocation( - Arg, - LocationSize::upperBound(DL.getTypeStoreSize(II->getType())), - AATags); - case Intrinsic::masked_store: + auto *Ty = cast<VectorType>(II->getType()); + if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty)) + return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags); + + return MemoryLocation( + Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags); + } + case Intrinsic::masked_store: { assert(ArgIdx == 1 && "Invalid argument index"); + + auto *Ty = cast<VectorType>(II->getArgOperand(0)->getType()); + if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty)) + return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags); + return MemoryLocation( - Arg, - LocationSize::upperBound( - DL.getTypeStoreSize(II->getArgOperand(0)->getType())), - AATags); + Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags); + } case Intrinsic::invariant_end: // The first argument to an invariant.end is a "descriptor" type (e.g. 
a diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index b3c8a7d4563b..b5ca6b13108f 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -121,6 +121,24 @@ bool llvm::memprof::hasSingleAllocType(uint8_t AllocTypes) { return NumAllocTypes == 1; } +void llvm::memprof::removeAnyExistingAmbiguousAttribute(CallBase *CB) { + if (!CB->hasFnAttr("memprof")) + return; + assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous"); + CB->removeFnAttr("memprof"); +} + +void llvm::memprof::addAmbiguousAttribute(CallBase *CB) { + // We may have an existing ambiguous attribute if we are reanalyzing + // after inlining. + if (CB->hasFnAttr("memprof")) { + assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous"); + } else { + auto A = llvm::Attribute::get(CB->getContext(), "memprof", "ambiguous"); + CB->addFnAttr(A); + } +} + void CallStackTrie::addCallStack( AllocationType AllocType, ArrayRef<uint64_t> StackIds, std::vector<ContextTotalSize> ContextSizeInfo) { @@ -466,6 +484,9 @@ void CallStackTrie::addSingleAllocTypeAttribute(CallBase *CI, AllocationType AT, StringRef Descriptor) { auto AllocTypeString = getAllocTypeAttributeString(AT); auto A = llvm::Attribute::get(CI->getContext(), "memprof", AllocTypeString); + // After inlining we may be able to convert an existing ambiguous allocation + // to an unambiguous one. 
+ removeAnyExistingAmbiguousAttribute(CI); CI->addFnAttr(A); if (MemProfReportHintedSizes) { std::vector<ContextTotalSize> ContextSizeInfo; @@ -525,6 +546,7 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) { assert(MIBCallStack.size() == 1 && "Should only be left with Alloc's location in stack"); CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes)); + addAmbiguousAttribute(CI); return true; } // If there exists corner case that CallStackTrie has one chain to leaf diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index ecfecb03c375..bb3e679219ae 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -411,17 +411,11 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { FixupList.push_back(MD); } - // Remember the index where we stopped inserting new phis above, since the - // fixupDefs call in the loop below may insert more, that are already minimal. + // Update defining access of following defs. unsigned NewPhiIndexEnd = InsertedPHIs.size(); - - while (!FixupList.empty()) { - unsigned StartingPHISize = InsertedPHIs.size(); - fixupDefs(FixupList); - FixupList.clear(); - // Put any new phis on the fixup list, and process them - FixupList.append(InsertedPHIs.begin() + StartingPHISize, InsertedPHIs.end()); - } + fixupDefs(FixupList); + assert(NewPhiIndexEnd == InsertedPHIs.size() && + "Should not insert new phis during fixupDefs()"); // Optimize potentially non-minimal phis added in this method. unsigned NewPhiSize = NewPhiIndexEnd - NewPhiIndex; @@ -504,11 +498,8 @@ void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<WeakVH> &Vars) { assert(MSSA->dominates(NewDef, FirstDef) && "Should have dominated the new access"); - // This may insert new phi nodes, because we are not guaranteed the - // block we are processing has a single pred, and depending where the - // store was inserted, it may require phi nodes below it. 
- cast<MemoryDef>(FirstDef)->setDefiningAccess(getPreviousDef(FirstDef)); - return; + cast<MemoryDef>(FirstDef)->setDefiningAccess(NewDef); + continue; } // We didn't find a def, so we must continue. for (const auto *S : successors(FixupBlock)) { diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index f60a1e9f2270..51caffc41002 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -500,10 +500,11 @@ const SCEV *ScalarEvolution::getVScale(Type *Ty) { return S; } -const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC) { +const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC, + SCEV::NoWrapFlags Flags) { const SCEV *Res = getConstant(Ty, EC.getKnownMinValue()); if (EC.isScalable()) - Res = getMulExpr(Res, getVScale(Ty)); + Res = getMulExpr(Res, getVScale(Ty), Flags); return Res; } @@ -3199,6 +3200,37 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, AddRec->getNoWrapFlags(FlagsMask)); } } + + // Try to push the constant operand into a ZExt: C * zext (A + B) -> + // zext (C*A + C*B) if trunc (C) * (A + B) does not unsigned-wrap. + const SCEVAddExpr *InnerAdd; + if (match(Ops[1], m_scev_ZExt(m_scev_Add(InnerAdd)))) { + const SCEV *NarrowC = getTruncateExpr(LHSC, InnerAdd->getType()); + if (isa<SCEVConstant>(InnerAdd->getOperand(0)) && + getZeroExtendExpr(NarrowC, Ops[1]->getType()) == LHSC && + hasFlags(StrengthenNoWrapFlags(this, scMulExpr, {NarrowC, InnerAdd}, + SCEV::FlagAnyWrap), + SCEV::FlagNUW)) { + auto *Res = getMulExpr(NarrowC, InnerAdd, SCEV::FlagNUW, Depth + 1); + return getZeroExtendExpr(Res, Ops[1]->getType(), Depth + 1); + }; + } + + // Try to fold (C1 * D /u C2) -> C1/C2 * D, if C1 and C2 are powers-of-2, + // D is a multiple of C2, and C1 is a multiple of C2. + const SCEV *D; + APInt C1V = LHSC->getAPInt(); + // (C1 * D /u C2) == -1 * -C1 * D /u C2 when C1 != INT_MIN. 
+ if (C1V.isNegative() && !C1V.isMinSignedValue()) + C1V = C1V.abs(); + const SCEVConstant *C2; + if (C1V.isPowerOf2() && + match(Ops[1], m_scev_UDiv(m_SCEV(D), m_SCEVConstant(C2))) && + C2->getAPInt().isPowerOf2() && C1V.uge(C2->getAPInt()) && + C1V.logBase2() <= getMinTrailingZeros(D)) { + const SCEV *NewMul = getMulExpr(getUDivExpr(getConstant(C1V), C2), D); + return C1V == LHSC->getAPInt() ? NewMul : getNegativeSCEV(NewMul); + } } } @@ -15985,6 +16017,16 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { } const SCEV *visitAddExpr(const SCEVAddExpr *Expr) { + // Trip count expressions sometimes consist of adding 3 operands, i.e. + // (Const + A + B). There may be guard info for A + B, and if so, apply + // it. + // TODO: Could more generally apply guards to Add sub-expressions. + if (isa<SCEVConstant>(Expr->getOperand(0)) && + Expr->getNumOperands() == 3) { + if (const SCEV *S = Map.lookup( + SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2)))) + return SE.getAddExpr(Expr->getOperand(0), S); + } SmallVector<const SCEV *, 2> Operands; bool Changed = false; for (const auto *Op : Expr->operands()) { diff --git a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp index d03930d9e2d9..bce41f9f5329 100644 --- a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp +++ b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp @@ -15,10 +15,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/Casting.h" #include <cassert> #include <cstdint> +#define DEBUG_TYPE "scev-division" + namespace llvm { class Type; } // namespace llvm @@ -257,3 +261,31 @@ void SCEVDivision::cannotDivide(const SCEV *Numerator) { Quotient = Zero; Remainder = Numerator; } + +void SCEVDivisionPrinterPass::runImpl(Function &F, ScalarEvolution &SE) { + OS << "Printing analysis 'Scalar Evolution 
Division' for function '" + << F.getName() << "':\n"; + for (Instruction &Inst : instructions(F)) { + BinaryOperator *Div = dyn_cast<BinaryOperator>(&Inst); + if (!Div || Div->getOpcode() != Instruction::SDiv) + continue; + + const SCEV *Numerator = SE.getSCEV(Div->getOperand(0)); + const SCEV *Denominator = SE.getSCEV(Div->getOperand(1)); + const SCEV *Quotient, *Remainder; + SCEVDivision::divide(SE, Numerator, Denominator, &Quotient, &Remainder); + + OS << "Instruction: " << *Div << "\n"; + OS.indent(2) << "Numerator: " << *Numerator << "\n"; + OS.indent(2) << "Denominator: " << *Denominator << "\n"; + OS.indent(2) << "Quotient: " << *Quotient << "\n"; + OS.indent(2) << "Remainder: " << *Remainder << "\n"; + } +} + +PreservedAnalyses SCEVDivisionPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + runImpl(F, SE); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 4ac8f03e6dbf..899806bf3734 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1283,9 +1283,10 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost( } InstructionCost TargetTransformInfo::getMulAccReductionCost( - bool IsUnsigned, Type *ResTy, VectorType *Ty, + bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const { - return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, CostKind); + return TTIImpl->getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty, + CostKind); } InstructionCost @@ -1402,8 +1403,9 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF, return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } -bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const { - return TTIImpl->preferFixedOverScalableIfEqualCost(); +bool 
TargetTransformInfo::preferFixedOverScalableIfEqualCost( + bool IsEpilogue) const { + return TTIImpl->preferFixedOverScalableIfEqualCost(IsEpilogue); } bool TargetTransformInfo::preferInLoopReduction(RecurKind Kind, diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 7fe129b8456f..129823e0e98a 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -413,6 +413,18 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, isGuaranteedNotToBeUndef(Op0, Q.AC, Q.CxtI, Q.DT, Depth + 1); Known = KnownBits::mul(Known, Known2, SelfMultiply); + if (SelfMultiply) { + unsigned SignBits = ComputeNumSignBits(Op0, DemandedElts, Q, Depth + 1); + unsigned TyBits = Op0->getType()->getScalarSizeInBits(); + unsigned OutValidBits = 2 * (TyBits - SignBits + 1); + + if (OutValidBits < TyBits) { + APInt KnownZeroMask = + APInt::getHighBitsSet(TyBits, TyBits - OutValidBits + 1); + Known.Zero |= KnownZeroMask; + } + } + // Only make use of no-wrap flags if we failed to compute the sign bit // directly. This matters if the multiplication always overflows, in // which case we prefer to follow the result of the direct computation, @@ -727,17 +739,16 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred, // For those bits in C that are known, we can propagate them to known // bits in V shifted to the right by ShAmt. KnownBits RHSKnown = KnownBits::makeConstant(*C); - RHSKnown.Zero.lshrInPlace(ShAmt); - RHSKnown.One.lshrInPlace(ShAmt); + RHSKnown >>= ShAmt; Known = Known.unionWith(RHSKnown); // assume(V >> ShAmt = C) } else if (match(LHS, m_Shr(m_V, m_ConstantInt(ShAmt))) && ShAmt < BitWidth) { - KnownBits RHSKnown = KnownBits::makeConstant(*C); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the right by C. 
- Known.Zero |= RHSKnown.Zero << ShAmt; - Known.One |= RHSKnown.One << ShAmt; + KnownBits RHSKnown = KnownBits::makeConstant(*C); + RHSKnown <<= ShAmt; + Known = Known.unionWith(RHSKnown); } break; case ICmpInst::ICMP_NE: { @@ -1829,18 +1840,16 @@ static void computeKnownBitsFromOperator(const Operator *I, case Intrinsic::abs: { computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1); bool IntMinIsPoison = match(II->getArgOperand(1), m_One()); - Known = Known2.abs(IntMinIsPoison); + Known = Known.unionWith(Known2.abs(IntMinIsPoison)); break; } case Intrinsic::bitreverse: computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1); - Known.Zero |= Known2.Zero.reverseBits(); - Known.One |= Known2.One.reverseBits(); + Known = Known.unionWith(Known2.reverseBits()); break; case Intrinsic::bswap: computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1); - Known.Zero |= Known2.Zero.byteSwap(); - Known.One |= Known2.One.byteSwap(); + Known = Known.unionWith(Known2.byteSwap()); break; case Intrinsic::ctlz: { computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1); @@ -1890,10 +1899,9 @@ static void computeKnownBitsFromOperator(const Operator *I, computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1); computeKnownBits(I->getOperand(1), DemandedElts, Known3, Q, Depth + 1); - Known.Zero = - Known2.Zero.shl(ShiftAmt) | Known3.Zero.lshr(BitWidth - ShiftAmt); - Known.One = - Known2.One.shl(ShiftAmt) | Known3.One.lshr(BitWidth - ShiftAmt); + Known2 <<= ShiftAmt; + Known3 >>= BitWidth - ShiftAmt; + Known = Known2.unionWith(Known3); break; } case Intrinsic::uadd_sat: diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 425ea311d653..091d94843698 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -166,6 +166,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, case Intrinsic::is_fpclass: case Intrinsic::vp_is_fpclass: case 
Intrinsic::powi: + case Intrinsic::vector_extract: return (ScalarOpdIdx == 1); case Intrinsic::smul_fix: case Intrinsic::smul_fix_sat: @@ -200,6 +201,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::vp_llrint: case Intrinsic::ucmp: case Intrinsic::scmp: + case Intrinsic::vector_extract: return OpdIdx == -1 || OpdIdx == 0; case Intrinsic::modf: case Intrinsic::sincos: diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index 33eed07c4629..8737dc0fc745 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -77,9 +77,10 @@ bool MetadataVerifier::verifyScalarEntry( msgpack::MapDocNode &MapNode, StringRef Key, bool Required, msgpack::Type SKind, function_ref<bool(msgpack::DocNode &)> verifyValue) { - return verifyEntry(MapNode, Key, Required, [=](msgpack::DocNode &Node) { - return verifyScalar(Node, SKind, verifyValue); - }); + return verifyEntry(MapNode, Key, Required, + [this, SKind, verifyValue](msgpack::DocNode &Node) { + return verifyScalar(Node, SKind, verifyValue); + }); } bool MetadataVerifier::verifyIntegerEntry(msgpack::MapDocNode &MapNode, diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 738e47b8b16c..a5cedadd3098 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -43,6 +43,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TimeProfiler.h" #include <algorithm> #include <cassert> @@ -1052,6 +1053,7 @@ void MetadataLoader::MetadataLoaderImpl::callMDTypeCallback(Metadata **Val, /// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing /// module level metadata. 
Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { + llvm::TimeTraceScope timeScope("Parse metadata"); if (!ModuleLevel && MetadataList.hasFwdRefs()) return error("Invalid metadata: fwd refs into function blocks"); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index a3f825408d0c..a1d5b36bde64 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1495,14 +1495,11 @@ void ModuleBitcodeWriter::writeModuleInfo() { // compute the maximum alignment value. std::map<std::string, unsigned> SectionMap; std::map<std::string, unsigned> GCMap; - MaybeAlign MaxAlignment; + MaybeAlign MaxGVarAlignment; unsigned MaxGlobalType = 0; - const auto UpdateMaxAlignment = [&MaxAlignment](const MaybeAlign A) { - if (A) - MaxAlignment = !MaxAlignment ? *A : std::max(*MaxAlignment, *A); - }; for (const GlobalVariable &GV : M.globals()) { - UpdateMaxAlignment(GV.getAlign()); + if (MaybeAlign A = GV.getAlign()) + MaxGVarAlignment = !MaxGVarAlignment ? *A : std::max(*MaxGVarAlignment, *A); MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV.getValueType())); if (GV.hasSection()) { // Give section names unique ID's. @@ -1515,7 +1512,6 @@ void ModuleBitcodeWriter::writeModuleInfo() { } } for (const Function &F : M) { - UpdateMaxAlignment(F.getAlign()); if (F.hasSection()) { // Give section names unique ID's. unsigned &Entry = SectionMap[std::string(F.getSection())]; @@ -1551,10 +1547,10 @@ void ModuleBitcodeWriter::writeModuleInfo() { //| constant Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Initializer. Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5)); // Linkage. - if (!MaxAlignment) // Alignment. + if (!MaxGVarAlignment) // Alignment. 
Abbv->Add(BitCodeAbbrevOp(0)); else { - unsigned MaxEncAlignment = getEncodedAlign(MaxAlignment); + unsigned MaxEncAlignment = getEncodedAlign(MaxGVarAlignment); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(MaxEncAlignment+1))); } diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 23a3543e9ebe..cd14a4f57f76 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1432,7 +1432,7 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { MCSection *BBAddrMapSection = getObjFileLowering().getBBAddrMapSection(*MF.getSection()); assert(BBAddrMapSection && ".llvm_bb_addr_map section is not initialized."); - bool HasCalls = !CurrentFnCallsiteSymbols.empty(); + bool HasCalls = !CurrentFnCallsiteEndSymbols.empty(); const MCSymbol *FunctionSymbol = getFunctionBegin(); @@ -1497,13 +1497,13 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { emitLabelDifferenceAsULEB128(MBBSymbol, PrevMBBEndSymbol); const MCSymbol *CurrentLabel = MBBSymbol; if (HasCalls) { - auto CallsiteSymbols = CurrentFnCallsiteSymbols.lookup(&MBB); + auto CallsiteEndSymbols = CurrentFnCallsiteEndSymbols.lookup(&MBB); OutStreamer->AddComment("number of callsites"); - OutStreamer->emitULEB128IntValue(CallsiteSymbols.size()); - for (const MCSymbol *CallsiteSymbol : CallsiteSymbols) { + OutStreamer->emitULEB128IntValue(CallsiteEndSymbols.size()); + for (const MCSymbol *CallsiteEndSymbol : CallsiteEndSymbols) { // Emit the callsite offset. 
- emitLabelDifferenceAsULEB128(CallsiteSymbol, CurrentLabel); - CurrentLabel = CallsiteSymbol; + emitLabelDifferenceAsULEB128(CallsiteEndSymbol, CurrentLabel); + CurrentLabel = CallsiteEndSymbol; } } // Emit the offset to the end of the block, which can be used to compute @@ -1941,8 +1941,6 @@ void AsmPrinter::emitFunctionBody() { !MI.isDebugInstr()) { HasAnyRealCode = true; } - if (MI.isCall() && MF->getTarget().Options.BBAddrMap) - OutStreamer->emitLabel(createCallsiteSymbol(MBB)); // If there is a pre-instruction symbol, emit a label for it here. if (MCSymbol *S = MI.getPreInstrSymbol()) @@ -2064,6 +2062,9 @@ void AsmPrinter::emitFunctionBody() { break; } + if (MI.isCall() && MF->getTarget().Options.BBAddrMap) + OutStreamer->emitLabel(createCallsiteEndSymbol(MBB)); + if (TM.Options.EmitCallGraphSection && MI.isCall()) emitIndirectCalleeLabels(FuncInfo, CallSitesInfoMap, MI); @@ -2897,11 +2898,11 @@ MCSymbol *AsmPrinter::getMBBExceptionSym(const MachineBasicBlock &MBB) { return Res.first->second; } -MCSymbol *AsmPrinter::createCallsiteSymbol(const MachineBasicBlock &MBB) { +MCSymbol *AsmPrinter::createCallsiteEndSymbol(const MachineBasicBlock &MBB) { MCContext &Ctx = MF->getContext(); MCSymbol *Sym = Ctx.createTempSymbol("BB" + Twine(MF->getFunctionNumber()) + "_" + Twine(MBB.getNumber()) + "_CS"); - CurrentFnCallsiteSymbols[&MBB].push_back(Sym); + CurrentFnCallsiteEndSymbols[&MBB].push_back(Sym); return Sym; } @@ -2939,7 +2940,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurrentFnBegin = nullptr; CurrentFnBeginLocal = nullptr; CurrentSectionBeginSym = nullptr; - CurrentFnCallsiteSymbols.clear(); + CurrentFnCallsiteEndSymbols.clear(); MBBSectionRanges.clear(); MBBSectionExceptionSyms.clear(); bool NeedsLocalForSize = MAI->needsLocalForSize(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index c27f10077562..2090157a1a91 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ 
b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -3111,8 +3111,10 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, &AP](const DbgValueLocEntry &Entry, DIExpressionCursor &Cursor) -> bool { if (Entry.isInt()) { - if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed || - BT->getEncoding() == dwarf::DW_ATE_signed_char)) + if (BT && (BT->getEncoding() == dwarf::DW_ATE_boolean)) + DwarfExpr.addBooleanConstant(Entry.getInt()); + else if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed || + BT->getEncoding() == dwarf::DW_ATE_signed_char)) DwarfExpr.addSignedConstant(Entry.getInt()); else DwarfExpr.addUnsignedConstant(Entry.getInt()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index e684054ffa3e..8a30714db2fd 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -194,6 +194,15 @@ void DwarfExpression::addStackValue() { emitOp(dwarf::DW_OP_stack_value); } +void DwarfExpression::addBooleanConstant(int64_t Value) { + assert(isImplicitLocation() || isUnknownLocation()); + LocationKind = Implicit; + if (Value == 0) + emitOp(dwarf::DW_OP_lit0); + else + emitOp(dwarf::DW_OP_lit1); +} + void DwarfExpression::addSignedConstant(int64_t Value) { assert(isImplicitLocation() || isUnknownLocation()); LocationKind = Implicit; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 06809ab26387..700e0ec5813e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -229,6 +229,9 @@ public: /// This needs to be called last to commit any pending changes. void finalize(); + /// Emit a boolean constant. + void addBooleanConstant(int64_t Value); + /// Emit a signed constant. 
void addSignedConstant(int64_t Value); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index b03fac2d22a5..d76fd0c01020 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1351,6 +1351,13 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) { ContextDIE = &getUnitDie(); // Build the decl now to ensure it precedes the definition. getOrCreateSubprogramDIE(SPDecl); + // Check whether the DIE for SP has already been created after the call + // above. + // FIXME: Should the creation of definition subprogram DIE during + // the creation of declaration subprogram DIE be allowed? + // See https://github.com/llvm/llvm-project/pull/154636. + if (DIE *SPDie = getDIE(SP)) + return SPDie; } } @@ -1403,11 +1410,8 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, // Add the linkage name if we have one and it isn't in the Decl. StringRef LinkageName = SP->getLinkageName(); - assert(((LinkageName.empty() || DeclLinkageName.empty()) || - LinkageName == DeclLinkageName) && - "decl has a linkage name and it is different"); - if (DeclLinkageName.empty() && - // Always emit it for abstract subprograms. + // Always emit linkage name for abstract subprograms. 
+ if (DeclLinkageName != LinkageName && (DD->useAllLinkageNames() || DU->getAbstractScopeDIEs().lookup(SP))) addLinkageName(SPDie, LinkageName); diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 278dd6560e73..4931403ab83a 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -84,7 +84,7 @@ private: bool expandAtomicLoadToCmpXchg(LoadInst *LI); StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI); bool tryExpandAtomicStore(StoreInst *SI); - void expandAtomicStore(StoreInst *SI); + void expandAtomicStoreToXChg(StoreInst *SI); bool tryExpandAtomicRMW(AtomicRMWInst *AI); AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI); Value * @@ -537,6 +537,9 @@ bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) { case TargetLoweringBase::AtomicExpansionKind::NotAtomic: LI->setAtomic(AtomicOrdering::NotAtomic); return true; + case TargetLoweringBase::AtomicExpansionKind::CustomExpand: + TLI->emitExpandAtomicLoad(LI); + return true; default: llvm_unreachable("Unhandled case in tryExpandAtomicLoad"); } @@ -546,8 +549,11 @@ bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) { switch (TLI->shouldExpandAtomicStoreInIR(SI)) { case TargetLoweringBase::AtomicExpansionKind::None: return false; + case TargetLoweringBase::AtomicExpansionKind::CustomExpand: + TLI->emitExpandAtomicStore(SI); + return true; case TargetLoweringBase::AtomicExpansionKind::Expand: - expandAtomicStore(SI); + expandAtomicStoreToXChg(SI); return true; case TargetLoweringBase::AtomicExpansionKind::NotAtomic: SI->setAtomic(AtomicOrdering::NotAtomic); @@ -620,7 +626,7 @@ StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) { return NewSI; } -void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) { +void AtomicExpandImpl::expandAtomicStoreToXChg(StoreInst *SI) { // This function is only called on atomic stores that are too large to be // atomic if implemented as a native 
store. So we replace them by an // atomic swap, that can be implemented for example as a ldrex/strex on ARM @@ -741,7 +747,7 @@ bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) { } case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicRMWInst(AI); - case TargetLoweringBase::AtomicExpansionKind::Expand: + case TargetLoweringBase::AtomicExpansionKind::CustomExpand: TLI->emitExpandAtomicRMW(AI); return true; default: @@ -1454,7 +1460,8 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // If the cmpxchg doesn't actually need any ordering when it fails, we can // jump straight past that fence instruction (if it exists). - Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB); + Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB, + MDBuilder(F->getContext()).createLikelyBranchWeights()); Builder.SetInsertPoint(ReleasingStoreBB); if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier) @@ -1473,7 +1480,8 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB; Builder.CreateCondBr(StoreSuccess, SuccessBB, - CI->isWeak() ? FailureBB : RetryBB); + CI->isWeak() ? FailureBB : RetryBB, + MDBuilder(F->getContext()).createLikelyBranchWeights()); Builder.SetInsertPoint(ReleasedLoadBB); Value *SecondLoad; @@ -1486,7 +1494,9 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // If the cmpxchg doesn't actually need any ordering when it fails, we can // jump straight past that fence instruction (if it exists). - Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB); + Builder.CreateCondBr( + ShouldStore, TryStoreBB, NoStoreBB, + MDBuilder(F->getContext()).createLikelyBranchWeights()); // Update PHI node in TryStoreBB. 
LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB); } else @@ -1695,7 +1705,7 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return true; case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicCmpXchgInst(CI); - case TargetLoweringBase::AtomicExpansionKind::Expand: { + case TargetLoweringBase::AtomicExpansionKind::CustomExpand: { TLI->emitExpandAtomicCmpXchg(CI); return true; } diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 0e40a92fd8d6..9db4c9e5e280 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2618,22 +2618,9 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, LoopInfo &LI, bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { BasicBlock *BB = CI->getParent(); - // Lower inline assembly if we can. - // If we found an inline asm expession, and if the target knows how to - // lower it to normal LLVM code, do so now. - if (CI->isInlineAsm()) { - if (TLI->ExpandInlineAsm(CI)) { - // Avoid invalidating the iterator. - CurInstIterator = BB->begin(); - // Avoid processing instructions out of order, which could cause - // reuse before a value is defined. - SunkAddrs.clear(); - return true; - } - // Sink address computing for memory operands into the block. - if (optimizeInlineAsmInst(CI)) - return true; - } + // Sink address computing for memory operands into the block. 
+ if (CI->isInlineAsm() && optimizeInlineAsmInst(CI)) + return true; // Align the pointer arguments to this call if the target thinks it's a good // idea diff --git a/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp b/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp index 442ec3840930..5d7e2b59c204 100644 --- a/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp +++ b/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp @@ -45,7 +45,7 @@ static cl::opt<bool> EnableNoTrapAfterNoreturn( "after noreturn calls, even if --trap-unreachable is set.")); void CodeGenTargetMachineImpl::initAsmInfo() { - MRI.reset(TheTarget.createMCRegInfo(getTargetTriple().str())); + MRI.reset(TheTarget.createMCRegInfo(getTargetTriple())); assert(MRI && "Unable to create reg info"); MII.reset(TheTarget.createMCInstrInfo()); assert(MII && "Unable to create instruction info"); @@ -53,12 +53,12 @@ void CodeGenTargetMachineImpl::initAsmInfo() { // to some backends having subtarget feature dependent module level // code generation. This is similar to the hack in the AsmPrinter for // module level assembly etc. - STI.reset(TheTarget.createMCSubtargetInfo( - getTargetTriple().str(), getTargetCPU(), getTargetFeatureString())); + STI.reset(TheTarget.createMCSubtargetInfo(getTargetTriple(), getTargetCPU(), + getTargetFeatureString())); assert(STI && "Unable to create subtarget info"); - MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo( - *MRI, getTargetTriple().str(), Options.MCOptions); + MCAsmInfo *TmpAsmInfo = + TheTarget.createMCAsmInfo(*MRI, getTargetTriple(), Options.MCOptions); // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0, // and if the old one gets included then MCAsmInfo will be NULL and // we'll crash later. 
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 810dc29d728d..0522698adf18 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -68,7 +68,6 @@ CGOPT(bool, EnableUnsafeFPMath) CGOPT(bool, EnableNoInfsFPMath) CGOPT(bool, EnableNoNaNsFPMath) CGOPT(bool, EnableNoSignedZerosFPMath) -CGOPT(bool, EnableApproxFuncFPMath) CGOPT(bool, EnableNoTrappingFPMath) CGOPT(bool, EnableAIXExtendedAltivecABI) CGOPT(DenormalMode::DenormalModeKind, DenormalFPMath) @@ -245,12 +244,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(EnableNoSignedZerosFPMath); - static cl::opt<bool> EnableApproxFuncFPMath( - "enable-approx-func-fp-math", - cl::desc("Enable FP math optimizations that assume approx func"), - cl::init(false)); - CGBINDOPT(EnableApproxFuncFPMath); - static cl::opt<bool> EnableNoTrappingFPMath( "enable-no-trapping-fp-math", cl::desc("Enable setting the FP exceptions build " @@ -563,7 +556,6 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.NoInfsFPMath = getEnableNoInfsFPMath(); Options.NoNaNsFPMath = getEnableNoNaNsFPMath(); Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath(); - Options.ApproxFuncFPMath = getEnableApproxFuncFPMath(); Options.NoTrappingFPMath = getEnableNoTrappingFPMath(); DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath(); @@ -718,7 +710,6 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math"); HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math"); HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math"); - HANDLE_BOOL_ATTR(EnableApproxFuncFPMathView, "approx-func-fp-math"); if (DenormalFPMathView->getNumOccurrences() > 0 && !F.hasFnAttribute("denormal-fp-math")) { diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 
de95e0aaf2cb..7d355e6e365d 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -60,6 +60,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ComplexDeinterleavingPass.h" +#include "llvm/ADT/AllocatorList.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -263,6 +264,7 @@ public: }; using Addend = std::pair<Value *, bool>; + using AddendList = BumpPtrList<Addend>; using CompositeNode = ComplexDeinterleavingCompositeNode::CompositeNode; // Helper struct for holding info about potential partial multiplication @@ -291,7 +293,7 @@ private: SmallPtrSet<Instruction *, 16> FinalInstructions; /// Root instructions are instructions from which complex computation starts - std::map<Instruction *, CompositeNode *> RootToNode; + DenseMap<Instruction *, CompositeNode *> RootToNode; /// Topologically sorted root instructions SmallVector<Instruction *, 1> OrderedRoots; @@ -339,7 +341,7 @@ private: /// ComplexDeinterleavingOperation::ReductionPHI node replacement. It is then /// used in the ComplexDeinterleavingOperation::ReductionOperation node /// replacement process. - std::map<PHINode *, PHINode *> OldToNewPHI; + DenseMap<PHINode *, PHINode *> OldToNewPHI; CompositeNode *prepareCompositeNode(ComplexDeinterleavingOperation Operation, Value *R, Value *I) { @@ -417,28 +419,28 @@ private: /// and \p ImagAddens. If \p Accumulator is not null, add the result to it. /// Return nullptr if it is not possible to construct a complex number. /// \p Flags are needed to generate symmetric Add and Sub operations. 
- CompositeNode *identifyAdditions(std::list<Addend> &RealAddends, - std::list<Addend> &ImagAddends, + CompositeNode *identifyAdditions(AddendList &RealAddends, + AddendList &ImagAddends, std::optional<FastMathFlags> Flags, CompositeNode *Accumulator); /// Extract one addend that have both real and imaginary parts positive. - CompositeNode *extractPositiveAddend(std::list<Addend> &RealAddends, - std::list<Addend> &ImagAddends); + CompositeNode *extractPositiveAddend(AddendList &RealAddends, + AddendList &ImagAddends); /// Determine if sum of multiplications of complex numbers can be formed from /// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result /// to it. Return nullptr if it is not possible to construct a complex number. - CompositeNode *identifyMultiplications(std::vector<Product> &RealMuls, - std::vector<Product> &ImagMuls, + CompositeNode *identifyMultiplications(SmallVectorImpl<Product> &RealMuls, + SmallVectorImpl<Product> &ImagMuls, CompositeNode *Accumulator); /// Go through pairs of multiplication (one Real and one Imag) and find all /// possible candidates for partial multiplication and put them into \p /// Candidates. Returns true if all Product has pair with common operand - bool collectPartialMuls(const std::vector<Product> &RealMuls, - const std::vector<Product> &ImagMuls, - std::vector<PartialMulCandidate> &Candidates); + bool collectPartialMuls(ArrayRef<Product> RealMuls, + ArrayRef<Product> ImagMuls, + SmallVectorImpl<PartialMulCandidate> &Candidates); /// If the code is compiled with -Ofast or expressions have `reassoc` flag, /// the order of complex computation operations may be significantly altered, @@ -1255,8 +1257,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, // Collect multiplications and addend instructions from the given instruction // while traversing it operands. Additionally, verify that all instructions // have the same fast math flags. 
- auto Collect = [&Flags](Instruction *Insn, std::vector<Product> &Muls, - std::list<Addend> &Addends) -> bool { + auto Collect = [&Flags](Instruction *Insn, SmallVectorImpl<Product> &Muls, + AddendList &Addends) -> bool { SmallVector<PointerIntPair<Value *, 1, bool>> Worklist = {{Insn, true}}; SmallPtrSet<Value *, 8> Visited; while (!Worklist.empty()) { @@ -1336,8 +1338,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, return true; }; - std::vector<Product> RealMuls, ImagMuls; - std::list<Addend> RealAddends, ImagAddends; + SmallVector<Product> RealMuls, ImagMuls; + AddendList RealAddends, ImagAddends; if (!Collect(Real, RealMuls, RealAddends) || !Collect(Imag, ImagMuls, ImagAddends)) return nullptr; @@ -1371,8 +1373,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, } bool ComplexDeinterleavingGraph::collectPartialMuls( - const std::vector<Product> &RealMuls, const std::vector<Product> &ImagMuls, - std::vector<PartialMulCandidate> &PartialMulCandidates) { + ArrayRef<Product> RealMuls, ArrayRef<Product> ImagMuls, + SmallVectorImpl<PartialMulCandidate> &PartialMulCandidates) { // Helper function to extract a common operand from two products auto FindCommonInstruction = [](const Product &Real, const Product &Imag) -> Value * { @@ -1423,18 +1425,18 @@ bool ComplexDeinterleavingGraph::collectPartialMuls( ComplexDeinterleavingGraph::CompositeNode * ComplexDeinterleavingGraph::identifyMultiplications( - std::vector<Product> &RealMuls, std::vector<Product> &ImagMuls, + SmallVectorImpl<Product> &RealMuls, SmallVectorImpl<Product> &ImagMuls, CompositeNode *Accumulator = nullptr) { if (RealMuls.size() != ImagMuls.size()) return nullptr; - std::vector<PartialMulCandidate> Info; + SmallVector<PartialMulCandidate> Info; if (!collectPartialMuls(RealMuls, ImagMuls, Info)) return nullptr; // Map to store common instruction to node pointers - std::map<Value *, CompositeNode *> CommonToNode; - std::vector<bool> Processed(Info.size(), 
false); + DenseMap<Value *, CompositeNode *> CommonToNode; + SmallVector<bool> Processed(Info.size(), false); for (unsigned I = 0; I < Info.size(); ++I) { if (Processed[I]) continue; @@ -1463,8 +1465,8 @@ ComplexDeinterleavingGraph::identifyMultiplications( } } - std::vector<bool> ProcessedReal(RealMuls.size(), false); - std::vector<bool> ProcessedImag(ImagMuls.size(), false); + SmallVector<bool> ProcessedReal(RealMuls.size(), false); + SmallVector<bool> ProcessedImag(ImagMuls.size(), false); CompositeNode *Result = Accumulator; for (auto &PMI : Info) { if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx]) @@ -1580,7 +1582,7 @@ ComplexDeinterleavingGraph::identifyMultiplications( ComplexDeinterleavingGraph::CompositeNode * ComplexDeinterleavingGraph::identifyAdditions( - std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends, + AddendList &RealAddends, AddendList &ImagAddends, std::optional<FastMathFlags> Flags, CompositeNode *Accumulator = nullptr) { if (RealAddends.size() != ImagAddends.size()) return nullptr; @@ -1671,8 +1673,8 @@ ComplexDeinterleavingGraph::identifyAdditions( } ComplexDeinterleavingGraph::CompositeNode * -ComplexDeinterleavingGraph::extractPositiveAddend( - std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends) { +ComplexDeinterleavingGraph::extractPositiveAddend(AddendList &RealAddends, + AddendList &ImagAddends) { for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) { for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { auto [R, IsPositiveR] = *ItR; diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp index 1c1047c1ce18..9cc6c6a706c5 100644 --- a/llvm/lib/CodeGen/ExpandFp.cpp +++ b/llvm/lib/CodeGen/ExpandFp.cpp @@ -16,18 +16,29 @@ #include "llvm/CodeGen/ExpandFp.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/SimplifyQuery.h" +#include 
"llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <optional> + +#define DEBUG_TYPE "expand-fp" using namespace llvm; @@ -37,6 +48,359 @@ static cl::opt<unsigned> cl::desc("fp convert instructions on integers with " "more than <N> bits are expanded.")); +namespace { +/// This class implements a precise expansion of the frem instruction. +/// The generated code is based on the fmod implementation in the AMD device +/// libs. +class FRemExpander { + /// The IRBuilder to use for the expansion. + IRBuilder<> &B; + + /// Floating point type of the return value and the arguments of the FRem + /// instructions that should be expanded. + Type *FremTy; + + /// Floating point type to use for the computation. This may be + /// wider than the \p FremTy. + Type *ComputeFpTy; + + /// Integer type used to hold the exponents returned by frexp. + Type *ExTy; + + /// How many bits of the quotient to compute per iteration of the + /// algorithm, stored as a value of type \p ExTy. + Value *Bits; + + /// Constant 1 of type \p ExTy. + Value *One; + +public: + static bool canExpandType(Type *Ty) { + // TODO The expansion should work for other floating point types + // as well, but this would require additional testing. 
+ return Ty->isIEEELikeFPTy() && !Ty->isBFloatTy() && !Ty->isFP128Ty(); + } + + static FRemExpander create(IRBuilder<> &B, Type *Ty) { + assert(canExpandType(Ty)); + + // The type to use for the computation of the remainder. This may be + // wider than the input/result type which affects the ... + Type *ComputeTy = Ty; + // ... maximum number of iterations of the remainder computation loop + // to use. This value is for the case in which the computation + // uses the same input/result type. + unsigned MaxIter = 2; + + if (Ty->isHalfTy()) { + // Use the wider type and fewer iterations. + ComputeTy = B.getFloatTy(); + MaxIter = 1; + } + + unsigned Precision = + llvm::APFloat::semanticsPrecision(Ty->getFltSemantics()); + return FRemExpander{B, Ty, Precision / MaxIter, ComputeTy}; + } + + /// Build the FRem expansion for the numerator \p X and the + /// denominator \p Y. The type of X and Y must match \p FremTy. The + /// code will be generated at the insertion point of \p B and the + /// insertion point will be reset at exit. + Value *buildFRem(Value *X, Value *Y, std::optional<SimplifyQuery> &SQ) const; + + /// Build an approximate FRem expansion for the numerator \p X and + /// the denominator \p Y at the insertion point of builder \p B. + /// The type of X and Y must match \p FremTy. + Value *buildApproxFRem(Value *X, Value *Y) const; + +private: + FRemExpander(IRBuilder<> &B, Type *FremTy, unsigned Bits, Type *ComputeFpTy) + : B(B), FremTy(FremTy), ComputeFpTy(ComputeFpTy), ExTy(B.getInt32Ty()), + Bits(ConstantInt::get(ExTy, Bits)), One(ConstantInt::get(ExTy, 1)) {}; + + Value *createRcp(Value *V, const Twine &Name) const { + // Leave it to later optimizations to turn this into an rcp + // instruction if available. + return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name); + } + + // Helper function to build the UPDATE_AX code which is common to the + // loop body and the "final iteration".
+ Value *buildUpdateAx(Value *Ax, Value *Ay, Value *Ayinv) const { + // Build: + // float q = rint(ax * ayinv); + // ax = fma(-q, ay, ax); + // int clt = ax < 0.0f; + // float axp = ax + ay; + // ax = clt ? axp : ax; + Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv), + {}, "q"); + Value *AxUpdate = B.CreateFMA(B.CreateFNeg(Q), Ay, Ax, {}, "ax"); + Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate, + ConstantFP::getZero(ComputeFpTy), "clt"); + Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp"); + return B.CreateSelect(Clt, Axp, AxUpdate, "ax"); + } + + /// Build code to extract the exponent and mantissa of \p Src. + /// Return a pair whose first element is the mantissa scaled via ldexp + /// by the given \p NewExp, and whose second element is the exponent + /// minus one for use as a loop bound. + std::pair<Value *, Value *> buildExpAndPower(Value *Src, Value *NewExp, + const Twine &ExName, + const Twine &PowName) const { + // Build: + // ExName = frexp_exp(Src) - 1; + // PowName = fldexp(frexp_mant(Src), NewExp); + Type *Ty = Src->getType(); + Type *ExTy = B.getInt32Ty(); + Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src); + Value *Mant = B.CreateExtractValue(Frexp, {0}); + Value *Exp = B.CreateExtractValue(Frexp, {1}); + + Exp = B.CreateSub(Exp, One, ExName); + Value *Pow = B.CreateLdexp(Mant, NewExp, {}, PowName); + + return {Pow, Exp}; + } + + /// Build the main computation of the remainder for the case in which + /// Ax > Ay, where Ax = |X|, Ay = |Y|, and X is the numerator and Y the + /// denominator. Add the incoming edge from the computation result + /// to \p RetPhi.
+ void buildRemainderComputation(Value *AxInitial, Value *AyInitial, Value *X, + PHINode *RetPhi, FastMathFlags FMF) const { + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(FMF); + + // Build: + // ex = frexp_exp(ax) - 1; + // ax = fldexp(frexp_mant(ax), bits); + // ey = frexp_exp(ay) - 1; + // ay = fldexp(frexp_mant(ay), 1); + auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax"); + auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay"); + + // Build: + // int nb = ex - ey; + // float ayinv = 1.0/ay; + Value *Nb = B.CreateSub(Ex, Ey, "nb"); + Value *Ayinv = createRcp(Ay, "ayinv"); + + // Build: while (nb > bits) + BasicBlock *PreheaderBB = B.GetInsertBlock(); + Function *Fun = PreheaderBB->getParent(); + auto *LoopBB = BasicBlock::Create(B.getContext(), "frem.loop_body", Fun); + auto *ExitBB = BasicBlock::Create(B.getContext(), "frem.loop_exit", Fun); + + B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, Nb, Bits), LoopBB, ExitBB); + + // Build loop body: + // UPDATE_AX + // ax = fldexp(ax, bits); + // nb -= bits; + // One iteration of the loop is factored out. The code shared by + // the loop and this "iteration" is denoted by UPDATE_AX.
+ B.SetInsertPoint(LoopBB); + PHINode *NbIv = B.CreatePHI(Nb->getType(), 2, "nb_iv"); + NbIv->addIncoming(Nb, PreheaderBB); + + auto *AxPhi = B.CreatePHI(ComputeFpTy, 2, "ax_loop_phi"); + AxPhi->addIncoming(Ax, PreheaderBB); + + Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv); + AxPhiUpdate = B.CreateLdexp(AxPhiUpdate, Bits, {}, "ax_update"); + AxPhi->addIncoming(AxPhiUpdate, LoopBB); + NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB); + + // The back-edge test and the exit PHIs below use the PHI values + // (NbIv, AxPhi), not the updates computed in this iteration. + B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, NbIv, Bits), LoopBB, ExitBB); + + // Build final iteration + // ax = fldexp(ax, nb - bits + 1); + // UPDATE_AX + B.SetInsertPoint(ExitBB); + + auto *AxPhiExit = B.CreatePHI(ComputeFpTy, 2, "ax_exit_phi"); + AxPhiExit->addIncoming(Ax, PreheaderBB); + AxPhiExit->addIncoming(AxPhi, LoopBB); + auto *NbExitPhi = B.CreatePHI(Nb->getType(), 2, "nb_exit_phi"); + NbExitPhi->addIncoming(NbIv, LoopBB); + NbExitPhi->addIncoming(Nb, PreheaderBB); + + Value *AxFinal = B.CreateLdexp( + AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), {}, "ax"); + AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv); + + // Build: + // ax = fldexp(ax, ey); + // ret = copysign(ax,x); + AxFinal = B.CreateLdexp(AxFinal, Ey, {}, "ax"); + if (ComputeFpTy != FremTy) + AxFinal = B.CreateFPTrunc(AxFinal, FremTy); + Value *Ret = B.CreateCopySign(AxFinal, X); + + RetPhi->addIncoming(Ret, ExitBB); + } + + /// Build the else-branch of the conditional in the FRem + /// expansion, i.e. the case in which Ax <= Ay, where Ax = |X|, Ay + /// = |Y|, and X is the numerator and Y the denominator. Add the + /// incoming edge from the result to \p RetPhi. + void buildElseBranch(Value *Ax, Value *Ay, Value *X, PHINode *RetPhi) const { + // Build: + // ret = ax == ay ?
copysign(0.0f, x) : x; + Value *ZeroWithXSign = B.CreateCopySign(ConstantFP::getZero(FremTy), X); + Value *Ret = B.CreateSelect(B.CreateFCmpOEQ(Ax, Ay), ZeroWithXSign, X); + + RetPhi->addIncoming(Ret, B.GetInsertBlock()); + } + + /// Return a value that is NaN if one of the corner cases concerning + /// the inputs \p X and \p Y is detected, and \p Ret otherwise. + Value *handleInputCornerCases(Value *Ret, Value *X, Value *Y, + std::optional<SimplifyQuery> &SQ, + bool NoInfs) const { + // Build: + // ret = (y == 0.0f || isnan(y)) ? QNAN : ret; + // ret = isfinite(x) ? ret : QNAN; + Value *Nan = ConstantFP::getQNaN(FremTy); + Ret = B.CreateSelect(B.CreateFCmpUEQ(Y, ConstantFP::getZero(FremTy)), Nan, + Ret); + Value *XFinite = + NoInfs || (SQ && isKnownNeverInfinity(X, *SQ)) + ? B.getTrue() + : B.CreateFCmpULT(B.CreateUnaryIntrinsic(Intrinsic::fabs, X), + ConstantFP::getInfinity(FremTy)); + Ret = B.CreateSelect(XFinite, Ret, Nan); + + return Ret; + } +}; + +Value *FRemExpander::buildApproxFRem(Value *X, Value *Y) const { + IRBuilder<>::FastMathFlagGuard Guard(B); + // Propagating the approximate functions flag to the + // division leads to an unacceptable drop in precision + // on AMDGPU. + // TODO Find out if any flags might be worth propagating. 
+ B.clearFastMathFlags(); + + Value *Quot = B.CreateFDiv(X, Y); + Value *Trunc = B.CreateUnaryIntrinsic(Intrinsic::trunc, Quot, {}); + Value *Neg = B.CreateFNeg(Trunc); + + return B.CreateFMA(Neg, Y, X); +} + +Value *FRemExpander::buildFRem(Value *X, Value *Y, + std::optional<SimplifyQuery> &SQ) const { + assert(X->getType() == FremTy && Y->getType() == FremTy); + + FastMathFlags FMF = B.getFastMathFlags(); + + // This function generates the following code structure: + // if (abs(x) > abs(y)) + // { ret = compute remainder } + // else + // { ret = x or 0 with sign of x } + // Adjust ret to NaN/inf in input + // return ret + Value *Ax = B.CreateUnaryIntrinsic(Intrinsic::fabs, X, {}, "ax"); + Value *Ay = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y, {}, "ay"); + if (ComputeFpTy != X->getType()) { + Ax = B.CreateFPExt(Ax, ComputeFpTy, "ax"); + Ay = B.CreateFPExt(Ay, ComputeFpTy, "ay"); + } + Value *AxAyCmp = B.CreateFCmpOGT(Ax, Ay); + + PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret"); + Value *Ret = RetPhi; + + // We would return NaN in all corner cases handled here. + // Hence, if NaNs are excluded, keep the result as it is. + if (!FMF.noNaNs()) + Ret = handleInputCornerCases(Ret, X, Y, SQ, FMF.noInfs()); + + Function *Fun = B.GetInsertBlock()->getParent(); + auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun); + auto *ElseBB = BasicBlock::Create(B.getContext(), "frem.else", Fun); + SplitBlockAndInsertIfThenElse(AxAyCmp, RetPhi, &ThenBB, &ElseBB); + + auto SavedInsertPt = B.GetInsertPoint(); + + // Build remainder computation for "then" branch + // + // The ordered comparison ensures that ax and ay are not NaNs + // in the then-branch. Furthermore, y cannot be an infinity and the + // check at the end of the function ensures that the result will not + // be used if x is an infinity. 
+ FastMathFlags ComputeFMF = FMF; + ComputeFMF.setNoInfs(); + ComputeFMF.setNoNaNs(); + + B.SetInsertPoint(ThenBB); + // Use ComputeFMF (FMF plus nnan/ninf) for the then-branch computation; + // passing plain FMF would leave the strengthened flags unused. + buildRemainderComputation(Ax, Ay, X, RetPhi, ComputeFMF); + B.CreateBr(RetPhi->getParent()); + + // Build "else"-branch + B.SetInsertPoint(ElseBB); + buildElseBranch(Ax, Ay, X, RetPhi); + B.CreateBr(RetPhi->getParent()); + + B.SetInsertPoint(SavedInsertPt); + + return Ret; +} +} // namespace + +static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) { + LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n'); + + Type *ReturnTy = I.getType(); + assert(FRemExpander::canExpandType(ReturnTy->getScalarType())); + + FastMathFlags FMF = I.getFastMathFlags(); + // TODO Make use of those flags for optimization? + FMF.setAllowReciprocal(false); + FMF.setAllowContract(false); + + IRBuilder<> B(&I); + B.setFastMathFlags(FMF); + B.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *ElemTy = ReturnTy->getScalarType(); + const FRemExpander Expander = FRemExpander::create(B, ElemTy); + + Value *Ret; + if (ReturnTy->isFloatingPointTy()) + Ret = FMF.approxFunc() + ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1)) + : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ); + else { + auto *VecTy = cast<FixedVectorType>(ReturnTy); + + // This could use SplitBlockAndInsertForEachLane but the interface + // is a bit awkward for a constant number of elements and it will + // boil down to the same code. + // TODO Expand the FRem instruction only once and reuse the code. + Value *Nums = I.getOperand(0); + Value *Denums = I.getOperand(1); + Ret = PoisonValue::get(I.getType()); + for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) { + Value *Num = B.CreateExtractElement(Nums, I); + Value *Denum = B.CreateExtractElement(Denums, I); + Value *Rem = FMF.approxFunc() ?
Expander.buildApproxFRem(Num, Denum) + : Expander.buildFRem(Num, Denum, SQ); + Ret = B.CreateInsertElement(Ret, Rem, I); + } + } + + I.replaceAllUsesWith(Ret); + Ret->takeName(&I); + I.eraseFromParent(); + + return true; +} // clang-format off: preserve formatting of the following example /// Generate code to convert a fp number to integer, replacing FPToS(U)I with @@ -64,8 +428,8 @@ static cl::opt<unsigned> /// br i1 %cmp6.not, label %if.end12, label %if.then8 /// /// if.then8: ; preds = %if.end -/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 -9223372036854775808 -/// br label %cleanup +/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 +/// -9223372036854775808 br label %cleanup /// /// if.end12: ; preds = %if.end /// %cmp13 = icmp ult i64 %shr, 150 @@ -83,9 +447,10 @@ static cl::opt<unsigned> /// %mul19 = mul nsw i64 %shl, %conv /// br label %cleanup /// -/// cleanup: ; preds = %entry, %if.else, %if.then15, %if.then8 -/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ %mul19, %if.else ], [ 0, %entry ] -/// ret i64 %retval.0 +/// cleanup: ; preds = %entry, +/// %if.else, %if.then15, %if.then8 +/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ +/// %mul19, %if.else ], [ 0, %entry ] ret i64 %retval.0 /// } /// /// Replace fp to integer with generated code. 
@@ -272,13 +637,11 @@ static void expandFPToI(Instruction *FPToI) { /// %or = or i64 %shr6, %conv11 /// br label %sw.epilog /// -/// sw.epilog: ; preds = %sw.default, %if.then4, %sw.bb -/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, %sw.bb ] -/// %1 = lshr i64 %a.addr.0, 2 -/// %2 = and i64 %1, 1 -/// %or16 = or i64 %2, %a.addr.0 -/// %inc = add nsw i64 %or16, 1 -/// %3 = and i64 %inc, 67108864 +/// sw.epilog: ; preds = %sw.default, +/// %if.then4, %sw.bb +/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, +/// %sw.bb ] %1 = lshr i64 %a.addr.0, 2 %2 = and i64 %1, 1 %or16 = or i64 %2, +/// %a.addr.0 %inc = add nsw i64 %or16, 1 %3 = and i64 %inc, 67108864 /// %tobool.not = icmp eq i64 %3, 0 /// %spec.select.v = select i1 %tobool.not, i64 2, i64 3 /// %spec.select = ashr i64 %inc, %spec.select.v @@ -291,7 +654,8 @@ static void expandFPToI(Instruction *FPToI) { /// %shl25 = shl i64 %sub, %sh_prom24 /// br label %if.end26 /// -/// if.end26: ; preds = %sw.epilog, %if.else +/// if.end26: ; preds = %sw.epilog, +/// %if.else /// %a.addr.1 = phi i64 [ %shl25, %if.else ], [ %spec.select, %sw.epilog ] /// %e.0 = phi i32 [ %sub2, %if.else ], [ %spec.select56, %sw.epilog ] /// %conv27 = trunc i64 %shr to i32 @@ -305,7 +669,8 @@ static void expandFPToI(Instruction *FPToI) { /// %4 = bitcast i32 %or33 to float /// br label %return /// -/// return: ; preds = %entry, %if.end26 +/// return: ; preds = %entry, +/// %if.end26 /// %retval.0 = phi float [ %4, %if.end26 ], [ 0.000000e+00, %entry ] /// ret float %retval.0 /// } @@ -594,7 +959,38 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) { I->eraseFromParent(); } -static bool runImpl(Function &F, const TargetLowering &TLI) { +// This covers all floating point types; more than we need here. +// TODO Move somewhere else for general use? +/// Return the Libcall for a frem instruction of +/// type \p Ty. 
+static RTLIB::Libcall fremToLibcall(Type *Ty) { + assert(Ty->isFloatingPointTy()); + if (Ty->isFloatTy() || Ty->is16bitFPTy()) + return RTLIB::REM_F32; + if (Ty->isDoubleTy()) + return RTLIB::REM_F64; + if (Ty->isFP128Ty()) + return RTLIB::REM_F128; + if (Ty->isX86_FP80Ty()) + return RTLIB::REM_F80; + if (Ty->isPPC_FP128Ty()) + return RTLIB::REM_PPCF128; + + llvm_unreachable("Unknown floating point type"); +} + +/* Return true if, according to \p LibInfo, the target either directly + supports the frem instruction for the \p Ty, has a custom lowering, + or uses a libcall. */ +static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) { + if (!TLI.isOperationExpand(ISD::FREM, EVT::getEVT(Ty))) + return true; + + return TLI.getLibcallName(fremToLibcall(Ty->getScalarType())); +} + +static bool runImpl(Function &F, const TargetLowering &TLI, + AssumptionCache *AC) { SmallVector<Instruction *, 4> Replace; SmallVector<Instruction *, 4> ReplaceVector; bool Modified = false; @@ -609,6 +1005,21 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { for (auto &I : instructions(F)) { switch (I.getOpcode()) { + case Instruction::FRem: { + Type *Ty = I.getType(); + // TODO: This pass doesn't handle scalable vectors. + if (Ty->isScalableTy()) + continue; + + if (targetSupportsFrem(TLI, Ty) || + !FRemExpander::canExpandType(Ty->getScalarType())) + continue; + + Replace.push_back(&I); + Modified = true; + + break; + } case Instruction::FPToUI: case Instruction::FPToSI: { // TODO: This pass doesn't handle scalable vectors. 
@@ -659,8 +1070,20 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { while (!Replace.empty()) { Instruction *I = Replace.pop_back_val(); - if (I->getOpcode() == Instruction::FPToUI || - I->getOpcode() == Instruction::FPToSI) { + if (I->getOpcode() == Instruction::FRem) { + auto SQ = [&]() -> std::optional<SimplifyQuery> { + if (AC) { + auto Res = std::make_optional<SimplifyQuery>( + I->getModule()->getDataLayout(), I); + Res->AC = AC; + return Res; + } + return {}; + }(); + + expandFRem(cast<BinaryOperator>(*I), SQ); + } else if (I->getOpcode() == Instruction::FPToUI || + I->getOpcode() == Instruction::FPToSI) { expandFPToI(I); } else { expandIToFP(I); @@ -672,31 +1095,58 @@ static bool runImpl(Function &F, const TargetLowering &TLI) { namespace { class ExpandFpLegacyPass : public FunctionPass { + CodeGenOptLevel OptLevel; + public: static char ID; - ExpandFpLegacyPass() : FunctionPass(ID) { + ExpandFpLegacyPass(CodeGenOptLevel OptLevel) + : FunctionPass(ID), OptLevel(OptLevel) { initializeExpandFpLegacyPassPass(*PassRegistry::getPassRegistry()); } + ExpandFpLegacyPass() : ExpandFpLegacyPass(CodeGenOptLevel::None) {}; + bool runOnFunction(Function &F) override { auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering(); - return runImpl(F, *TLI); + AssumptionCache *AC = nullptr; + + if (OptLevel != CodeGenOptLevel::None && !F.hasOptNone()) + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + return runImpl(F, *TLI, AC); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetPassConfig>(); + if (OptLevel != CodeGenOptLevel::None) + AU.addRequired<AssumptionCacheTracker>(); AU.addPreserved<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } }; } // namespace +ExpandFpPass::ExpandFpPass(const TargetMachine *TM, CodeGenOptLevel OptLevel) + : TM(TM), OptLevel(OptLevel) {} + +void ExpandFpPass::printPipeline( + raw_ostream &OS, 
function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<ExpandFpPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + OS << '<'; + OS << "O" << (int)OptLevel; + OS << '>'; +} + PreservedAnalyses ExpandFpPass::run(Function &F, FunctionAnalysisManager &FAM) { const TargetSubtargetInfo *STI = TM->getSubtargetImpl(F); - return runImpl(F, *STI->getTargetLowering()) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); + auto &TLI = *STI->getTargetLowering(); + AssumptionCache *AC = nullptr; + if (OptLevel != CodeGenOptLevel::None) + AC = &FAM.getResult<AssumptionAnalysis>(F); + return runImpl(F, TLI, AC) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } char ExpandFpLegacyPass::ID = 0; @@ -704,4 +1154,6 @@ INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp", "Expand certain fp instructions", false, false) INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false) -FunctionPass *llvm::createExpandFpPass() { return new ExpandFpLegacyPass(); } +FunctionPass *llvm::createExpandFpPass(CodeGenOptLevel OptLevel) { + return new ExpandFpLegacyPass(OptLevel); +} diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 753c65600770..03abc042e556 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -150,9 +150,8 @@ struct CachingVPExpander { ElementCount ElemCount); /// If needed, folds the EVL in the mask operand and discards the EVL - /// parameter. Returns a pair of the value of the intrinsic after the change - /// (if any) and whether the mask was actually folded. - std::pair<Value *, bool> foldEVLIntoMask(VPIntrinsic &VPI); + /// parameter. Returns true if the mask was actually folded. + bool foldEVLIntoMask(VPIntrinsic &VPI); /// "Remove" the %evl parameter of \p PI by setting it to the static vector /// length of the operation. 
Returns true if the %evl (if any) was effectively @@ -160,34 +159,31 @@ struct CachingVPExpander { bool discardEVLParameter(VPIntrinsic &PI); /// Lower this VP binary operator to a unpredicated binary operator. - Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, - VPIntrinsic &PI); + bool expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &PI); /// Lower this VP int call to a unpredicated int call. - Value *expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI); + bool expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI); /// Lower this VP fp call to a unpredicated fp call. - Value *expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI, - unsigned UnpredicatedIntrinsicID); + bool expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI, + unsigned UnpredicatedIntrinsicID); /// Lower this VP reduction to a call to an unpredicated reduction intrinsic. - Value *expandPredicationInReduction(IRBuilder<> &Builder, - VPReductionIntrinsic &PI); + bool expandPredicationInReduction(IRBuilder<> &Builder, + VPReductionIntrinsic &PI); /// Lower this VP cast operation to a non-VP intrinsic. - Value *expandPredicationToCastIntrinsic(IRBuilder<> &Builder, - VPIntrinsic &VPI); + bool expandPredicationToCastIntrinsic(IRBuilder<> &Builder, VPIntrinsic &VPI); /// Lower this VP memory operation to a non-VP intrinsic. - Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, - VPIntrinsic &VPI); + bool expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, + VPIntrinsic &VPI); /// Lower this VP comparison to a call to an unpredicated comparison. - Value *expandPredicationInComparison(IRBuilder<> &Builder, - VPCmpIntrinsic &PI); + bool expandPredicationInComparison(IRBuilder<> &Builder, VPCmpIntrinsic &PI); /// Query TTI and expand the vector predication in \p P accordingly. 
- Value *expandPredication(VPIntrinsic &PI); + bool expandPredication(VPIntrinsic &PI); /// Determine how and whether the VPIntrinsic \p VPI shall be expanded. This /// overrides TTI with the cl::opts listed at the top of this file. @@ -227,9 +223,8 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder, return Builder.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat); } -Value * -CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder, - VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder, + VPIntrinsic &VPI) { assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); @@ -261,14 +256,14 @@ CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder, Value *NewBinOp = Builder.CreateBinOp(OC, Op0, Op1, VPI.getName()); replaceOperation(*NewBinOp, VPI); - return NewBinOp; + return true; } -Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder, - VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder, + VPIntrinsic &VPI) { std::optional<unsigned> FID = VPI.getFunctionalIntrinsicID(); if (!FID) - return nullptr; + return false; SmallVector<Value *, 2> Argument; for (unsigned i = 0; i < VPI.getNumOperands() - 3; i++) { Argument.push_back(VPI.getOperand(i)); @@ -276,10 +271,10 @@ Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder, Value *NewOp = Builder.CreateIntrinsic(FID.value(), {VPI.getType()}, Argument, /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); - return NewOp; + return true; } -Value *CachingVPExpander::expandPredicationToFPCall( +bool CachingVPExpander::expandPredicationToFPCall( IRBuilder<> &Builder, VPIntrinsic &VPI, unsigned UnpredicatedIntrinsicID) { assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); @@ -297,7 +292,7 @@ 
Value *CachingVPExpander::expandPredicationToFPCall( UnpredicatedIntrinsicID, {VPI.getType()}, Argument, /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); - return NewOp; + return true; } case Intrinsic::fma: case Intrinsic::fmuladd: @@ -315,11 +310,11 @@ Value *CachingVPExpander::expandPredicationToFPCall( else NewOp = Builder.CreateCall(Fn, {Op0, Op1, Op2}, VPI.getName()); replaceOperation(*NewOp, VPI); - return NewOp; + return true; } } - return nullptr; + return false; } static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, @@ -331,9 +326,8 @@ static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, return getReductionIdentity(RdxID, EltTy, FMF); } -Value * -CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, - VPReductionIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationInReduction( + IRBuilder<> &Builder, VPReductionIntrinsic &VPI) { assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); @@ -391,11 +385,11 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, } replaceOperation(*Reduction, VPI); - return Reduction; + return true; } -Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder, - VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder, + VPIntrinsic &VPI) { Intrinsic::ID VPID = VPI.getIntrinsicID(); unsigned CastOpcode = VPIntrinsic::getFunctionalOpcodeForVP(VPID).value(); assert(Instruction::isCast(CastOpcode)); @@ -404,12 +398,11 @@ Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder, VPI.getType(), VPI.getName()); replaceOperation(*CastOp, VPI); - return CastOp; + return true; } -Value * -CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, - VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, + 
VPIntrinsic &VPI) { assert(VPI.canIgnoreVectorLengthParam()); const auto &DL = VPI.getDataLayout(); @@ -469,11 +462,11 @@ CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder, assert(NewMemoryInst); replaceOperation(*NewMemoryInst, VPI); - return NewMemoryInst; + return true; } -Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder, - VPCmpIntrinsic &VPI) { +bool CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder, + VPCmpIntrinsic &VPI) { assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); @@ -487,7 +480,7 @@ Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder, auto *NewCmp = Builder.CreateCmp(Pred, Op0, Op1); replaceOperation(*NewCmp, VPI); - return NewCmp; + return true; } bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { @@ -516,17 +509,24 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { return true; } -std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { +bool CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n'); IRBuilder<> Builder(&VPI); // Ineffective %evl parameter and so nothing to do here. if (VPI.canIgnoreVectorLengthParam()) - return {&VPI, false}; + return false; // Only VP intrinsics can have an %evl parameter. 
Value *OldMaskParam = VPI.getMaskParam(); + if (!OldMaskParam) { + assert((VPI.getIntrinsicID() == Intrinsic::vp_merge || + VPI.getIntrinsicID() == Intrinsic::vp_select) && + "Unexpected VP intrinsic without mask operand"); + OldMaskParam = VPI.getArgOperand(0); + } + Value *OldEVLParam = VPI.getVectorLengthParam(); assert(OldMaskParam && "no mask param to fold the vl param into"); assert(OldEVLParam && "no EVL param to fold away"); @@ -538,7 +538,11 @@ std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { ElementCount ElemCount = VPI.getStaticVectorLength(); Value *VLMask = convertEVLToMask(Builder, OldEVLParam, ElemCount); Value *NewMaskParam = Builder.CreateAnd(VLMask, OldMaskParam); - VPI.setMaskParam(NewMaskParam); + if (VPI.getIntrinsicID() == Intrinsic::vp_merge || + VPI.getIntrinsicID() == Intrinsic::vp_select) + VPI.setArgOperand(0, NewMaskParam); + else + VPI.setMaskParam(NewMaskParam); // Drop the %evl parameter. discardEVLParameter(VPI); @@ -546,10 +550,10 @@ std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { "transformation did not render the evl param ineffective!"); // Reassess the modified instruction. 
- return {&VPI, true}; + return true; } -Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { +bool CachingVPExpander::expandPredication(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Lowering to unpredicated op: " << VPI << '\n'); IRBuilder<> Builder(&VPI); @@ -566,9 +570,8 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { if (auto *VPCmp = dyn_cast<VPCmpIntrinsic>(&VPI)) return expandPredicationInComparison(Builder, *VPCmp); - if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID())) { + if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID())) return expandPredicationToCastIntrinsic(Builder, VPI); - } switch (VPI.getIntrinsicID()) { default: @@ -578,6 +581,14 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { replaceOperation(*NewNegOp, VPI); return NewNegOp; } + case Intrinsic::vp_select: + case Intrinsic::vp_merge: { + assert(maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()); + Value *NewSelectOp = Builder.CreateSelect( + VPI.getOperand(0), VPI.getOperand(1), VPI.getOperand(2), VPI.getName()); + replaceOperation(*NewSelectOp, VPI); + return NewSelectOp; + } case Intrinsic::vp_abs: case Intrinsic::vp_smax: case Intrinsic::vp_smin: @@ -613,10 +624,10 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { } if (auto CID = VPI.getConstrainedIntrinsicID()) - if (Value *Call = expandPredicationToFPCall(Builder, VPI, *CID)) - return Call; + if (expandPredicationToFPCall(Builder, VPI, *CID)) + return true; - return &VPI; + return false; } //// } CachingVPExpander @@ -673,8 +684,7 @@ CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { Changed = VPExpansionDetails::IntrinsicUpdated; break; case VPLegalization::Convert: - if (auto [NewVPI, Folded] = foldEVLIntoMask(VPI); Folded) { - (void)NewVPI; + if (foldEVLIntoMask(VPI)) { Changed = VPExpansionDetails::IntrinsicUpdated; ++NumFoldedVL; } @@ -688,7 +698,7 @@ CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { case VPLegalization::Discard: 
llvm_unreachable("Invalid strategy for operators."); case VPLegalization::Convert: - if (Value *V = expandPredication(VPI); V != &VPI) { + if (expandPredication(VPI)) { ++NumLoweredVPOps; Changed = VPExpansionDetails::IntrinsicReplaced; } diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 90a18b86c1b1..b3c312569736 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -1256,7 +1256,7 @@ LLT CallLowering::ValueHandler::getStackValueStoreType( if (Flags.isPointer()) { LLT PtrTy = LLT::pointer(Flags.getPointerAddrSpace(), ValTy.getScalarSizeInBits()); - if (ValVT.isVector()) + if (ValVT.isVector() && ValVT.getVectorNumElements() != 1) return LLT::vector(ValTy.getElementCount(), PtrTy); return PtrTy; } diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 0674f5fd1ae0..0ebee2cfd868 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2094,6 +2094,68 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI, return true; } +bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI, + LshrOfTruncOfLshr &MatchInfo, + MachineInstr &ShiftMI) const { + assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR"); + + Register N0 = MI.getOperand(1).getReg(); + Register N1 = MI.getOperand(2).getReg(); + unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits(); + + APInt N1C, N001C; + if (!mi_match(N1, MRI, m_ICstOrSplat(N1C))) + return false; + auto N001 = ShiftMI.getOperand(2).getReg(); + if (!mi_match(N001, MRI, m_ICstOrSplat(N001C))) + return false; + + if (N001C.getBitWidth() > N1C.getBitWidth()) + N1C = N1C.zext(N001C.getBitWidth()); + else + N001C = N001C.zext(N1C.getBitWidth()); + + Register InnerShift = ShiftMI.getOperand(0).getReg(); + LLT InnerShiftTy = MRI.getType(InnerShift); + uint64_t InnerShiftSize = 
InnerShiftTy.getScalarSizeInBits(); + if ((N1C + N001C).ult(InnerShiftSize)) { + MatchInfo.Src = ShiftMI.getOperand(1).getReg(); + MatchInfo.ShiftAmt = N1C + N001C; + MatchInfo.ShiftAmtTy = MRI.getType(N001); + MatchInfo.InnerShiftTy = InnerShiftTy; + + if ((N001C + OpSizeInBits) == InnerShiftSize) + return true; + if (MRI.hasOneUse(N0) && MRI.hasOneUse(InnerShift)) { + MatchInfo.Mask = true; + MatchInfo.MaskVal = APInt(N1C.getBitWidth(), OpSizeInBits) - N1C; + return true; + } + } + return false; +} + +void CombinerHelper::applyLshrOfTruncOfLshr( + MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const { + assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR"); + + Register Dst = MI.getOperand(0).getReg(); + auto ShiftAmt = + Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt); + auto Shift = + Builder.buildLShr(MatchInfo.InnerShiftTy, MatchInfo.Src, ShiftAmt); + if (MatchInfo.Mask == true) { + APInt MaskVal = + APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(), + MatchInfo.MaskVal.getZExtValue()); + auto Mask = Builder.buildConstant(MatchInfo.InnerShiftTy, MaskVal); + auto And = Builder.buildAnd(MatchInfo.InnerShiftTy, Shift, Mask); + Builder.buildTrunc(Dst, And); + } else + Builder.buildTrunc(Dst, Shift); + MI.eraseFromParent(); +} + bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const { assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 008c18837a52..b02465d99a60 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2916,6 +2916,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_SREM: case TargetOpcode::G_SMIN: case TargetOpcode::G_SMAX: + case TargetOpcode::G_ABDS: Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, 
TargetOpcode::G_SEXT); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); @@ -2953,6 +2954,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; case TargetOpcode::G_UDIV: case TargetOpcode::G_UREM: + case TargetOpcode::G_ABDU: Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); @@ -4742,6 +4744,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerShlSat(MI); case G_ABS: return lowerAbsToAddXor(MI); + case G_ABDS: + case G_ABDU: { + bool IsSigned = MI.getOpcode() == G_ABDS; + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if ((IsSigned && LI.isLegal({G_SMIN, Ty}) && LI.isLegal({G_SMAX, Ty})) || + (!IsSigned && LI.isLegal({G_UMIN, Ty}) && LI.isLegal({G_UMAX, Ty}))) { + return lowerAbsDiffToMinMax(MI); + } + return lowerAbsDiffToSelect(MI); + } case G_FABS: return lowerFAbs(MI); case G_SELECT: @@ -4773,6 +4785,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerVectorReduction(MI); case G_VAARG: return lowerVAArg(MI); + case G_ATOMICRMW_SUB: { + auto [Ret, Mem, Val] = MI.getFirst3Regs(); + const LLT ValTy = MRI.getType(Val); + MachineMemOperand *MMO = *MI.memoperands_begin(); + + auto VNeg = MIRBuilder.buildNeg(ValTy, Val); + MIRBuilder.buildAtomicRMW(G_ATOMICRMW_ADD, Ret, Mem, VNeg, *MMO); + MI.eraseFromParent(); + return Legalized; + } } } @@ -5222,19 +5244,13 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, InsertVal = MI.getOperand(2).getReg(); Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg(); - - // TODO: Handle total scalarization case. - if (!NarrowVecTy.isVector()) - return UnableToLegalize; - LLT VecTy = MRI.getType(SrcVec); // If the index is a constant, we can really break this down as you would // expect, and index into the target size pieces. 
- int64_t IdxVal; auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI); if (MaybeCst) { - IdxVal = MaybeCst->Value.getSExtValue(); + uint64_t IdxVal = MaybeCst->Value.getZExtValue(); // Avoid out of bounds indexing the pieces. if (IdxVal >= VecTy.getNumElements()) { MIRBuilder.buildUndef(DstReg); @@ -5242,33 +5258,45 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, return Legalized; } - SmallVector<Register, 8> VecParts; - LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); + if (!NarrowVecTy.isVector()) { + SmallVector<Register, 8> SplitPieces; + extractParts(MI.getOperand(1).getReg(), NarrowVecTy, + VecTy.getNumElements(), SplitPieces, MIRBuilder, MRI); + if (IsInsert) { + SplitPieces[IdxVal] = InsertVal; + MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), SplitPieces); + } else { + MIRBuilder.buildCopy(MI.getOperand(0).getReg(), SplitPieces[IdxVal]); + } + } else { + SmallVector<Register, 8> VecParts; + LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); - // Build a sequence of NarrowTy pieces in VecParts for this operand. - LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, - TargetOpcode::G_ANYEXT); + // Build a sequence of NarrowTy pieces in VecParts for this operand. + LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, + TargetOpcode::G_ANYEXT); - unsigned NewNumElts = NarrowVecTy.getNumElements(); + unsigned NewNumElts = NarrowVecTy.getNumElements(); - LLT IdxTy = MRI.getType(Idx); - int64_t PartIdx = IdxVal / NewNumElts; - auto NewIdx = - MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); + LLT IdxTy = MRI.getType(Idx); + int64_t PartIdx = IdxVal / NewNumElts; + auto NewIdx = + MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); - if (IsInsert) { - LLT PartTy = MRI.getType(VecParts[PartIdx]); + if (IsInsert) { + LLT PartTy = MRI.getType(VecParts[PartIdx]); - // Use the adjusted index to insert into one of the subvectors. 
- auto InsertPart = MIRBuilder.buildInsertVectorElement( - PartTy, VecParts[PartIdx], InsertVal, NewIdx); - VecParts[PartIdx] = InsertPart.getReg(0); + // Use the adjusted index to insert into one of the subvectors. + auto InsertPart = MIRBuilder.buildInsertVectorElement( + PartTy, VecParts[PartIdx], InsertVal, NewIdx); + VecParts[PartIdx] = InsertPart.getReg(0); - // Recombine the inserted subvector with the others to reform the result - // vector. - buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); - } else { - MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + // Recombine the inserted subvector with the others to reform the result + // vector. + buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); + } else { + MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + } } MI.eraseFromParent(); @@ -5970,7 +5998,6 @@ LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt, return Legalized; } -// TODO: Optimize if constant shift amount. LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, LLT RequestedTy) { @@ -5992,6 +6019,27 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, if (DstEltSize % 2 != 0) return UnableToLegalize; + // Check if we should use multi-way splitting instead of recursive binary + // splitting. + // + // Multi-way splitting directly decomposes wide shifts (e.g., 128-bit -> + // 4×32-bit) in a single legalization step, avoiding the recursive overhead + // and dependency chains created by usual binary splitting approach + // (128->64->32). + // + // The >= 8 parts threshold ensures we only use this optimization when binary + // splitting would require multiple recursive passes, avoiding overhead for + // simple 2-way splits where binary approach is sufficient. 
+ if (RequestedTy.isValid() && RequestedTy.isScalar() && + DstEltSize % RequestedTy.getSizeInBits() == 0) { + const unsigned NumParts = DstEltSize / RequestedTy.getSizeInBits(); + // Use multiway if we have 8 or more parts (i.e., would need 3+ recursive + // steps). + if (NumParts >= 8) + return narrowScalarShiftMultiway(MI, RequestedTy); + } + + // Fall back to binary splitting: // Ignore the input type. We can only go to exactly half the size of the // input. If that isn't small enough, the resulting pieces will be further // legalized. @@ -6080,6 +6128,358 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, return Legalized; } +Register LegalizerHelper::buildConstantShiftPart(unsigned Opcode, + unsigned PartIdx, + unsigned NumParts, + ArrayRef<Register> SrcParts, + const ShiftParams &Params, + LLT TargetTy, LLT ShiftAmtTy) { + auto WordShiftConst = getIConstantVRegVal(Params.WordShift, MRI); + auto BitShiftConst = getIConstantVRegVal(Params.BitShift, MRI); + assert(WordShiftConst && BitShiftConst && "Expected constants"); + + const unsigned ShiftWords = WordShiftConst->getZExtValue(); + const unsigned ShiftBits = BitShiftConst->getZExtValue(); + const bool NeedsInterWordShift = ShiftBits != 0; + + switch (Opcode) { + case TargetOpcode::G_SHL: { + // Data moves from lower indices to higher indices + // If this part would come from a source beyond our range, it's zero + if (PartIdx < ShiftWords) + return Params.Zero; + + unsigned SrcIdx = PartIdx - ShiftWords; + if (!NeedsInterWordShift) + return SrcParts[SrcIdx]; + + // Combine shifted main part with carry from previous part + auto Hi = MIRBuilder.buildShl(TargetTy, SrcParts[SrcIdx], Params.BitShift); + if (SrcIdx > 0) { + auto Lo = MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx - 1], + Params.InvBitShift); + return MIRBuilder.buildOr(TargetTy, Hi, Lo).getReg(0); + } + return Hi.getReg(0); + } + + case TargetOpcode::G_LSHR: { + unsigned SrcIdx = PartIdx + ShiftWords; + if (SrcIdx >= 
NumParts) + return Params.Zero; + if (!NeedsInterWordShift) + return SrcParts[SrcIdx]; + + // Combine shifted main part with carry from next part + auto Lo = MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx], Params.BitShift); + if (SrcIdx + 1 < NumParts) { + auto Hi = MIRBuilder.buildShl(TargetTy, SrcParts[SrcIdx + 1], + Params.InvBitShift); + return MIRBuilder.buildOr(TargetTy, Lo, Hi).getReg(0); + } + return Lo.getReg(0); + } + + case TargetOpcode::G_ASHR: { + // Like LSHR but preserves sign bit + unsigned SrcIdx = PartIdx + ShiftWords; + if (SrcIdx >= NumParts) + return Params.SignBit; + if (!NeedsInterWordShift) + return SrcParts[SrcIdx]; + + // Only the original MSB part uses arithmetic shift to preserve sign. All + // other parts use logical shift since they're just moving data bits. + auto Lo = + (SrcIdx == NumParts - 1) + ? MIRBuilder.buildAShr(TargetTy, SrcParts[SrcIdx], Params.BitShift) + : MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx], Params.BitShift); + Register HiSrc = + (SrcIdx + 1 < NumParts) ? SrcParts[SrcIdx + 1] : Params.SignBit; + auto Hi = MIRBuilder.buildShl(TargetTy, HiSrc, Params.InvBitShift); + return MIRBuilder.buildOr(TargetTy, Lo, Hi).getReg(0); + } + + default: + llvm_unreachable("not a shift"); + } +} + +Register LegalizerHelper::buildVariableShiftPart(unsigned Opcode, + Register MainOperand, + Register ShiftAmt, + LLT TargetTy, + Register CarryOperand) { + // This helper generates a single output part for variable shifts by combining + // the main operand (shifted by BitShift) with carry bits from an adjacent + // part. + + // For G_ASHR, individual parts don't have their own sign bit, only the + // complete value does. So we use LSHR for the main operand shift in ASHR + // context. + unsigned MainOpcode = + (Opcode == TargetOpcode::G_ASHR) ? 
TargetOpcode::G_LSHR : Opcode; + + // Perform the primary shift on the main operand + Register MainShifted = + MIRBuilder.buildInstr(MainOpcode, {TargetTy}, {MainOperand, ShiftAmt}) + .getReg(0); + + // No carry operand available + if (!CarryOperand.isValid()) + return MainShifted; + + // If BitShift is 0 (word-aligned shift), no inter-word bit movement occurs, + // so carry bits aren't needed. + LLT ShiftAmtTy = MRI.getType(ShiftAmt); + auto ZeroConst = MIRBuilder.buildConstant(ShiftAmtTy, 0); + LLT BoolTy = LLT::scalar(1); + auto IsZeroBitShift = + MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, ShiftAmt, ZeroConst); + + // Extract bits from the adjacent part that will "carry over" into this part. + // The carry direction is opposite to the main shift direction, so we can + // align the two shifted values before combining them with OR. + + // Determine the carry shift opcode (opposite direction) + unsigned CarryOpcode = (Opcode == TargetOpcode::G_SHL) ? TargetOpcode::G_LSHR + : TargetOpcode::G_SHL; + + // Calculate inverse shift amount: BitWidth - ShiftAmt + auto TargetBitsConst = + MIRBuilder.buildConstant(ShiftAmtTy, TargetTy.getScalarSizeInBits()); + auto InvShiftAmt = MIRBuilder.buildSub(ShiftAmtTy, TargetBitsConst, ShiftAmt); + + // Shift the carry operand + Register CarryBits = + MIRBuilder + .buildInstr(CarryOpcode, {TargetTy}, {CarryOperand, InvShiftAmt}) + .getReg(0); + + // If BitShift is 0, don't include carry bits (InvShiftAmt would equal + // TargetBits which would be poison for the individual carry shift operation). 
+ auto ZeroReg = MIRBuilder.buildConstant(TargetTy, 0); + Register SafeCarryBits = + MIRBuilder.buildSelect(TargetTy, IsZeroBitShift, ZeroReg, CarryBits) + .getReg(0); + + // Combine the main shifted part with the carry bits + return MIRBuilder.buildOr(TargetTy, MainShifted, SafeCarryBits).getReg(0); +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::narrowScalarShiftByConstantMultiway(MachineInstr &MI, + const APInt &Amt, + LLT TargetTy, + LLT ShiftAmtTy) { + // Any wide shift can be decomposed into WordShift + BitShift components. + // When shift amount is known constant, directly compute the decomposition + // values and generate constant registers. + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + + const unsigned DstBits = DstTy.getScalarSizeInBits(); + const unsigned TargetBits = TargetTy.getScalarSizeInBits(); + const unsigned NumParts = DstBits / TargetBits; + + assert(DstBits % TargetBits == 0 && "Target type must evenly divide source"); + + // When the shift amount is known at compile time, we just calculate which + // source parts contribute to each output part. 
+ + SmallVector<Register, 8> SrcParts; + extractParts(SrcReg, TargetTy, NumParts, SrcParts, MIRBuilder, MRI); + + if (Amt.isZero()) { + // No shift needed, just copy + MIRBuilder.buildMergeLikeInstr(DstReg, SrcParts); + MI.eraseFromParent(); + return Legalized; + } + + ShiftParams Params; + const unsigned ShiftWords = Amt.getZExtValue() / TargetBits; + const unsigned ShiftBits = Amt.getZExtValue() % TargetBits; + + // Generate constants and values needed by all shift types + Params.WordShift = MIRBuilder.buildConstant(ShiftAmtTy, ShiftWords).getReg(0); + Params.BitShift = MIRBuilder.buildConstant(ShiftAmtTy, ShiftBits).getReg(0); + Params.InvBitShift = + MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - ShiftBits).getReg(0); + Params.Zero = MIRBuilder.buildConstant(TargetTy, 0).getReg(0); + + // For ASHR, we need the sign-extended value to fill shifted-out positions + if (MI.getOpcode() == TargetOpcode::G_ASHR) + Params.SignBit = + MIRBuilder + .buildAShr(TargetTy, SrcParts[SrcParts.size() - 1], + MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1)) + .getReg(0); + + SmallVector<Register, 8> DstParts(NumParts); + for (unsigned I = 0; I < NumParts; ++I) + DstParts[I] = buildConstantShiftPart(MI.getOpcode(), I, NumParts, SrcParts, + Params, TargetTy, ShiftAmtTy); + + MIRBuilder.buildMergeLikeInstr(DstReg, DstParts); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::narrowScalarShiftMultiway(MachineInstr &MI, LLT TargetTy) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register AmtReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT ShiftAmtTy = MRI.getType(AmtReg); + + const unsigned DstBits = DstTy.getScalarSizeInBits(); + const unsigned TargetBits = TargetTy.getScalarSizeInBits(); + const unsigned NumParts = DstBits / TargetBits; + + assert(DstBits % TargetBits == 0 && "Target type must evenly divide source"); + assert(isPowerOf2_32(TargetBits) 
&& "Target bit width must be power of 2"); + + // If the shift amount is known at compile time, we can use direct indexing + // instead of generating select chains in the general case. + if (auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI)) + return narrowScalarShiftByConstantMultiway(MI, VRegAndVal->Value, TargetTy, + ShiftAmtTy); + + // For runtime-variable shift amounts, we must generate a more complex + // sequence that handles all possible shift values using select chains. + + // Split the input into target-sized pieces + SmallVector<Register, 8> SrcParts; + extractParts(SrcReg, TargetTy, NumParts, SrcParts, MIRBuilder, MRI); + + // Shifting by zero should be a no-op. + auto ZeroAmtConst = MIRBuilder.buildConstant(ShiftAmtTy, 0); + LLT BoolTy = LLT::scalar(1); + auto IsZeroShift = + MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, AmtReg, ZeroAmtConst); + + // Any wide shift can be decomposed into two components: + // 1. WordShift: number of complete target-sized words to shift + // 2. 
BitShift: number of bits to shift within each word + // + // Example: 128-bit >> 50 with 32-bit target: + // WordShift = 50 / 32 = 1 (shift right by 1 complete word) + // BitShift = 50 % 32 = 18 (shift each word right by 18 bits) + unsigned TargetBitsLog2 = Log2_32(TargetBits); + auto TargetBitsLog2Const = + MIRBuilder.buildConstant(ShiftAmtTy, TargetBitsLog2); + auto TargetBitsMask = MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1); + + Register WordShift = + MIRBuilder.buildLShr(ShiftAmtTy, AmtReg, TargetBitsLog2Const).getReg(0); + Register BitShift = + MIRBuilder.buildAnd(ShiftAmtTy, AmtReg, TargetBitsMask).getReg(0); + + // Fill values: + // - SHL/LSHR: fill with zeros + // - ASHR: fill with sign-extended MSB + Register ZeroReg = MIRBuilder.buildConstant(TargetTy, 0).getReg(0); + + Register FillValue; + if (MI.getOpcode() == TargetOpcode::G_ASHR) { + auto TargetBitsMinusOneConst = + MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1); + FillValue = MIRBuilder + .buildAShr(TargetTy, SrcParts[NumParts - 1], + TargetBitsMinusOneConst) + .getReg(0); + } else { + FillValue = ZeroReg; + } + + SmallVector<Register, 8> DstParts(NumParts); + + // For each output part, generate a select chain that chooses the correct + // result based on the runtime WordShift value. This handles all possible + // word shift amounts by pre-calculating what each would produce. + for (unsigned I = 0; I < NumParts; ++I) { + // Initialize with appropriate default value for this shift type + Register InBoundsResult = FillValue; + + // clang-format off + // Build a branchless select chain by pre-computing results for all possible + // WordShift values (0 to NumParts-1). 
Each iteration nests a new select: + // + // K=0: select(WordShift==0, result0, FillValue) + // K=1: select(WordShift==1, result1, select(WordShift==0, result0, FillValue)) + // K=2: select(WordShift==2, result2, select(WordShift==1, result1, select(...))) + // clang-format on + for (unsigned K = 0; K < NumParts; ++K) { + auto WordShiftKConst = MIRBuilder.buildConstant(ShiftAmtTy, K); + auto IsWordShiftK = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, + WordShift, WordShiftKConst); + + // Calculate source indices for this word shift + // + // For 4-part 128-bit value with K=1 word shift: + // SHL: [3][2][1][0] << K => [2][1][0][Z] + // -> (MainIdx = I-K, CarryIdx = I-K-1) + // LSHR: [3][2][1][0] >> K => [Z][3][2][1] + // -> (MainIdx = I+K, CarryIdx = I+K+1) + int MainSrcIdx; + int CarrySrcIdx; // Index for the word that provides the carried-in bits. + + switch (MI.getOpcode()) { + case TargetOpcode::G_SHL: + MainSrcIdx = (int)I - (int)K; + CarrySrcIdx = MainSrcIdx - 1; + break; + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: + MainSrcIdx = (int)I + (int)K; + CarrySrcIdx = MainSrcIdx + 1; + break; + default: + llvm_unreachable("Not a shift"); + } + + // Check bounds and build the result for this word shift + Register ResultForK; + if (MainSrcIdx >= 0 && MainSrcIdx < (int)NumParts) { + Register MainOp = SrcParts[MainSrcIdx]; + Register CarryOp; + + // Determine carry operand with bounds checking + if (CarrySrcIdx >= 0 && CarrySrcIdx < (int)NumParts) + CarryOp = SrcParts[CarrySrcIdx]; + else if (MI.getOpcode() == TargetOpcode::G_ASHR && + CarrySrcIdx >= (int)NumParts) + CarryOp = FillValue; // Use sign extension + + ResultForK = buildVariableShiftPart(MI.getOpcode(), MainOp, BitShift, + TargetTy, CarryOp); + } else { + // Out of bounds - use fill value for this k + ResultForK = FillValue; + } + + // Select this result if WordShift equals k + InBoundsResult = + MIRBuilder + .buildSelect(TargetTy, IsWordShiftK, ResultForK, InBoundsResult) + .getReg(0); + } + 
+ // Handle zero-shift special case: if shift is 0, use original input + DstParts[I] = + MIRBuilder + .buildSelect(TargetTy, IsZeroShift, SrcParts[I], InBoundsResult) + .getReg(0); + } + + MIRBuilder.buildMergeLikeInstr(DstReg, DstParts); + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, LLT MoreTy) { @@ -9537,6 +9937,54 @@ LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) { return Legalized; } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerAbsDiffToSelect(MachineInstr &MI) { + assert((MI.getOpcode() == TargetOpcode::G_ABDS || + MI.getOpcode() == TargetOpcode::G_ABDU) && + "Expected G_ABDS or G_ABDU instruction"); + + auto [DstReg, LHS, RHS] = MI.getFirst3Regs(); + LLT Ty = MRI.getType(LHS); + + // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) + // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) + Register LHSSub = MIRBuilder.buildSub(Ty, LHS, RHS).getReg(0); + Register RHSSub = MIRBuilder.buildSub(Ty, RHS, LHS).getReg(0); + CmpInst::Predicate Pred = (MI.getOpcode() == TargetOpcode::G_ABDS) + ? 
CmpInst::ICMP_SGT + : CmpInst::ICMP_UGT; + auto ICmp = MIRBuilder.buildICmp(Pred, LLT::scalar(1), LHS, RHS); + MIRBuilder.buildSelect(DstReg, ICmp, LHSSub, RHSSub); + + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerAbsDiffToMinMax(MachineInstr &MI) { + assert((MI.getOpcode() == TargetOpcode::G_ABDS || + MI.getOpcode() == TargetOpcode::G_ABDU) && + "Expected G_ABDS or G_ABDU instruction"); + + auto [DstReg, LHS, RHS] = MI.getFirst3Regs(); + LLT Ty = MRI.getType(LHS); + + // abds(lhs, rhs) -> sub(smax(lhs, rhs), smin(lhs, rhs)) + // abdu(lhs, rhs) -> sub(umax(lhs, rhs), umin(lhs, rhs)) + Register MaxReg, MinReg; + if (MI.getOpcode() == TargetOpcode::G_ABDS) { + MaxReg = MIRBuilder.buildSMax(Ty, LHS, RHS).getReg(0); + MinReg = MIRBuilder.buildSMin(Ty, LHS, RHS).getReg(0); + } else { + MaxReg = MIRBuilder.buildUMax(Ty, LHS, RHS).getReg(0); + MinReg = MIRBuilder.buildUMin(Ty, LHS, RHS).getReg(0); + } + MIRBuilder.buildSub(DstReg, MaxReg, MinReg); + + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) { Register SrcReg = MI.getOperand(1).getReg(); Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index e41fd81953f4..58d631e569b3 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -466,8 +466,14 @@ llvm::getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI) { std::optional<DefinitionAndSourceRegister> llvm::getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI) { Register DefSrcReg = Reg; - auto *DefMI = MRI.getVRegDef(Reg); - auto DstTy = MRI.getType(DefMI->getOperand(0).getReg()); + // This assumes that the code is in SSA form, so there should only be one + // definition. 
+ auto DefIt = MRI.def_begin(Reg); + if (DefIt == MRI.def_end()) + return {}; + MachineOperand &DefOpnd = *DefIt; + MachineInstr *DefMI = DefOpnd.getParent(); + auto DstTy = MRI.getType(DefOpnd.getReg()); if (!DstTy.isValid()) return std::nullopt; unsigned Opc = DefMI->getOpcode(); diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 93f6e39b56ab..e3ded12a1847 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -537,28 +537,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); Value *Mask = nullptr; + auto GapMask = APInt::getAllOnes(Factor); if (SI) { LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); } else { // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - APInt GapMask(Factor, 0); std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; - // We haven't supported gap mask for stores. Yet it is possible that we - // already changed the IR, hence returning true here. - if (GapMask.popcount() != Factor) - return true; LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " << *Store << "\n"); + LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor + << " and actual factor " << GapMask.popcount() << "\n"); } // Try to create target specific intrinsics to replace the store and // shuffle. - if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor)) + if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor, GapMask)) return false; // Already have a new target specific interleaved store. Erase the old store. 
@@ -662,6 +660,10 @@ static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, } if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) { + Type *Op1Ty = SVI->getOperand(1)->getType(); + if (!isa<FixedVectorType>(Op1Ty)) + return {nullptr, GapMask}; + // Check that the shuffle mask is: a) an interleave, b) all of the same // set of the elements, and c) contained by the first source. (c) could // be relaxed if desired. diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index f12f437c493e..9d98e6c085fe 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -536,12 +536,6 @@ public: namespace llvm { -/// Implementation of the LiveDebugVariables pass. - -LiveDebugVariables::LiveDebugVariables() = default; -LiveDebugVariables::~LiveDebugVariables() = default; -LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default; - class LiveDebugVariables::LDVImpl { LocMap::Allocator allocator; MachineFunction *MF = nullptr; @@ -683,6 +677,12 @@ public: void print(raw_ostream&); }; +/// Implementation of the LiveDebugVariables pass. 
+ +LiveDebugVariables::LiveDebugVariables() = default; +LiveDebugVariables::~LiveDebugVariables() = default; +LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default; + } // namespace llvm static void printDebugLoc(const DebugLoc &DL, raw_ostream &CommentOS, diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp index 116a919585d7..17a7f48e3f2e 100644 --- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp @@ -21,6 +21,10 @@ using namespace llvm; AnalysisKey MachineFunctionAnalysis::Key; +llvm::MachineFunctionAnalysis::Result::Result( + std::unique_ptr<MachineFunction> MF) + : MF(std::move(MF)) {} + bool MachineFunctionAnalysis::Result::invalidate( Function &, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index d9e8484c08d7..da29ffc9d2fe 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -133,7 +133,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSetVector<Register, 32> LocalDefs; BitVector LocalDefsP(TRI->getNumRegUnits()); SmallSet<Register, 8> DeadDefSet; - SmallSet<Register, 16> KilledDefSet; SmallSetVector<Register, 8> ExternUses; SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; @@ -151,7 +150,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MO.setIsInternalRead(); if (MO.isKill()) { // Internal def is now killed. 
- KilledDefSet.insert(Reg); + DeadDefSet.insert(Reg); } } else { if (ExternUses.insert(Reg)) { @@ -171,21 +170,18 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, continue; if (LocalDefs.insert(Reg)) { - if (MO.isDead()) - DeadDefSet.insert(Reg); + if (!MO.isDead() && Reg.isPhysical()) { + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) + LocalDefsP.set(Unit); + } } else { - // Re-defined inside the bundle, it's no longer killed. - KilledDefSet.erase(Reg); if (!MO.isDead()) { - // Previously defined but dead. + // Re-defined inside the bundle, it's no longer dead. DeadDefSet.erase(Reg); } } - - if (!MO.isDead() && Reg.isPhysical()) { - for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) - LocalDefsP.set(Unit); - } + if (MO.isDead()) + DeadDefSet.insert(Reg); } // Set FrameSetup/FrameDestroy for the bundle. If any of the instructions @@ -198,7 +194,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, for (Register Reg : LocalDefs) { // If it's not live beyond end of the bundle, mark it dead. 
- bool isDead = DeadDefSet.contains(Reg) || KilledDefSet.contains(Reg); + bool isDead = DeadDefSet.contains(Reg); MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) | getImplRegState(true)); } diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index b0bce2c21a47..fdae3b470de0 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -59,8 +59,10 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CGData/CodeGenDataReader.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -107,6 +109,16 @@ STATISTIC(StableHashAttempts, STATISTIC(StableHashDropped, "Count of unsuccessful hashing attempts for outlined functions"); STATISTIC(NumRemovedLOHs, "Total number of Linker Optimization Hints removed"); +STATISTIC(NumPGOBlockedOutlined, + "Number of times outlining was blocked by PGO"); +STATISTIC(NumPGOAllowedCold, + "Number of times outlining was allowed from cold functions"); +STATISTIC(NumPGOConservativeBlockedOutlined, + "Number of times outlining was blocked conservatively when profile " + "counts were missing"); +STATISTIC(NumPGOOptimisticOutlined, + "Number of times outlining was allowed optimistically when profile " + "counts were missing"); // Set to true if the user wants the outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr @@ -438,11 +450,10 @@ struct MachineOutliner : public ModulePass { /// The current repeat number of machine outlining. unsigned OutlineRepeatedNum = 0; - /// Set to true if the outliner should run on all functions in the module - /// considered safe for outlining. 
- /// Set to true by default for compatibility with llc's -run-pass option. - /// Set when the pass is constructed in TargetPassConfig. - bool RunOnAllFunctions = true; + /// The mode for whether to run the outliner + /// Set to always-outline by default for compatibility with llc's -run-pass + /// option. + RunOutliner RunOutlinerMode = RunOutliner::AlwaysOutline; /// This is a compact representation of hash sequences of outlined functions. /// It is used when OutlinerMode = CGDataMode::Write. @@ -468,6 +479,11 @@ struct MachineOutliner : public ModulePass { AU.addRequired<TargetPassConfig>(); AU.addPreserved<MachineModuleInfoWrapperPass>(); AU.addUsedIfAvailable<ImmutableModuleSummaryIndexWrapperPass>(); + if (RunOutlinerMode == RunOutliner::OptimisticPGO || + RunOutlinerMode == RunOutliner::ConservativePGO) { + AU.addRequired<BlockFrequencyInfoWrapperPass>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + } AU.setPreservesAll(); ModulePass::getAnalysisUsage(AU); } @@ -578,9 +594,9 @@ struct MachineOutliner : public ModulePass { char MachineOutliner::ID = 0; namespace llvm { -ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions) { +ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) { MachineOutliner *OL = new MachineOutliner(); - OL->RunOnAllFunctions = RunOnAllFunctions; + OL->RunOutlinerMode = RunOutlinerMode; return OL; } @@ -1017,9 +1033,6 @@ MachineFunction *MachineOutliner::createOutlinedFunction( /* Outlined code is optimized code by definition. */ DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized); - // Don't add any new variables to the subprogram. - DB.finalizeSubprogram(OutlinedSP); - // Attach subprogram to the function. F->setSubprogram(OutlinedSP); // We're done with the DIBuilder. 
@@ -1201,10 +1214,49 @@ bool MachineOutliner::outline( return OutlinedSomething; } +static bool allowPGOOutlining(RunOutliner RunOutlinerMode, + const ProfileSummaryInfo *PSI, + const BlockFrequencyInfo *BFI, + MachineBasicBlock &MBB) { + if (RunOutlinerMode != RunOutliner::OptimisticPGO && + RunOutlinerMode != RunOutliner::ConservativePGO) + return true; + auto *MF = MBB.getParent(); + if (MF->getFunction().hasFnAttribute(Attribute::Cold)) { + ++NumPGOAllowedCold; + return true; + } + + auto *BB = MBB.getBasicBlock(); + if (BB && PSI && BFI) + if (auto Count = BFI->getBlockProfileCount(BB)) + return *Count <= PSI->getOrCompColdCountThreshold(); + + if (RunOutlinerMode == RunOutliner::OptimisticPGO) { + auto *TII = MF->getSubtarget().getInstrInfo(); + if (TII->shouldOutlineFromFunctionByDefault(*MF)) { + // Profile data is unavailable, but we optimistically allow outlining + ++NumPGOOptimisticOutlined; + return true; + } + return false; + } + assert(RunOutlinerMode == RunOutliner::ConservativePGO); + // Profile data is unavailable, so we conservatively block outlining + ++NumPGOConservativeBlockedOutlined; + return false; +} + void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) { // Build instruction mappings for each function in the module. Start by // iterating over each Function in M. 
LLVM_DEBUG(dbgs() << "*** Populating mapper ***\n"); + bool EnableProfileGuidedOutlining = + RunOutlinerMode == RunOutliner::OptimisticPGO || + RunOutlinerMode == RunOutliner::ConservativePGO; + ProfileSummaryInfo *PSI = nullptr; + if (EnableProfileGuidedOutlining) + PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); for (Function &F : M) { LLVM_DEBUG(dbgs() << "MAPPING FUNCTION: " << F.getName() << "\n"); @@ -1225,7 +1277,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) { } const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - if (!RunOnAllFunctions && !TII->shouldOutlineFromFunctionByDefault(*MF)) { + BlockFrequencyInfo *BFI = nullptr; + if (EnableProfileGuidedOutlining && F.hasProfileData()) + BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI(); + if (RunOutlinerMode == RunOutliner::TargetDefault && + !TII->shouldOutlineFromFunctionByDefault(*MF)) { LLVM_DEBUG(dbgs() << "SKIP: Target does not want to outline from " "function by default\n"); continue; @@ -1265,6 +1321,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) { continue; } + if (!allowPGOOutlining(RunOutlinerMode, PSI, BFI, MBB)) { + ++NumPGOBlockedOutlined; + continue; + } + // MBB is suitable for outlining. Map it to a list of unsigneds. Mapper.convertToUnsignedVec(MBB, *TII); } @@ -1437,10 +1498,22 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) { // the user how the outliner is running. 
LLVM_DEBUG({ dbgs() << "Machine Outliner: Running on "; - if (RunOnAllFunctions) + switch (RunOutlinerMode) { + case RunOutliner::AlwaysOutline: dbgs() << "all functions"; - else + break; + case RunOutliner::OptimisticPGO: + dbgs() << "optimistically cold functions"; + break; + case RunOutliner::ConservativePGO: + dbgs() << "conservatively cold functions"; + break; + case RunOutliner::TargetDefault: dbgs() << "target-default functions"; + break; + case RunOutliner::NeverOutline: + llvm_unreachable("should not outline"); + } dbgs() << "\n"; }); diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index b7135251781a..abb3f3e61200 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -432,6 +432,11 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const { return hasSingleElement(use_nodbg_instructions(RegNo)); } +MachineOperand *MachineRegisterInfo::getOneNonDBGUse(Register RegNo) const { + auto RegNoDbgUses = use_nodbg_operands(RegNo); + return hasSingleElement(RegNoDbgUses) ? &*RegNoDbgUses.begin() : nullptr; +} + MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const { auto RegNoDbgUsers = use_nodbg_instructions(RegNo); return hasSingleElement(RegNoDbgUsers) ? 
&*RegNoDbgUsers.begin() : nullptr; diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 96c9cde622b4..f54e2f264556 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -507,83 +507,86 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { }); break; case Intrinsic::objc_autorelease: - Changed |= lowerObjCCall(F, RTLIB::objc_autorelease); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_autorelease); break; case Intrinsic::objc_autoreleasePoolPop: - Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPop); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleasePoolPop); break; case Intrinsic::objc_autoreleasePoolPush: - Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPush); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleasePoolPush); break; case Intrinsic::objc_autoreleaseReturnValue: - Changed |= lowerObjCCall(F, RTLIB::objc_autoreleaseReturnValue); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleaseReturnValue); break; case Intrinsic::objc_copyWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_copyWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_copyWeak); break; case Intrinsic::objc_destroyWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_destroyWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_destroyWeak); break; case Intrinsic::objc_initWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_initWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_initWeak); break; case Intrinsic::objc_loadWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_loadWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_loadWeak); break; case Intrinsic::objc_loadWeakRetained: - Changed |= lowerObjCCall(F, RTLIB::objc_loadWeakRetained); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_loadWeakRetained); break; case Intrinsic::objc_moveWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_moveWeak); + Changed |= lowerObjCCall(F, 
RTLIB::impl_objc_moveWeak); break; case Intrinsic::objc_release: - Changed |= lowerObjCCall(F, RTLIB::objc_release, true); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_release, true); break; case Intrinsic::objc_retain: - Changed |= lowerObjCCall(F, RTLIB::objc_retain, true); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_retain, true); break; case Intrinsic::objc_retainAutorelease: - Changed |= lowerObjCCall(F, RTLIB::objc_retainAutorelease); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainAutorelease); break; case Intrinsic::objc_retainAutoreleaseReturnValue: - Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleaseReturnValue); + Changed |= + lowerObjCCall(F, RTLIB::impl_objc_retainAutoreleaseReturnValue); break; case Intrinsic::objc_retainAutoreleasedReturnValue: - Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleasedReturnValue); + Changed |= + lowerObjCCall(F, RTLIB::impl_objc_retainAutoreleasedReturnValue); break; case Intrinsic::objc_claimAutoreleasedReturnValue: - Changed |= lowerObjCCall(F, RTLIB::objc_claimAutoreleasedReturnValue); + Changed |= + lowerObjCCall(F, RTLIB::impl_objc_claimAutoreleasedReturnValue); break; case Intrinsic::objc_retainBlock: - Changed |= lowerObjCCall(F, RTLIB::objc_retainBlock); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainBlock); break; case Intrinsic::objc_storeStrong: - Changed |= lowerObjCCall(F, RTLIB::objc_storeStrong); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_storeStrong); break; case Intrinsic::objc_storeWeak: - Changed |= lowerObjCCall(F, RTLIB::objc_storeWeak); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_storeWeak); break; case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue: Changed |= - lowerObjCCall(F, RTLIB::objc_unsafeClaimAutoreleasedReturnValue); + lowerObjCCall(F, RTLIB::impl_objc_unsafeClaimAutoreleasedReturnValue); break; case Intrinsic::objc_retainedObject: - Changed |= lowerObjCCall(F, RTLIB::objc_retainedObject); + Changed |= lowerObjCCall(F, 
RTLIB::impl_objc_retainedObject); break; case Intrinsic::objc_unretainedObject: - Changed |= lowerObjCCall(F, RTLIB::objc_unretainedObject); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_unretainedObject); break; case Intrinsic::objc_unretainedPointer: - Changed |= lowerObjCCall(F, RTLIB::objc_unretainedPointer); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_unretainedPointer); break; case Intrinsic::objc_retain_autorelease: - Changed |= lowerObjCCall(F, RTLIB::objc_retain_autorelease); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_retain_autorelease); break; case Intrinsic::objc_sync_enter: - Changed |= lowerObjCCall(F, RTLIB::objc_sync_enter); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_sync_enter); break; case Intrinsic::objc_sync_exit: - Changed |= lowerObjCCall(F, RTLIB::objc_sync_exit); + Changed |= lowerObjCCall(F, RTLIB::impl_objc_sync_exit); break; case Intrinsic::exp: case Intrinsic::exp2: diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 415674231b5c..a589ef761dd7 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -275,7 +275,6 @@ void ReachingDefAnalysis::printAllReachingDefs(MachineFunction &MF) { bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) { MF = &mf; - TRI = MF->getSubtarget().getRegisterInfo(); const TargetSubtargetInfo &STI = MF->getSubtarget(); TRI = STI.getRegisterInfo(); TII = STI.getInstrInfo(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 27b5a0d37b67..d130efe96b56 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4710,7 +4710,10 @@ template <class MatchContextClass> SDValue DAGCombiner::visitMUL(SDNode *N) { if (SDValue LogBase2 = BuildLogBase2(N1, DL)) { EVT ShiftVT = getShiftAmountTy(N0.getValueType()); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); - return 
Matcher.getNode(ISD::SHL, DL, VT, N0, Trunc); + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap()); + // TODO: Preserve setNoSignedWrap if LogBase2 isn't BitWidth - 1. + return Matcher.getNode(ISD::SHL, DL, VT, N0, Trunc, Flags); } } @@ -9998,13 +10001,16 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } } - // fold (not (neg x)) -> (add X, -1) - // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if - // Y is a constant or the subtract has a single use. - if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB && - isNullConstant(N0.getOperand(0))) { - return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), - DAG.getAllOnesConstant(DL, VT)); + // fold (not (sub Y, X)) -> (add X, ~Y) if Y is a constant + if (N0.getOpcode() == ISD::SUB && isAllOnesConstant(N1)) { + SDValue Y = N0.getOperand(0); + SDValue X = N0.getOperand(1); + + if (auto *YConst = dyn_cast<ConstantSDNode>(Y)) { + APInt NotYValue = ~YConst->getAPIntValue(); + SDValue NotY = DAG.getConstant(NotYValue, DL, VT); + return DAG.getNode(ISD::ADD, DL, VT, X, NotY, N->getFlags()); + } } // fold (not (add X, -1)) -> (neg X) @@ -11089,38 +11095,43 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { } } - // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or - // (and (srl x, (sub c2, c1), MASK) - if (N0.getOpcode() == ISD::SHL && - (N0.getOperand(1) == N1 || N0->hasOneUse()) && - TLI.shouldFoldConstantShiftPairToMask(N, Level)) { - auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, - ConstantSDNode *RHS) { - const APInt &LHSC = LHS->getAPIntValue(); - const APInt &RHSC = RHS->getAPIntValue(); - return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && - LHSC.getZExtValue() <= RHSC.getZExtValue(); - }; - if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, - /*AllowUndefs*/ false, - /*AllowTypeMismatch*/ true)) { - SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); - SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, 
N01, N1); - SDValue Mask = DAG.getAllOnesConstant(DL, VT); - Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01); - Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff); - SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); - return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); - } - if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, - /*AllowUndefs*/ false, - /*AllowTypeMismatch*/ true)) { - SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); - SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); - SDValue Mask = DAG.getAllOnesConstant(DL, VT); - Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1); - SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff); - return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + if (N0.getOpcode() == ISD::SHL) { + // fold (srl (shl nuw x, c), c) -> x + if (N0.getOperand(1) == N1 && N0->getFlags().hasNoUnsignedWrap()) + return N0.getOperand(0); + + // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or + // (and (srl x, (sub c2, c1), MASK) + if ((N0.getOperand(1) == N1 || N0->hasOneUse()) && + TLI.shouldFoldConstantShiftPairToMask(N, Level)) { + auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + const APInt &LHSC = LHS->getAPIntValue(); + const APInt &RHSC = RHS->getAPIntValue(); + return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && + LHSC.getZExtValue() <= RHSC.getZExtValue(); + }; + if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01); + Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } + if 
(ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1); + SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } } } @@ -15137,7 +15148,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return foldedExt; } else if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && - TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { + TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; SmallVector<SDNode *, 4> SetCCs; if (!N0.hasOneUse()) @@ -16309,7 +16320,15 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) { SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0)); SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1)); - return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR); + SDNodeFlags Flags; + // Propagate nuw for sub. + if (N0->getOpcode() == ISD::SUB && N0->getFlags().hasNoUnsignedWrap() && + DAG.MaskedValueIsZero( + N0->getOperand(0), + APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(), + VT.getScalarSizeInBits()))) + Flags.setNoUnsignedWrap(true); + return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR, Flags); } } break; @@ -16788,6 +16807,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // If we have frozen and unfrozen users of N0, update so everything uses N. if (!N0.isUndef() && !N0.hasOneUse()) { SDValue FrozenN0(N, 0); + // Unfreeze all uses of N to avoid double deleting N from the CSE map. 
+ DAG.ReplaceAllUsesOfValueWith(FrozenN0, N0); DAG.ReplaceAllUsesOfValueWith(N0, FrozenN0); // ReplaceAllUsesOfValueWith will have also updated the use in N, thus // creating a cycle in a DAG. Let's undo that by mutating the freeze. @@ -19346,13 +19367,13 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { // MachineBasicBlock CFG, which is awkward. // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal - // on the target. + // on the target, also copy fast math flags. if (N1.getOpcode() == ISD::SETCC && TLI.isOperationLegalOrCustom(ISD::BR_CC, N1.getOperand(0).getValueType())) { - return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, - Chain, N1.getOperand(2), - N1.getOperand(0), N1.getOperand(1), N2); + return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, Chain, + N1.getOperand(2), N1.getOperand(0), N1.getOperand(1), N2, + N1->getFlags()); } if (N1.hasOneUse()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 1a63518ab37a..861f76e93f2c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -238,7 +238,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, // Create the result registers for this node and add the result regs to // the machine instruction. 
- if (VRBase == 0) { + if (!VRBase) { assert(RC && "Isn't a register operand!"); VRBase = MRI->createVirtualRegister(RC); MIB.addReg(VRBase, RegState::Define); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 90d62e6da8e9..9e85f08abb76 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -324,6 +324,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VP_REDUCE(N); break; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: + Res = PromoteIntRes_LOOP_DEPENDENCE_MASK(N); + break; + case ISD::FREEZE: Res = PromoteIntRes_FREEZE(N); break; @@ -374,6 +379,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N, return GetPromotedInteger(Op); } +SDValue DAGTypeLegalizer::PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + EVT VT = N->getValueType(0); + EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return DAG.getNode(N->getOpcode(), SDLoc(N), NewVT, N->ops()); +} + SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) { // Sign-extend the new bits, and continue the assertion. SDValue Op = SExtPromotedInteger(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 65fd863e55ac..586c3411791f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -382,6 +382,7 @@ private: SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); + SDValue PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -436,6 +437,7 @@ private: SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N); + SDValue PromoteIntOp_LOOP_DEPENDENCE_MASK(SDNode *N, unsigned OpNo); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -868,6 +870,7 @@ private: // Vector Result Scalarization: <1 x ty> -> ty. void ScalarizeVectorResult(SDNode *N, unsigned ResNo); SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo); + SDValue ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N); SDValue ScalarizeVecRes_BinOp(SDNode *N); SDValue ScalarizeVecRes_CMP(SDNode *N); SDValue ScalarizeVecRes_TernaryOp(SDNode *N); @@ -964,6 +967,7 @@ private: void SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -1070,6 +1074,7 @@ private: SDValue WidenVecRes_ADDRSPACECAST(SDNode *N); SDValue WidenVecRes_AssertZext(SDNode* N); SDValue WidenVecRes_BITCAST(SDNode* N); + SDValue WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N); SDValue WidenVecRes_BUILD_VECTOR(SDNode* N); SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N); SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 2ca98958fde0..8e423c4f83b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -138,6 +138,7 @@ class VectorLegalizer { SDValue ExpandVP_FNEG(SDNode 
*Node); SDValue ExpandVP_FABS(SDNode *Node); SDValue ExpandVP_FCOPYSIGN(SDNode *Node); + SDValue ExpandLOOP_DEPENDENCE_MASK(SDNode *N); SDValue ExpandSELECT(SDNode *Node); std::pair<SDValue, SDValue> ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); @@ -475,6 +476,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1291,6 +1294,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::UCMP: Results.push_back(TLI.expandCMP(Node, DAG)); return; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: + Results.push_back(ExpandLOOP_DEPENDENCE_MASK(Node)); + return; case ISD::FADD: case ISD::FMUL: @@ -1796,6 +1803,50 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); } +SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) { + SDLoc DL(N); + SDValue SourceValue = N->getOperand(0); + SDValue SinkValue = N->getOperand(1); + SDValue EltSize = N->getOperand(2); + + bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK; + EVT VT = N->getValueType(0); + EVT PtrVT = SourceValue->getValueType(0); + + SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue); + if (IsReadAfterWrite) + Diff = DAG.getNode(ISD::ABS, DL, PtrVT, Diff); + + Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSize); + + // If the difference is positive then some elements may alias + EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + Diff.getValueType()); + SDValue Zero = DAG.getTargetConstant(0, DL, PtrVT); + SDValue Cmp = DAG.getSetCC(DL, CmpVT, Diff, Zero, + IsReadAfterWrite ? 
ISD::SETEQ : ISD::SETLE); + + // Create the lane mask + EVT SplatVT = VT.changeElementType(PtrVT); + SDValue DiffSplat = DAG.getSplat(SplatVT, DL, Diff); + SDValue VectorStep = DAG.getStepVector(DL, SplatVT); + EVT MaskVT = VT.changeElementType(MVT::i1); + SDValue DiffMask = + DAG.getSetCC(DL, MaskVT, VectorStep, DiffSplat, ISD::CondCode::SETULT); + + EVT EltVT = VT.getVectorElementType(); + // Extend the diff setcc in case the intrinsic has been promoted to a vector + // type with elements larger than i1 + if (EltVT.getScalarSizeInBits() > MaskVT.getScalarSizeInBits()) + DiffMask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, DiffMask); + + // Splat the compare result then OR it with the lane mask + if (CmpVT.getScalarSizeInBits() < EltVT.getScalarSizeInBits()) + Cmp = DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Cmp); + SDValue Splat = DAG.getSplat(VT, DL, Cmp); + return DAG.getNode(ISD::OR, DL, VT, DiffMask, Splat); +} + void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node, SmallVectorImpl<SDValue> &Results) { // Attempt to expand using TargetLowering. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 10e3a5149a5d..118fd8418f78 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -53,6 +53,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { report_fatal_error("Do not know how to scalarize the result of this " "operator!\n"); + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: + R = ScalarizeVecRes_LOOP_DEPENDENCE_MASK(N); + break; case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; @@ -396,6 +400,22 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N, return GetScalarizedVector(Op); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + SDValue SourceValue = N->getOperand(0); + SDValue SinkValue = N->getOperand(1); + SDValue EltSize = N->getOperand(2); + EVT PtrVT = SourceValue->getValueType(0); + SDLoc DL(N); + + SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue); + EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + Diff.getValueType()); + SDValue Zero = DAG.getTargetConstant(0, DL, PtrVT); + return DAG.getNode(ISD::OR, DL, CmpVT, + DAG.getSetCC(DL, CmpVT, Diff, EltSize, ISD::SETGE), + DAG.getSetCC(DL, CmpVT, Diff, Zero, ISD::SETEQ)); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { SDValue Op = N->getOperand(0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypeScalarizeVector) @@ -1159,6 +1179,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { report_fatal_error("Do not know how to split the result of this " "operator!\n"); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + 
SplitVecRes_LOOP_DEPENDENCE_MASK(N, Lo, Hi); + break; case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::AssertZext: SplitVecRes_AssertZext(N, Lo, Hi); break; case ISD::VSELECT: @@ -1652,6 +1676,25 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); } +void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + SDValue PtrA = N->getOperand(0); + SDValue PtrB = N->getOperand(1); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, N->getOperand(2)); + + unsigned EltSize = N->getConstantOperandVal(2); + unsigned Offset = EltSize * HiVT.getVectorMinNumElements(); + SDValue Addend = HiVT.isScalableVT() + ? DAG.getVScale(DL, MVT::i64, APInt(64, Offset)) + : DAG.getConstant(Offset, DL, MVT::i64); + + PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2)); +} + void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; @@ -2517,10 +2560,10 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, else std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, dl); + MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - N->getPointerInfo(), MachineMemOperand::MOLoad, - LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(), - N->getRanges()); + N->getPointerInfo(), MMOFlags, LocationSize::beforeOrAfterPointer(), + Alignment, N->getAAInfo(), N->getRanges()); if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) { SDValue PassThru = MGT->getPassThru(); @@ -4321,10 +4364,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) { std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, DL); SDValue Lo; 
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - N->getPointerInfo(), MachineMemOperand::MOStore, - LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(), - N->getRanges()); + N->getPointerInfo(), MMOFlags, LocationSize::beforeOrAfterPointer(), + Alignment, N->getAAInfo(), N->getRanges()); if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) { SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale}; @@ -4784,6 +4827,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { #endif report_fatal_error("Do not know how to widen the result of this operator!"); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + Res = WidenVecRes_LOOP_DEPENDENCE_MASK(N); + break; case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break; case ISD::ADDRSPACECAST: Res = WidenVecRes_ADDRSPACECAST(N); @@ -5986,6 +6033,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { return CreateStackStoreLoad(InOp, WidenVT); } +SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + return DAG.getNode( + N->getOpcode(), SDLoc(N), + TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), + N->getOperand(0), N->getOperand(1), N->getOperand(2)); +} + SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SDLoc dl(N); // Build a vector with undefined for the new nodes. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 0a449fd011e6..72ea0898f975 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -63,6 +63,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) HorizontalVerticalBalance = 0; } +ResourcePriorityQueue::~ResourcePriorityQueue() = default; + unsigned ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) { unsigned NumberDeps = 0; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 3672a91e33a3..bcf25958d098 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3299,7 +3299,7 @@ SelectionDAG::getValidShiftAmountRange(SDValue V, const APInt &DemandedElts, return std::nullopt; } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth) const { assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL || @@ -3312,7 +3312,7 @@ SelectionDAG::getValidShiftAmount(SDValue V, const APInt &DemandedElts, return std::nullopt; } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidShiftAmount(SDValue V, unsigned Depth) const { EVT VT = V.getValueType(); APInt DemandedElts = VT.isFixedLengthVector() @@ -3321,7 +3321,7 @@ SelectionDAG::getValidShiftAmount(SDValue V, unsigned Depth) const { return getValidShiftAmount(V, DemandedElts, Depth); } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth) const { assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL || @@ -3333,7 +3333,7 @@ SelectionDAG::getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, return std::nullopt; } -std::optional<uint64_t> 
+std::optional<unsigned> SelectionDAG::getValidMinimumShiftAmount(SDValue V, unsigned Depth) const { EVT VT = V.getValueType(); APInt DemandedElts = VT.isFixedLengthVector() @@ -3342,7 +3342,7 @@ SelectionDAG::getValidMinimumShiftAmount(SDValue V, unsigned Depth) const { return getValidMinimumShiftAmount(V, DemandedElts, Depth); } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidMaximumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth) const { assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL || @@ -3354,7 +3354,7 @@ SelectionDAG::getValidMaximumShiftAmount(SDValue V, const APInt &DemandedElts, return std::nullopt; } -std::optional<uint64_t> +std::optional<unsigned> SelectionDAG::getValidMaximumShiftAmount(SDValue V, unsigned Depth) const { EVT VT = V.getValueType(); APInt DemandedElts = VT.isFixedLengthVector() @@ -3828,7 +3828,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = KnownBits::shl(Known, Known2, NUW, NSW, ShAmtNonZero); // Minimum shift low bits are known zero. - if (std::optional<uint64_t> ShMinAmt = + if (std::optional<unsigned> ShMinAmt = getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1)) Known.Zero.setLowBits(*ShMinAmt); break; @@ -3840,7 +3840,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Op->getFlags().hasExact()); // Minimum shift high bits are known zero. 
- if (std::optional<uint64_t> ShMinAmt = + if (std::optional<unsigned> ShMinAmt = getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1)) Known.Zero.setHighBits(*ShMinAmt); break; @@ -3850,6 +3850,22 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = KnownBits::ashr(Known, Known2, /*ShAmtNonZero=*/false, Op->getFlags().hasExact()); break; + case ISD::ROTL: + case ISD::ROTR: + if (ConstantSDNode *C = + isConstOrConstSplat(Op.getOperand(1), DemandedElts)) { + unsigned Amt = C->getAPIntValue().urem(BitWidth); + + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + // Canonicalize to ROTR. + if (Opcode == ISD::ROTL && Amt != 0) + Amt = BitWidth - Amt; + + Known.Zero = Known.Zero.rotr(Amt); + Known.One = Known.One.rotr(Amt); + } + break; case ISD::FSHL: case ISD::FSHR: if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(2), DemandedElts)) { @@ -3868,15 +3884,11 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); if (Opcode == ISD::FSHL) { - Known.One <<= Amt; - Known.Zero <<= Amt; - Known2.One.lshrInPlace(BitWidth - Amt); - Known2.Zero.lshrInPlace(BitWidth - Amt); + Known <<= Amt; + Known2 >>= BitWidth - Amt; } else { - Known.One <<= BitWidth - Amt; - Known.Zero <<= BitWidth - Amt; - Known2.One.lshrInPlace(Amt); - Known2.Zero.lshrInPlace(Amt); + Known <<= BitWidth - Amt; + Known2 >>= Amt; } Known = Known.unionWith(Known2); } @@ -4875,15 +4887,15 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, case ISD::SRA: Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); // SRA X, C -> adds C sign bits. 
- if (std::optional<uint64_t> ShAmt = + if (std::optional<unsigned> ShAmt = getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1)) - Tmp = std::min<uint64_t>(Tmp + *ShAmt, VTBits); + Tmp = std::min(Tmp + *ShAmt, VTBits); return Tmp; case ISD::SHL: if (std::optional<ConstantRange> ShAmtRange = getValidShiftAmountRange(Op, DemandedElts, Depth + 1)) { - uint64_t MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue(); - uint64_t MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue(); + unsigned MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue(); + unsigned MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue(); // Try to look through ZERO/SIGN/ANY_EXTEND. If all extended bits are // shifted out, then we can compute the number of sign bits for the // operand being extended. A future improvement could be to pass along the @@ -4894,7 +4906,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, EVT ExtVT = Ext.getValueType(); SDValue Extendee = Ext.getOperand(0); EVT ExtendeeVT = Extendee.getValueType(); - uint64_t SizeDifference = + unsigned SizeDifference = ExtVT.getScalarSizeInBits() - ExtendeeVT.getScalarSizeInBits(); if (SizeDifference <= MinShAmt) { Tmp = SizeDifference + @@ -5127,7 +5139,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, // If the sign portion ends in our element the subtraction gives correct // result. 
Otherwise it gives either negative or > bitwidth result - return std::clamp(KnownSign - rIndex * BitWidth, 0, BitWidth); + return std::clamp(KnownSign - rIndex * BitWidth, 1, BitWidth); } case ISD::INSERT_VECTOR_ELT: { if (VT.isScalableVector()) @@ -5660,6 +5672,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::USUBSAT: case ISD::MULHU: case ISD::MULHS: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: + case ISD::AVGCEILS: + case ISD::AVGCEILU: case ISD::ABDU: case ISD::ABDS: case ISD::SMIN: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 62ba801f6992..430e47451fd4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7974,12 +7974,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } case Intrinsic::amdgcn_call_whole_wave: { TargetLowering::ArgListTy Args; + bool isTailCall = I.isTailCall(); // The first argument is the callee. Skip it when assembling the call args. for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) { TargetLowering::ArgListEntry Arg(getValue(I.getArgOperand(Idx)), I.getArgOperand(Idx)->getType()); Arg.setAttributes(&I, Idx); + + // If we have an explicit sret argument that is an Instruction, (i.e., it + // might point to function-local memory), we can't meaningfully tail-call. 
+ if (Arg.IsSRet && isa<Instruction>(I.getArgOperand(Idx))) + isTailCall = false; + Args.push_back(Arg); } @@ -7994,7 +8001,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, .setChain(getRoot()) .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(), getValue(I.getArgOperand(0)), std::move(Args)) - .setTailCall(false) + .setTailCall(isTailCall && canTailCall(I)) .setIsPreallocated( I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0) .setConvergent(I.isConvergent()) @@ -8295,6 +8302,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitVectorExtractLastActive(I, Intrinsic); return; } + case Intrinsic::loop_dependence_war_mask: + setValue(&I, + DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, sdl, + EVT::getEVT(I.getType()), getValue(I.getOperand(0)), + getValue(I.getOperand(1)), getValue(I.getOperand(2)))); + return; + case Intrinsic::loop_dependence_raw_mask: + setValue(&I, + DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, sdl, + EVT::getEVT(I.getType()), getValue(I.getOperand(0)), + getValue(I.getOperand(1)), getValue(I.getOperand(2)))); + return; } } @@ -8456,8 +8475,11 @@ void SelectionDAGBuilder::visitVPLoad( MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, + MachinePointerInfo(PtrOperand), MMOFlags, LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2], MMO, false /*IsExpanding */); @@ -8508,9 +8530,11 @@ void SelectionDAGBuilder::visitVPGather( Alignment = DAG.getEVTAlign(VT.getScalarType()); unsigned AS = PtrOperand->getType()->getScalarType()->getPointerAddressSpace(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(AS), MachineMemOperand::MOLoad, - LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); + MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(), + *Alignment, AAInfo, Ranges); SDValue Base, Index, Scale; bool UniformBase = getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(), @@ -8546,8 +8570,11 @@ void SelectionDAGBuilder::visitVPStore( Alignment = DAG.getEVTAlign(VT); SDValue Ptr = OpValues[1]; SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, + MachinePointerInfo(PtrOperand), MMOFlags, LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo); ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset, OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED, @@ -8569,9 +8596,11 @@ void SelectionDAGBuilder::visitVPScatter( Alignment = 
DAG.getEVTAlign(VT.getScalarType()); unsigned AS = PtrOperand->getType()->getScalarType()->getPointerAddressSpace(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(AS), MachineMemOperand::MOStore, - LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo); + MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(), + *Alignment, AAInfo); SDValue Base, Index, Scale; bool UniformBase = getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(), @@ -8609,9 +8638,12 @@ void SelectionDAGBuilder::visitVPStridedLoad( bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); unsigned AS = PtrOperand->getType()->getPointerAddressSpace(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(AS), MachineMemOperand::MOLoad, - LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); + MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(), + *Alignment, AAInfo, Ranges); SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2], OpValues[3], MMO, @@ -8632,9 +8664,12 @@ void SelectionDAGBuilder::visitVPStridedStore( Alignment = DAG.getEVTAlign(VT.getScalarType()); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); unsigned AS = PtrOperand->getType()->getPointerAddressSpace(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MachineMemOperand::Flags MMOFlags = + TLI.getVPIntrinsicMemOperandFlags(VPIntrin); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(AS), MachineMemOperand::MOStore, - LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo); + 
MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(), + *Alignment, AAInfo); SDValue ST = DAG.getStridedStoreVP( getMemoryRoot(), DL, OpValues[0], OpValues[1], @@ -8901,6 +8936,29 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, return Result; } +bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { + bool isMustTailCall = CB.isMustTailCall(); + + // Avoid emitting tail calls in functions with the disable-tail-calls + // attribute. + const Function *Caller = CB.getParent()->getParent(); + if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == + "true" && + !isMustTailCall) + return false; + + // We can't tail call inside a function with a swifterror argument. Lowering + // does not support this yet. It would have to move into the swifterror + // register before the call. + if (DAG.getTargetLoweringInfo().supportSwiftError() && + Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return false; + + // Check if target-independent constraints permit a tail call here. + // Target-dependent constraints are checked within TLI->LowerCallTo. + return isInTailCallPosition(CB, DAG.getTarget()); +} + void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, bool isTailCall, bool isMustTailCall, const BasicBlock *EHPadBB, @@ -8915,21 +8973,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, const Value *SwiftErrorVal = nullptr; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (isTailCall) { - // Avoid emitting tail calls in functions with the disable-tail-calls - // attribute. - auto *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && !isMustTailCall) - isTailCall = false; - - // We can't tail call inside a function with a swifterror argument. Lowering - // does not support this yet. It would have to move into the swifterror - // register before the call. 
- if (TLI.supportSwiftError() && - Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) - isTailCall = false; - } + if (isTailCall) + isTailCall = canTailCall(CB); for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) { const Value *V = *I; @@ -8969,11 +9014,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, Args.push_back(Entry); } - // Check if target-independent constraints permit a tail call here. - // Target-dependent constraints are checked within TLI->LowerCallTo. - if (isTailCall && !isInTailCallPosition(CB, DAG.getTarget())) - isTailCall = false; - // Disable tail calls if there is an swifterror argument. Targets have not // been updated to support tail calls. if (TLI.supportSwiftError() && SwiftErrorVal) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index e0835e631035..c7577fa335fe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -408,6 +408,10 @@ public: bool IsMustTailCall, const BasicBlock *EHPadBB = nullptr, const TargetLowering::PtrAuthInfo *PAI = nullptr); + // Check some of the target-independent constraints for tail calls. This does + // not iterate over the call arguments. + bool canTailCall(const CallBase &CB) const; + // Lower range metadata from 0 to N to assert zext to an integer of nearest // floor power of two. 
SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 900da7645504..4b2a00c2e2cf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -587,6 +587,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + return "loop_dep_war"; + case ISD::LOOP_DEPENDENCE_RAW_MASK: + return "loop_dep_raw"; // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index ece50ed95fc4..e61558c59bf0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1729,10 +1729,18 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Setup an EH landing-pad block. FuncInfo->ExceptionPointerVirtReg = Register(); FuncInfo->ExceptionSelectorVirtReg = Register(); - if (LLVMBB->isEHPad()) + if (LLVMBB->isEHPad()) { if (!PrepareEHLandingPad()) continue; + if (!FastIS) { + SDValue NewRoot = TLI->lowerEHPadEntry(CurDAG->getRoot(), + SDB->getCurSDLoc(), *CurDAG); + if (NewRoot && NewRoot != CurDAG->getRoot()) + CurDAG->setRoot(NewRoot); + } + } + // Before doing SelectionDAG ISel, see if FastISel has been requested. 
if (FastIS) { if (LLVMBB != &Fn.getEntryBlock()) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 402a012e8e55..fd6d20e146bb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -832,7 +832,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( case ISD::SHL: { // If we are only demanding sign bits then we can use the shift source // directly. - if (std::optional<uint64_t> MaxSA = + if (std::optional<unsigned> MaxSA = DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { SDValue Op0 = Op.getOperand(0); unsigned ShAmt = *MaxSA; @@ -847,7 +847,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( case ISD::SRL: { // If we are only demanding sign bits then we can use the shift source // directly. - if (std::optional<uint64_t> MaxSA = + if (std::optional<unsigned> MaxSA = DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { SDValue Op0 = Op.getOperand(0); unsigned ShAmt = *MaxSA; @@ -1780,7 +1780,7 @@ bool TargetLowering::SimplifyDemandedBits( SDValue Op1 = Op.getOperand(1); EVT ShiftVT = Op1.getValueType(); - if (std::optional<uint64_t> KnownSA = + if (std::optional<unsigned> KnownSA = TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *KnownSA; if (ShAmt == 0) @@ -1792,7 +1792,7 @@ bool TargetLowering::SimplifyDemandedBits( // TODO - support non-uniform vector amounts. if (Op0.getOpcode() == ISD::SRL) { if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) { - if (std::optional<uint64_t> InnerSA = + if (std::optional<unsigned> InnerSA = TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { unsigned C1 = *InnerSA; unsigned Opc = ISD::SHL; @@ -1832,7 +1832,7 @@ bool TargetLowering::SimplifyDemandedBits( // TODO - support non-uniform vector amounts. 
if (InnerOp.getOpcode() == ISD::SRL && Op0.hasOneUse() && InnerOp.hasOneUse()) { - if (std::optional<uint64_t> SA2 = TLO.DAG.getValidShiftAmount( + if (std::optional<unsigned> SA2 = TLO.DAG.getValidShiftAmount( InnerOp, DemandedElts, Depth + 2)) { unsigned InnerShAmt = *SA2; if (InnerShAmt < ShAmt && InnerShAmt < InnerBits && @@ -1858,8 +1858,7 @@ bool TargetLowering::SimplifyDemandedBits( Op->dropFlags(SDNodeFlags::NoWrap); return true; } - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + Known <<= ShAmt; // low bits known zero. Known.Zero.setLowBits(ShAmt); @@ -1950,7 +1949,7 @@ bool TargetLowering::SimplifyDemandedBits( // If we are only demanding sign bits then we can use the shift source // directly. - if (std::optional<uint64_t> MaxSA = + if (std::optional<unsigned> MaxSA = TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *MaxSA; unsigned NumSignBits = @@ -1966,7 +1965,7 @@ bool TargetLowering::SimplifyDemandedBits( SDValue Op1 = Op.getOperand(1); EVT ShiftVT = Op1.getValueType(); - if (std::optional<uint64_t> KnownSA = + if (std::optional<unsigned> KnownSA = TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *KnownSA; if (ShAmt == 0) @@ -1978,7 +1977,7 @@ bool TargetLowering::SimplifyDemandedBits( // TODO - support non-uniform vector amounts. if (Op0.getOpcode() == ISD::SHL) { if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) { - if (std::optional<uint64_t> InnerSA = + if (std::optional<unsigned> InnerSA = TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { unsigned C1 = *InnerSA; unsigned Opc = ISD::SRL; @@ -1998,7 +1997,7 @@ bool TargetLowering::SimplifyDemandedBits( // single sra. We can do this if the top bits are never demanded. 
if (Op0.getOpcode() == ISD::SRA && Op0.hasOneUse()) { if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) { - if (std::optional<uint64_t> InnerSA = + if (std::optional<unsigned> InnerSA = TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { unsigned C1 = *InnerSA; // Clamp the combined shift amount if it exceeds the bit width. @@ -2042,8 +2041,7 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // High bits known zero. Known.Zero.setHighBits(ShAmt); @@ -2064,7 +2062,7 @@ bool TargetLowering::SimplifyDemandedBits( // If we are only demanding sign bits then we can use the shift source // directly. - if (std::optional<uint64_t> MaxSA = + if (std::optional<unsigned> MaxSA = TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *MaxSA; // Must already be signbits in DemandedBits bounds, and can't demand any @@ -2103,7 +2101,7 @@ bool TargetLowering::SimplifyDemandedBits( if (DemandedBits.isOne()) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1)); - if (std::optional<uint64_t> KnownSA = + if (std::optional<unsigned> KnownSA = TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) { unsigned ShAmt = *KnownSA; if (ShAmt == 0) @@ -2112,7 +2110,7 @@ bool TargetLowering::SimplifyDemandedBits( // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target // supports sext_inreg. 
if (Op0.getOpcode() == ISD::SHL) { - if (std::optional<uint64_t> InnerSA = + if (std::optional<unsigned> InnerSA = TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { unsigned LowBits = BitWidth - ShAmt; EVT ExtVT = EVT::getIntegerVT(*TLO.DAG.getContext(), LowBits); @@ -2153,8 +2151,7 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. @@ -2225,10 +2222,8 @@ bool TargetLowering::SimplifyDemandedBits( Depth + 1)) return true; - Known2.One <<= (IsFSHL ? Amt : (BitWidth - Amt)); - Known2.Zero <<= (IsFSHL ? Amt : (BitWidth - Amt)); - Known.One.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt); - Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt); + Known2 <<= (IsFSHL ? Amt : (BitWidth - Amt)); + Known >>= (IsFSHL ? (BitWidth - Amt) : Amt); Known = Known.unionWith(Known2); // Attempt to avoid multi-use ops if we don't need anything from them. 
@@ -2363,8 +2358,7 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO, Depth + 1)) return true; - Known.One = Known2.One.reverseBits(); - Known.Zero = Known2.Zero.reverseBits(); + Known = Known2.reverseBits(); break; } case ISD::BSWAP: { @@ -2397,8 +2391,7 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO, Depth + 1)) return true; - Known.One = Known2.One.byteSwap(); - Known.Zero = Known2.Zero.byteSwap(); + Known = Known2.byteSwap(); break; } case ISD::CTPOP: { @@ -2664,11 +2657,11 @@ bool TargetLowering::SimplifyDemandedBits( break; } - std::optional<uint64_t> ShAmtC = + std::optional<unsigned> ShAmtC = TLO.DAG.getValidShiftAmount(Src, DemandedElts, Depth + 2); if (!ShAmtC || *ShAmtC >= BitWidth) break; - uint64_t ShVal = *ShAmtC; + unsigned ShVal = *ShAmtC; APInt HighBits = APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth); @@ -3234,27 +3227,6 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownUndef.setAllBits(); return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); } - SDValue ScalarSrc = Op.getOperand(0); - if (ScalarSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - SDValue Src = ScalarSrc.getOperand(0); - SDValue Idx = ScalarSrc.getOperand(1); - EVT SrcVT = Src.getValueType(); - - ElementCount SrcEltCnt = SrcVT.getVectorElementCount(); - - if (SrcEltCnt.isScalable()) - return false; - - unsigned NumSrcElts = SrcEltCnt.getFixedValue(); - if (isNullConstant(Idx)) { - APInt SrcDemandedElts = APInt::getOneBitSet(NumSrcElts, 0); - APInt SrcUndef = KnownUndef.zextOrTrunc(NumSrcElts); - APInt SrcZero = KnownZero.zextOrTrunc(NumSrcElts); - if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero, - TLO, Depth + 1)) - return true; - } - } KnownUndef.setHighBits(NumElts - 1); break; } @@ -9740,8 +9712,8 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, SDValue 
TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const { SDLoc dl(N); EVT VT = N->getValueType(0); - SDValue LHS = DAG.getFreeze(N->getOperand(0)); - SDValue RHS = DAG.getFreeze(N->getOperand(1)); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); bool IsSigned = N->getOpcode() == ISD::ABDS; // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs)) @@ -9749,34 +9721,37 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const { unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX; unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN; if (isOperationLegal(MaxOpc, VT) && isOperationLegal(MinOpc, VT)) { + LHS = DAG.getFreeze(LHS); + RHS = DAG.getFreeze(RHS); SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS); SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS); return DAG.getNode(ISD::SUB, dl, VT, Max, Min); } // abdu(lhs, rhs) -> or(usubsat(lhs,rhs), usubsat(rhs,lhs)) - if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT)) + if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT)) { + LHS = DAG.getFreeze(LHS); + RHS = DAG.getFreeze(RHS); return DAG.getNode(ISD::OR, dl, VT, DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS), DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS)); + } // If the subtract doesn't overflow then just use abs(sub()) - // NOTE: don't use frozen operands for value tracking. 
- bool IsNonNegative = DAG.SignBitIsZero(N->getOperand(1)) && - DAG.SignBitIsZero(N->getOperand(0)); + bool IsNonNegative = DAG.SignBitIsZero(LHS) && DAG.SignBitIsZero(RHS); - if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, N->getOperand(0), - N->getOperand(1))) + if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, LHS, RHS)) return DAG.getNode(ISD::ABS, dl, VT, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS)); - if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, N->getOperand(1), - N->getOperand(0))) + if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, RHS, LHS)) return DAG.getNode(ISD::ABS, dl, VT, DAG.getNode(ISD::SUB, dl, VT, RHS, LHS)); EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT; + LHS = DAG.getFreeze(LHS); + RHS = DAG.getFreeze(RHS); SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC); // Branchless expansion iff cmp result is allbits: diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 9e49dddd46ba..0d7b128fc736 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -996,7 +996,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, (MI->getOpcode() != CombineOpc && CombineOpc != 0)) return false; // Must only used by the user we combine with. 
- if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) + if (!MRI.hasOneNonDBGUse(MO.getReg())) return false; return true; @@ -1456,11 +1456,13 @@ void TargetInstrInfo::reassociateOps( MIB1->clearFlag(MachineInstr::MIFlag::NoSWrap); MIB1->clearFlag(MachineInstr::MIFlag::NoUWrap); MIB1->clearFlag(MachineInstr::MIFlag::IsExact); + MIB1->clearFlag(MachineInstr::MIFlag::Disjoint); MIB2->setFlags(IntersectedFlags); MIB2->clearFlag(MachineInstr::MIFlag::NoSWrap); MIB2->clearFlag(MachineInstr::MIFlag::NoUWrap); MIB2->clearFlag(MachineInstr::MIFlag::IsExact); + MIB2->clearFlag(MachineInstr::MIFlag::Disjoint); setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9ffced80b07f..c23281a820b2 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -612,23 +612,23 @@ RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) { ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate( RTLIB::LibcallImpl Impl) const { switch (Impl) { - case RTLIB::__aeabi_dcmpeq__une: - case RTLIB::__aeabi_fcmpeq__une: + case RTLIB::impl___aeabi_dcmpeq__une: + case RTLIB::impl___aeabi_fcmpeq__une: // Usage in the eq case, so we have to invert the comparison. return ISD::SETEQ; - case RTLIB::__aeabi_dcmpeq__oeq: - case RTLIB::__aeabi_fcmpeq__oeq: + case RTLIB::impl___aeabi_dcmpeq__oeq: + case RTLIB::impl___aeabi_fcmpeq__oeq: // Normal comparison to boolean value. 
return ISD::SETNE; - case RTLIB::__aeabi_dcmplt: - case RTLIB::__aeabi_dcmple: - case RTLIB::__aeabi_dcmpge: - case RTLIB::__aeabi_dcmpgt: - case RTLIB::__aeabi_dcmpun: - case RTLIB::__aeabi_fcmplt: - case RTLIB::__aeabi_fcmple: - case RTLIB::__aeabi_fcmpge: - case RTLIB::__aeabi_fcmpgt: + case RTLIB::impl___aeabi_dcmplt: + case RTLIB::impl___aeabi_dcmple: + case RTLIB::impl___aeabi_dcmpge: + case RTLIB::impl___aeabi_dcmpgt: + case RTLIB::impl___aeabi_dcmpun: + case RTLIB::impl___aeabi_fcmplt: + case RTLIB::impl___aeabi_fcmple: + case RTLIB::impl___aeabi_fcmpge: + case RTLIB::impl___aeabi_fcmpgt: /// The AEABI versions return a typical boolean value, so we can compare /// against the integer result as simply != 0. return ISD::SETNE; @@ -900,6 +900,9 @@ void TargetLoweringBase::initActions() { // Masked vector extracts default to expand. setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Expand); + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Expand); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Expand); + // FP environment operations default to expand. setOperationAction(ISD::GET_FPENV, VT, Expand); setOperationAction(ISD::SET_FPENV, VT, Expand); @@ -2406,6 +2409,34 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI, return Flags; } +MachineMemOperand::Flags TargetLoweringBase::getVPIntrinsicMemOperandFlags( + const VPIntrinsic &VPIntrin) const { + MachineMemOperand::Flags Flags = MachineMemOperand::MONone; + Intrinsic::ID IntrinID = VPIntrin.getIntrinsicID(); + + switch (IntrinID) { + default: + llvm_unreachable("unexpected intrinsic. 
Existing code may be appropriate " + "for it, but support must be explicitly enabled"); + case Intrinsic::vp_load: + case Intrinsic::vp_gather: + case Intrinsic::experimental_vp_strided_load: + Flags = MachineMemOperand::MOLoad; + break; + case Intrinsic::vp_store: + case Intrinsic::vp_scatter: + case Intrinsic::experimental_vp_strided_store: + Flags = MachineMemOperand::MOStore; + break; + } + + if (VPIntrin.hasMetadata(LLVMContext::MD_nontemporal)) + Flags |= MachineMemOperand::MONonTemporal; + + Flags |= getTargetMMOFlags(VPIntrin); + return Flags; +} + Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const { diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index d19ef923ef74..ae681b9aebdf 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -247,6 +247,8 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx, break; case Triple::riscv32: case Triple::riscv64: + case Triple::riscv32be: + case Triple::riscv64be: LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; @@ -1918,6 +1920,13 @@ void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer, } emitCGProfileMetadata(Streamer, M); + emitPseudoProbeDescMetadata(Streamer, M, [](MCStreamer &Streamer) { + if (MCSymbol *Sym = + static_cast<MCSectionCOFF *>(Streamer.getCurrentSectionOnly()) + ->getCOMDATSymbol()) + if (Sym->isUndefined()) + Streamer.emitLabel(Sym); + }); } void TargetLoweringObjectFileCOFF::emitLinkerDirectives( diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 7d7c6e743fa7..b6169e6c4dc3 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -134,12 +134,18 @@ static 
cl::opt<cl::boolOrDefault> DebugifyCheckAndStripAll( static cl::opt<RunOutliner> EnableMachineOutliner( "enable-machine-outliner", cl::desc("Enable the machine outliner"), cl::Hidden, cl::ValueOptional, cl::init(RunOutliner::TargetDefault), - cl::values(clEnumValN(RunOutliner::AlwaysOutline, "always", - "Run on all functions guaranteed to be beneficial"), - clEnumValN(RunOutliner::NeverOutline, "never", - "Disable all outlining"), - // Sentinel value for unspecified option. - clEnumValN(RunOutliner::AlwaysOutline, "", ""))); + cl::values( + clEnumValN(RunOutliner::AlwaysOutline, "always", + "Run on all functions guaranteed to be beneficial"), + clEnumValN(RunOutliner::OptimisticPGO, "optimistic-pgo", + "Outline cold code only. If a code block does not have " + "profile data, optimistically assume it is cold."), + clEnumValN(RunOutliner::ConservativePGO, "conservative-pgo", + "Outline cold code only. If a code block does not have " + "profile, data, conservatively assume it is hot."), + clEnumValN(RunOutliner::NeverOutline, "never", "Disable all outlining"), + // Sentinel value for unspecified option. 
+ clEnumValN(RunOutliner::AlwaysOutline, "", ""))); static cl::opt<bool> EnableGlobalMergeFunc( "enable-global-merge-func", cl::Hidden, cl::desc("Enable global merge functions that are based on hash function")); @@ -1074,7 +1080,7 @@ bool TargetPassConfig::addISelPasses() { PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); addPass(createPreISelIntrinsicLoweringPass()); addPass(createExpandLargeDivRemPass()); - addPass(createExpandFpPass()); + addPass(createExpandFpPass(getOptLevel())); addIRPasses(); addCodeGenPrepare(); addPassesToHandleExceptions(); @@ -1224,12 +1230,9 @@ void TargetPassConfig::addMachinePasses() { if (TM->Options.EnableMachineOutliner && getOptLevel() != CodeGenOptLevel::None && EnableMachineOutliner != RunOutliner::NeverOutline) { - bool RunOnAllFunctions = - (EnableMachineOutliner == RunOutliner::AlwaysOutline); - bool AddOutliner = - RunOnAllFunctions || TM->Options.SupportsDefaultOutlining; - if (AddOutliner) - addPass(createMachineOutlinerPass(RunOnAllFunctions)); + if (EnableMachineOutliner != RunOutliner::TargetDefault || + TM->Options.SupportsDefaultOutlining) + addPass(createMachineOutlinerPass(EnableMachineOutliner)); } if (GCEmptyBlocks) diff --git a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp index 4e4d86e5cb8d..1c0ddc8e1ca3 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp @@ -55,7 +55,7 @@ Error DwarfStreamer::init(Triple TheTriple, TripleName = TheTriple.getTriple(); // Create all the MC Objects. 
- MRI.reset(TheTarget->createMCRegInfo(TripleName)); + MRI.reset(TheTarget->createMCRegInfo(TheTriple)); if (!MRI) return createStringError(std::errc::invalid_argument, "no register info for target %s", @@ -64,12 +64,12 @@ Error DwarfStreamer::init(Triple TheTriple, MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags(); MCOptions.AsmVerbose = true; MCOptions.MCUseDwarfDirectory = MCTargetOptions::EnableDwarfDirectory; - MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + MAI.reset(TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions)); if (!MAI) return createStringError(std::errc::invalid_argument, "no asm info for target %s", TripleName.c_str()); - MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", "")); + MSTI.reset(TheTarget->createMCSubtargetInfo(TheTriple, "", "")); if (!MSTI) return createStringError(std::errc::invalid_argument, "no subtarget info for target %s", diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp index 379f60b0bfb9..9222235d7a41 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp @@ -35,7 +35,7 @@ Error DwarfEmitterImpl::init(Triple TheTriple, TripleName = TheTriple.getTriple(); // Create all the MC Objects. 
- MRI.reset(TheTarget->createMCRegInfo(TripleName)); + MRI.reset(TheTarget->createMCRegInfo(TheTriple)); if (!MRI) return createStringError(std::errc::invalid_argument, "no register info for target %s", @@ -44,12 +44,12 @@ Error DwarfEmitterImpl::init(Triple TheTriple, MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags(); MCOptions.AsmVerbose = true; MCOptions.MCUseDwarfDirectory = MCTargetOptions::EnableDwarfDirectory; - MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + MAI.reset(TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions)); if (!MAI) return createStringError(std::errc::invalid_argument, "no asm info for target %s", TripleName.c_str()); - MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", "")); + MSTI.reset(TheTarget->createMCSubtargetInfo(TheTriple, "", "")); if (!MSTI) return createStringError(std::errc::invalid_argument, "no subtarget info for target %s", diff --git a/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h b/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h index b035c4b1d6c3..03c0566f58f8 100644 --- a/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h +++ b/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h @@ -73,19 +73,19 @@ private: TripleName = TheTriple.getTriple(); // Create all the MC Objects. 
- MRI.reset(TheTarget->createMCRegInfo(TripleName)); + MRI.reset(TheTarget->createMCRegInfo(TheTriple)); if (!MRI) return createStringError(std::errc::invalid_argument, "no register info for target %s", TripleName.c_str()); MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags(); - MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + MAI.reset(TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions)); if (!MAI) return createStringError(std::errc::invalid_argument, "no asm info for target %s", TripleName.c_str()); - MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", "")); + MSTI.reset(TheTarget->createMCSubtargetInfo(TheTriple, "", "")); if (!MSTI) return createStringError(std::errc::invalid_argument, "no subtarget info for target %s", diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp index ebcd4dda5048..078ebf4e7c03 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp @@ -48,13 +48,52 @@ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) { if (Op->isError()) { - OS << "<decoding error>"; + if (!DumpOpts.PrintRegisterOnly) + OS << "<decoding error>"; return false; } - StringRef Name = OperationEncodingString(Op->getCode()); - assert(!Name.empty() && "DW_OP has no name!"); - OS << Name; + // In "register-only" mode, still show simple constant-valued locations. + // This lets clients print annotations like "i = 0" when the location is + // a constant (e.g. DW_OP_constu/consts ... DW_OP_stack_value). + // We continue to suppress all other non-register ops in this mode. + if (DumpOpts.PrintRegisterOnly) { + // First, try pretty-printing registers (existing behavior below also does + // this, but we need to short-circuit here to avoid printing opcode names). 
+ if ((Op->getCode() >= DW_OP_breg0 && Op->getCode() <= DW_OP_breg31) || + (Op->getCode() >= DW_OP_reg0 && Op->getCode() <= DW_OP_reg31) || + Op->getCode() == DW_OP_bregx || Op->getCode() == DW_OP_regx || + Op->getCode() == DW_OP_regval_type) { + if (prettyPrintRegisterOp(U, OS, DumpOpts, Op->getCode(), + Op->getRawOperands())) + return true; + // If we couldn't pretty-print, fall through and suppress. + } + + // Show constants (decimal), suppress everything else. + if (Op->getCode() == DW_OP_constu) { + OS << (uint64_t)Op->getRawOperand(0); + return true; + } + if (Op->getCode() == DW_OP_consts) { + OS << (int64_t)Op->getRawOperand(0); + return true; + } + if (Op->getCode() >= DW_OP_lit0 && Op->getCode() <= DW_OP_lit31) { + OS << (unsigned)(Op->getCode() - DW_OP_lit0); + return true; + } + if (Op->getCode() == DW_OP_stack_value) + return true; // metadata; don't print a token + + return true; // suppress other opcodes silently in register-only mode + } + + if (!DumpOpts.PrintRegisterOnly) { + StringRef Name = OperationEncodingString(Op->getCode()); + assert(!Name.empty() && "DW_OP has no name!"); + OS << Name; + } if ((Op->getCode() >= DW_OP_breg0 && Op->getCode() <= DW_OP_breg31) || (Op->getCode() >= DW_OP_reg0 && Op->getCode() <= DW_OP_reg31) || @@ -64,48 +103,51 @@ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS, Op->getRawOperands())) return true; - for (unsigned Operand = 0; Operand < Op->getDescription().Op.size(); - ++Operand) { - unsigned Size = Op->getDescription().Op[Operand]; - unsigned Signed = Size & DWARFExpression::Operation::SignBit; - - if (Size == DWARFExpression::Operation::SizeSubOpLEB) { - StringRef SubName = - SubOperationEncodingString(Op->getCode(), Op->getRawOperand(Operand)); - assert(!SubName.empty() && "DW_OP SubOp has no name!"); - OS << " " << SubName; - } else if (Size == DWARFExpression::Operation::BaseTypeRef && U) { - // For DW_OP_convert the operand may be 0 to indicate that conversion to - // the 
generic type should be done. The same holds for DW_OP_reinterpret, - // which is currently not supported. - if (Op->getCode() == DW_OP_convert && Op->getRawOperand(Operand) == 0) - OS << " 0x0"; - else - prettyPrintBaseTypeRef(U, OS, DumpOpts, Op->getRawOperands(), Operand); - } else if (Size == DWARFExpression::Operation::WasmLocationArg) { - assert(Operand == 1); - switch (Op->getRawOperand(0)) { - case 0: - case 1: - case 2: - case 3: // global as uint32 - case 4: - OS << format(" 0x%" PRIx64, Op->getRawOperand(Operand)); - break; - default: - assert(false); + if (!DumpOpts.PrintRegisterOnly) { + for (unsigned Operand = 0; Operand < Op->getDescription().Op.size(); + ++Operand) { + unsigned Size = Op->getDescription().Op[Operand]; + unsigned Signed = Size & DWARFExpression::Operation::SignBit; + + if (Size == DWARFExpression::Operation::SizeSubOpLEB) { + StringRef SubName = SubOperationEncodingString( + Op->getCode(), Op->getRawOperand(Operand)); + assert(!SubName.empty() && "DW_OP SubOp has no name!"); + OS << " " << SubName; + } else if (Size == DWARFExpression::Operation::BaseTypeRef && U) { + // For DW_OP_convert the operand may be 0 to indicate that conversion to + // the generic type should be done. The same holds for + // DW_OP_reinterpret, which is currently not supported. 
+ if (Op->getCode() == DW_OP_convert && Op->getRawOperand(Operand) == 0) + OS << " 0x0"; + else + prettyPrintBaseTypeRef(U, OS, DumpOpts, Op->getRawOperands(), + Operand); + } else if (Size == DWARFExpression::Operation::WasmLocationArg) { + assert(Operand == 1); + switch (Op->getRawOperand(0)) { + case 0: + case 1: + case 2: + case 3: // global as uint32 + case 4: + OS << format(" 0x%" PRIx64, Op->getRawOperand(Operand)); + break; + default: + assert(false); + } + } else if (Size == DWARFExpression::Operation::SizeBlock) { + uint64_t Offset = Op->getRawOperand(Operand); + for (unsigned i = 0; i < Op->getRawOperand(Operand - 1); ++i) + OS << format(" 0x%02x", + static_cast<uint8_t>(Expr->getData()[Offset++])); + } else { + if (Signed) + OS << format(" %+" PRId64, (int64_t)Op->getRawOperand(Operand)); + else if (Op->getCode() != DW_OP_entry_value && + Op->getCode() != DW_OP_GNU_entry_value) + OS << format(" 0x%" PRIx64, Op->getRawOperand(Operand)); } - } else if (Size == DWARFExpression::Operation::SizeBlock) { - uint64_t Offset = Op->getRawOperand(Operand); - for (unsigned i = 0; i < Op->getRawOperand(Operand - 1); ++i) - OS << format(" 0x%02x", - static_cast<uint8_t>(Expr->getData()[Offset++])); - } else { - if (Signed) - OS << format(" %+" PRId64, (int64_t)Op->getRawOperand(Operand)); - else if (Op->getCode() != DW_OP_entry_value && - Op->getCode() != DW_OP_GNU_entry_value) - OS << format(" 0x%" PRIx64, Op->getRawOperand(Operand)); } } return true; @@ -120,29 +162,30 @@ void printDwarfExpression(const DWARFExpression *E, raw_ostream &OS, for (auto &Op : *E) { DumpOpts.IsEH = IsEH; - if (!printOp(&Op, OS, DumpOpts, E, U)) { + if (!printOp(&Op, OS, DumpOpts, E, U) && !DumpOpts.PrintRegisterOnly) { uint64_t FailOffset = Op.getEndOffset(); while (FailOffset < E->getData().size()) OS << format(" %02x", static_cast<uint8_t>(E->getData()[FailOffset++])); return; } + if (!DumpOpts.PrintRegisterOnly) { + if (Op.getCode() == DW_OP_entry_value || + Op.getCode() == 
DW_OP_GNU_entry_value) { + OS << "("; + EntryValExprSize = Op.getRawOperand(0); + EntryValStartOffset = Op.getEndOffset(); + continue; + } - if (Op.getCode() == DW_OP_entry_value || - Op.getCode() == DW_OP_GNU_entry_value) { - OS << "("; - EntryValExprSize = Op.getRawOperand(0); - EntryValStartOffset = Op.getEndOffset(); - continue; - } + if (EntryValExprSize) { + EntryValExprSize -= Op.getEndOffset() - EntryValStartOffset; + if (EntryValExprSize == 0) + OS << ")"; + } - if (EntryValExprSize) { - EntryValExprSize -= Op.getEndOffset() - EntryValStartOffset; - if (EntryValExprSize == 0) - OS << ")"; + if (Op.getEndOffset() < E->getData().size()) + OS << ", "; } - - if (Op.getEndOffset() < E->getData().size()) - OS << ", "; } } diff --git a/llvm/lib/DebugInfo/GSYM/GsymContext.cpp b/llvm/lib/DebugInfo/GSYM/GsymContext.cpp index 18be6d098546..62b4caa327d8 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymContext.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymContext.cpp @@ -14,6 +14,7 @@ using namespace llvm; using namespace llvm::gsym; +GsymContext::~GsymContext() = default; GsymContext::GsymContext(std::unique_ptr<GsymReader> Reader) : DIContext(CK_GSYM), Reader(std::move(Reader)) {} diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp index 0df9137a3bd3..0d0383158dd4 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp @@ -274,9 +274,10 @@ void LVBinaryReader::mapVirtualAddress(const object::COFFObjectFile &COFFObj) { }); } -Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, +Error LVBinaryReader::loadGenericTargetInfo(StringRef TripleName, StringRef TheFeatures, StringRef TheCPU) { + Triple TheTriple(TripleName); std::string TargetLookupError; const Target *TheTarget = TargetRegistry::lookupTarget(TheTriple, TargetLookupError); @@ -287,7 +288,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, 
MCRegisterInfo *RegisterInfo = TheTarget->createMCRegInfo(TheTriple); if (!RegisterInfo) return createStringError(errc::invalid_argument, - "no register info for target " + TheTriple); + "no register info for target " + TripleName); MRI.reset(RegisterInfo); // Assembler properties and features. @@ -295,7 +296,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, MCAsmInfo *AsmInfo(TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions)); if (!AsmInfo) return createStringError(errc::invalid_argument, - "no assembly info for target " + TheTriple); + "no assembly info for target " + TripleName); MAI.reset(AsmInfo); // Target subtargets. @@ -303,14 +304,14 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, TheTarget->createMCSubtargetInfo(TheTriple, TheCPU, TheFeatures)); if (!SubtargetInfo) return createStringError(errc::invalid_argument, - "no subtarget info for target " + TheTriple); + "no subtarget info for target " + TripleName); STI.reset(SubtargetInfo); // Instructions Info. 
MCInstrInfo *InstructionInfo(TheTarget->createMCInstrInfo()); if (!InstructionInfo) return createStringError(errc::invalid_argument, - "no instruction info for target " + TheTriple); + "no instruction info for target " + TripleName); MII.reset(InstructionInfo); MC = std::make_unique<MCContext>(Triple(TheTriple), MAI.get(), MRI.get(), @@ -320,7 +321,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, MCDisassembler *DisAsm(TheTarget->createMCDisassembler(*STI, *MC)); if (!DisAsm) return createStringError(errc::invalid_argument, - "no disassembler for target " + TheTriple); + "no disassembler for target " + TripleName); MD.reset(DisAsm); MCInstPrinter *InstructionPrinter(TheTarget->createMCInstPrinter( @@ -328,7 +329,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple, if (!InstructionPrinter) return createStringError(errc::invalid_argument, "no target assembly language printer for target " + - TheTriple); + TripleName); MIP.reset(InstructionPrinter); InstructionPrinter->setPrintImmHex(true); diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp index 62134dfdadf4..3ba506171814 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp @@ -274,7 +274,7 @@ void LVDWARFReader::processOneAttribute(const DWARFDie &Die, for (DWARFAddressRange &Range : Ranges) { // This seems to be a tombstone for empty ranges. if ((Range.LowPC == Range.HighPC) || - (Range.LowPC = getTombstoneAddress())) + (Range.LowPC == getTombstoneAddress())) continue; // Store the real upper limit for the address range. 
if (UpdateHighAddress && Range.HighPC > 0) @@ -461,13 +461,17 @@ LVScope *LVDWARFReader::processOneDie(const DWARFDie &InputDIE, LVScope *Parent, if (!CurrentRanges.empty()) { for (LVAddressRange &Range : CurrentRanges) addSectionRange(SectionIndex, CurrentScope, Range.first, - Range.second); + Range.second > Range.first + ? Range.second - 1 // Make hi-pc exclusive + : Range.second); CurrentRanges.clear(); } // If the scope is the CU, do not update the ranges set. if (FoundLowPC && FoundHighPC && !IsCompileUnit) { addSectionRange(SectionIndex, CurrentScope, CurrentLowPC, - CurrentHighPC); + CurrentHighPC > CurrentLowPC + ? CurrentHighPC - 1 // Make hi-pc exclusive + : CurrentHighPC); } } } diff --git a/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp b/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp index 328d0f5ab060..49be0edc33a1 100644 --- a/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp @@ -586,3 +586,8 @@ bool llvm::pdb::shouldDumpSymbolGroup(uint32_t Idx, const SymbolGroup &Group, // Otherwise, only dump if this is the same module specified. 
return (Filters.DumpModi == Idx); } +llvm::pdb::InputFile::InputFile(PDBFile *Pdb) { PdbOrObj = Pdb; } + +llvm::pdb::InputFile::InputFile(object::COFFObjectFile *Obj) { PdbOrObj = Obj; } + +llvm::pdb::InputFile::InputFile(MemoryBuffer *Buffer) { PdbOrObj = Buffer; } diff --git a/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp b/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp index c350e0e0b3e1..0453eea26605 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp @@ -22,9 +22,12 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/PublicsStream.h" +#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" +#include "llvm/DebugInfo/PDB/Native/SymbolStream.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/Error.h" #include <cstdint> @@ -96,3 +99,50 @@ Error PublicsStream::reload() { "Corrupted publics stream."); return Error::success(); } + +// This is a reimplementation of NearestSym: +// https://github.com/microsoft/microsoft-pdb/blob/805655a28bd8198004be2ac27e6e0290121a5e89/PDB/dbi/gsi.cpp#L1492-L1581 +std::optional<std::pair<codeview::PublicSym32, size_t>> +PublicsStream::findByAddress(const SymbolStream &Symbols, uint16_t Segment, + uint32_t Offset) const { + // The address map is sorted by address, so we can use lower_bound to find the + // position. Each element is an offset into the symbols for a public symbol. 
+ auto It = llvm::lower_bound( + AddressMap, std::tuple(Segment, Offset), + [&](support::ulittle32_t Cur, auto Addr) { + auto Sym = Symbols.readRecord(Cur.value()); + if (Sym.kind() != codeview::S_PUB32) + return false; // stop here, this is most likely corrupted debug info + + auto Psym = + codeview::SymbolDeserializer::deserializeAs<codeview::PublicSym32>( + Sym); + if (!Psym) { + consumeError(Psym.takeError()); + return false; + } + + return std::tie(Psym->Segment, Psym->Offset) < Addr; + }); + + if (It == AddressMap.end()) + return std::nullopt; + + auto Sym = Symbols.readRecord(It->value()); + if (Sym.kind() != codeview::S_PUB32) + return std::nullopt; // this is most likely corrupted debug info + + auto MaybePsym = + codeview::SymbolDeserializer::deserializeAs<codeview::PublicSym32>(Sym); + if (!MaybePsym) { + consumeError(MaybePsym.takeError()); + return std::nullopt; + } + codeview::PublicSym32 Psym = std::move(*MaybePsym); + + if (std::tuple(Segment, Offset) != std::tuple(Psym.Segment, Psym.Offset)) + return std::nullopt; + + std::ptrdiff_t IterOffset = It - AddressMap.begin(); + return std::pair{Psym, static_cast<size_t>(IterOffset)}; +} diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp index eca2a09c1f77..7c8ef18f126d 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp @@ -10,6 +10,12 @@ #include "llvm/DebugInfo/PDB/PDBSymDumper.h" +namespace llvm { +namespace pdb { +PDBSymbolTypeBuiltin::~PDBSymbolTypeBuiltin() = default; +} // namespace pdb +} // namespace llvm + using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp index 71a0f14368ac..0dc97104610b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp @@ -154,18 +154,22 @@ public: std::unique_ptr<LinkGraph> G, 
PassConfiguration PassConfig) : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) { JITLinkerBase::getPassConfig().PostAllocationPasses.push_back( - [this](LinkGraph &G) { return gatherRISCVPCRelHi20(G); }); + [this](LinkGraph &G) { return gatherRISCVPairs(G); }); } private: DenseMap<std::pair<const Block *, orc::ExecutorAddrDiff>, const Edge *> RelHi20; + DenseMap<std::pair<const Block *, orc::ExecutorAddrDiff>, const Edge *> + SetULEB128; - Error gatherRISCVPCRelHi20(LinkGraph &G) { + Error gatherRISCVPairs(LinkGraph &G) { for (Block *B : G.blocks()) for (Edge &E : B->edges()) if (E.getKind() == R_RISCV_PCREL_HI20) RelHi20[{B, E.getOffset()}] = &E; + else if (E.getKind() == R_RISCV_SET_ULEB128) + SetULEB128[{B, E.getOffset()}] = &E; return Error::success(); } @@ -189,6 +193,20 @@ private: "for LO12 PCREL relocation type"); } + Expected<const Edge &> getRISCVSetULEB128(const Block &B, + const Edge &E) const { + using namespace riscv; + assert(E.getKind() == R_RISCV_SUB_ULEB128 && + "Can only have pair relocation for R_RISCV_SUB_ULEB128"); + + auto It = SetULEB128.find({&B, E.getOffset()}); + if (It != SetULEB128.end()) + return *It->second; + + return make_error<JITLinkError>( + "No RISCV_SET_ULEB128 relocation type be found"); + } + Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { using namespace riscv; using namespace llvm::support; @@ -467,6 +485,21 @@ private: *(little32_t *)FixupPtr = static_cast<uint32_t>(Value); break; } + case R_RISCV_SET_ULEB128: + break; + case R_RISCV_SUB_ULEB128: { + auto SetULEB128 = getRISCVSetULEB128(B, E); + if (!SetULEB128) + return SetULEB128.takeError(); + uint64_t Value = SetULEB128->getTarget().getAddress() + + SetULEB128->getAddend() - E.getTarget().getAddress() - + E.getAddend(); + if (overwriteULEB128(reinterpret_cast<uint8_t *>(FixupPtr), Value) >= + 0x80) + return make_error<StringError>("ULEB128 value exceeds available space", + inconvertibleErrorCode()); + break; + } } return 
Error::success(); } @@ -843,6 +876,10 @@ private: return EdgeKind_riscv::R_RISCV_32_PCREL; case ELF::R_RISCV_ALIGN: return EdgeKind_riscv::AlignRelaxable; + case ELF::R_RISCV_SET_ULEB128: + return EdgeKind_riscv::R_RISCV_SET_ULEB128; + case ELF::R_RISCV_SUB_ULEB128: + return EdgeKind_riscv::R_RISCV_SUB_ULEB128; } return make_error<JITLinkError>( diff --git a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp index a4e4daef97fb..9e9f4433a9fc 100644 --- a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp @@ -84,6 +84,10 @@ const char *getEdgeKindName(Edge::Kind K) { return "AlignRelaxable"; case NegDelta32: return "NegDelta32"; + case R_RISCV_SET_ULEB128: + return "R_RISCV_SET_ULEB128"; + case R_RISCV_SUB_ULEB128: + return "R_RISCV_SUB_ULEB128"; } return getGenericEdgeKindName(K); } diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp index ff48a938cbd4..afe3b671547d 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp @@ -756,59 +756,56 @@ private: Expected<TargetInfo> getTargetInfo(const Triple &TT, const StringRef &CPU, const SubtargetFeatures &TF) const { - - auto TripleName = TT.str(); std::string ErrorStr; - const Target *TheTarget = - TargetRegistry::lookupTarget(TripleName, ErrorStr); + const Target *TheTarget = TargetRegistry::lookupTarget(TT, ErrorStr); if (!TheTarget) - return make_error<StringError>("Error accessing target '" + TripleName + + return make_error<StringError>("Error accessing target '" + TT.str() + "': " + ErrorStr, inconvertibleErrorCode()); std::unique_ptr<MCSubtargetInfo> STI( - TheTarget->createMCSubtargetInfo(TripleName, CPU, TF.getString())); + TheTarget->createMCSubtargetInfo(TT, CPU, TF.getString())); if (!STI) return make_error<StringError>("Unable to create subtarget for " + - TripleName, 
+ TT.str(), inconvertibleErrorCode()); - std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName)); + std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TT)); if (!MRI) return make_error<StringError>("Unable to create target register info " "for " + - TripleName, + TT.str(), inconvertibleErrorCode()); MCTargetOptions MCOptions; std::unique_ptr<MCAsmInfo> MAI( - TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + TheTarget->createMCAsmInfo(*MRI, TT, MCOptions)); if (!MAI) return make_error<StringError>("Unable to create target asm info " + - TripleName, + TT.str(), inconvertibleErrorCode()); - auto Ctx = std::make_unique<MCContext>(Triple(TripleName), MAI.get(), + auto Ctx = std::make_unique<MCContext>(Triple(TT.str()), MAI.get(), MRI.get(), STI.get()); std::unique_ptr<MCDisassembler> Disassembler( TheTarget->createMCDisassembler(*STI, *Ctx)); if (!Disassembler) return make_error<StringError>("Unable to create disassembler for " + - TripleName, + TT.str(), inconvertibleErrorCode()); std::unique_ptr<MCInstrInfo> MII(TheTarget->createMCInstrInfo()); if (!MII) return make_error<StringError>("Unable to create instruction info for" + - TripleName, + TT.str(), inconvertibleErrorCode()); - std::unique_ptr<MCInstPrinter> InstPrinter(TheTarget->createMCInstPrinter( - Triple(TripleName), 0, *MAI, *MII, *MRI)); + std::unique_ptr<MCInstPrinter> InstPrinter( + TheTarget->createMCInstPrinter(TT, 0, *MAI, *MII, *MRI)); if (!InstPrinter) return make_error<StringError>( - "Unable to create instruction printer for" + TripleName, + "Unable to create instruction printer for" + TT.str(), inconvertibleErrorCode()); return TargetInfo({TheTarget, std::move(STI), std::move(MRI), diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index ce35a5bad761..9245db442611 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1218,6 +1218,14 @@ Pattern::MatchResult Pattern::match(StringRef Buffer, 
StringRef MatchedValue = MatchInfo[CaptureParenGroup]; ExpressionFormat Format = DefinedNumericVariable->getImplicitFormat(); APInt Value = Format.valueFromStringRepr(MatchedValue, SM); + // Numeric variables are already inserted into GlobalNumericVariableTable + // during parsing, but clearLocalVars might remove them, so we must + // reinsert them. Numeric-variable resolution does not access + // GlobalNumericVariableTable; it directly uses a pointer to the variable. + // However, other functions (such as clearLocalVars) may require active + // variables to be in the table. + Context->GlobalNumericVariableTable.try_emplace(NumericVariableDef.getKey(), + DefinedNumericVariable); DefinedNumericVariable->setValue(Value, MatchedValue); } diff --git a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp index 45391460354d..401402fb5a7b 100644 --- a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp @@ -131,9 +131,9 @@ BindingInfo BindingInfoBuilder::calculateBindingInfo( return Info; } -const BindingInfoBuilder::Binding &BindingInfoBuilder::findOverlapping( - const BindingInfoBuilder::Binding &ReportedBinding) const { - for (const BindingInfoBuilder::Binding &Other : Bindings) +const Binding & +BindingInfoBuilder::findOverlapping(const Binding &ReportedBinding) const { + for (const Binding &Other : Bindings) if (ReportedBinding.LowerBound <= Other.UpperBound && Other.LowerBound <= ReportedBinding.UpperBound) return Other; diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index dece8f197aaf..31605e390034 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -52,6 +52,17 @@ static std::optional<StringRef> extractMdStringValue(MDNode *Node, return NodeText->getString(); } +static Expected<dxbc::ShaderVisibility> +extractShaderVisibility(MDNode *Node, unsigned int OpId) { + if 
(std::optional<uint32_t> Val = extractMdIntValue(Node, OpId)) { + if (!dxbc::isValidShaderVisibility(*Val)) + return make_error<RootSignatureValidationError<uint32_t>>( + "ShaderVisibility", *Val); + return dxbc::ShaderVisibility(*Val); + } + return make_error<InvalidRSMetadataValue>("ShaderVisibility"); +} + namespace { // We use the OverloadVisit with std::visit to ensure the compiler catches if a @@ -221,17 +232,12 @@ Error MetadataParser::parseRootConstants(mcdxbc::RootSignatureDesc &RSD, if (RootConstantNode->getNumOperands() != 5) return make_error<InvalidRSMetadataFormat>("RootConstants Element"); - dxbc::RTS0::v1::RootParameterHeader Header; - // The parameter offset doesn't matter here - we recalculate it during - // serialization Header.ParameterOffset = 0; - Header.ParameterType = to_underlying(dxbc::RootParameterType::Constants32Bit); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1)) - Header.ShaderVisibility = *Val; - else - return make_error<InvalidRSMetadataValue>("ShaderVisibility"); + Expected<dxbc::ShaderVisibility> Visibility = + extractShaderVisibility(RootConstantNode, 1); + if (auto E = Visibility.takeError()) + return Error(std::move(E)); - dxbc::RTS0::v1::RootConstants Constants; + mcdxbc::RootConstants Constants; if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2)) Constants.ShaderRegister = *Val; else @@ -247,7 +253,8 @@ Error MetadataParser::parseRootConstants(mcdxbc::RootSignatureDesc &RSD, else return make_error<InvalidRSMetadataValue>("Num32BitValues"); - RSD.ParametersContainer.addParameter(Header, Constants); + RSD.ParametersContainer.addParameter(dxbc::RootParameterType::Constants32Bit, + *Visibility, Constants); return Error::success(); } @@ -263,28 +270,28 @@ Error MetadataParser::parseRootDescriptors( if (RootDescriptorNode->getNumOperands() != 5) return make_error<InvalidRSMetadataFormat>("Root Descriptor Element"); - dxbc::RTS0::v1::RootParameterHeader Header; + 
dxbc::RootParameterType Type; switch (ElementKind) { case RootSignatureElementKind::SRV: - Header.ParameterType = to_underlying(dxbc::RootParameterType::SRV); + Type = dxbc::RootParameterType::SRV; break; case RootSignatureElementKind::UAV: - Header.ParameterType = to_underlying(dxbc::RootParameterType::UAV); + Type = dxbc::RootParameterType::UAV; break; case RootSignatureElementKind::CBV: - Header.ParameterType = to_underlying(dxbc::RootParameterType::CBV); + Type = dxbc::RootParameterType::CBV; break; default: llvm_unreachable("invalid Root Descriptor kind"); break; } - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1)) - Header.ShaderVisibility = *Val; - else - return make_error<InvalidRSMetadataValue>("ShaderVisibility"); + Expected<dxbc::ShaderVisibility> Visibility = + extractShaderVisibility(RootDescriptorNode, 1); + if (auto E = Visibility.takeError()) + return Error(std::move(E)); - dxbc::RTS0::v2::RootDescriptor Descriptor; + mcdxbc::RootDescriptor Descriptor; if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2)) Descriptor.ShaderRegister = *Val; else @@ -296,7 +303,7 @@ Error MetadataParser::parseRootDescriptors( return make_error<InvalidRSMetadataValue>("RegisterSpace"); if (RSD.Version == 1) { - RSD.ParametersContainer.addParameter(Header, Descriptor); + RSD.ParametersContainer.addParameter(Type, *Visibility, Descriptor); return Error::success(); } assert(RSD.Version > 1); @@ -306,7 +313,7 @@ Error MetadataParser::parseRootDescriptors( else return make_error<InvalidRSMetadataValue>("Root Descriptor Flags"); - RSD.ParametersContainer.addParameter(Header, Descriptor); + RSD.ParametersContainer.addParameter(Type, *Visibility, Descriptor); return Error::success(); } @@ -315,7 +322,7 @@ Error MetadataParser::parseDescriptorRange(mcdxbc::DescriptorTable &Table, if (RangeDescriptorNode->getNumOperands() != 6) return make_error<InvalidRSMetadataFormat>("Descriptor Range"); - dxbc::RTS0::v2::DescriptorRange Range; 
+ mcdxbc::DescriptorRange Range; std::optional<StringRef> ElementText = extractMdStringValue(RangeDescriptorNode, 0); @@ -323,15 +330,15 @@ Error MetadataParser::parseDescriptorRange(mcdxbc::DescriptorTable &Table, if (!ElementText.has_value()) return make_error<InvalidRSMetadataFormat>("Descriptor Range"); - Range.RangeType = - StringSwitch<uint32_t>(*ElementText) - .Case("CBV", to_underlying(dxbc::DescriptorRangeType::CBV)) - .Case("SRV", to_underlying(dxbc::DescriptorRangeType::SRV)) - .Case("UAV", to_underlying(dxbc::DescriptorRangeType::UAV)) - .Case("Sampler", to_underlying(dxbc::DescriptorRangeType::Sampler)) - .Default(~0U); - - if (Range.RangeType == ~0U) + if (*ElementText == "CBV") + Range.RangeType = dxil::ResourceClass::CBuffer; + else if (*ElementText == "SRV") + Range.RangeType = dxil::ResourceClass::SRV; + else if (*ElementText == "UAV") + Range.RangeType = dxil::ResourceClass::UAV; + else if (*ElementText == "Sampler") + Range.RangeType = dxil::ResourceClass::Sampler; + else return make_error<GenericRSMetadataError>("Invalid Descriptor Range type.", RangeDescriptorNode); @@ -372,15 +379,12 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD, if (NumOperands < 2) return make_error<InvalidRSMetadataFormat>("Descriptor Table"); - dxbc::RTS0::v1::RootParameterHeader Header; - if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1)) - Header.ShaderVisibility = *Val; - else - return make_error<InvalidRSMetadataValue>("ShaderVisibility"); + Expected<dxbc::ShaderVisibility> Visibility = + extractShaderVisibility(DescriptorTableNode, 1); + if (auto E = Visibility.takeError()) + return Error(std::move(E)); mcdxbc::DescriptorTable Table; - Header.ParameterType = - to_underlying(dxbc::RootParameterType::DescriptorTable); for (unsigned int I = 2; I < NumOperands; I++) { MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I)); @@ -392,7 +396,8 @@ Error 
MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD, return Err; } - RSD.ParametersContainer.addParameter(Header, Table); + RSD.ParametersContainer.addParameter(dxbc::RootParameterType::DescriptorTable, + *Visibility, Table); return Error::success(); } @@ -528,21 +533,15 @@ Error MetadataParser::validateRootSignature( } for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) { - if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error<RootSignatureValidationError<uint32_t>>( - "ShaderVisibility", Info.Header.ShaderVisibility)); - - assert(dxbc::isValidParameterType(Info.Header.ParameterType) && - "Invalid value for ParameterType"); - switch (Info.Header.ParameterType) { + switch (Info.Type) { + case dxbc::RootParameterType::Constants32Bit: + break; - case to_underlying(dxbc::RootParameterType::CBV): - case to_underlying(dxbc::RootParameterType::UAV): - case to_underlying(dxbc::RootParameterType::SRV): { - const dxbc::RTS0::v2::RootDescriptor &Descriptor = + case dxbc::RootParameterType::CBV: + case dxbc::RootParameterType::UAV: + case dxbc::RootParameterType::SRV: { + const mcdxbc::RootDescriptor &Descriptor = RSD.ParametersContainer.getRootDescriptor(Info.Location); if (!hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister)) DeferredErrs = @@ -566,16 +565,10 @@ Error MetadataParser::validateRootSignature( } break; } - case to_underlying(dxbc::RootParameterType::DescriptorTable): { + case dxbc::RootParameterType::DescriptorTable: { const mcdxbc::DescriptorTable &Table = RSD.ParametersContainer.getDescriptorTable(Info.Location); - for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) { - if (!hlsl::rootsig::verifyRangeType(Range.RangeType)) - DeferredErrs = - joinErrors(std::move(DeferredErrs), - make_error<RootSignatureValidationError<uint32_t>>( - "RangeType", Range.RangeType)); - + for (const mcdxbc::DescriptorRange &Range : Table) { if 
(!hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace)) DeferredErrs = joinErrors(std::move(DeferredErrs), @@ -589,7 +582,8 @@ Error MetadataParser::validateRootSignature( "NumDescriptors", Range.NumDescriptors)); if (!hlsl::rootsig::verifyDescriptorRangeFlag( - RSD.Version, Range.RangeType, Range.Flags)) + RSD.Version, Range.RangeType, + dxbc::DescriptorRangeFlags(Range.Flags))) DeferredErrs = joinErrors(std::move(DeferredErrs), make_error<RootSignatureValidationError<uint32_t>>( diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp index 72308a3de5fd..d682dda0bab2 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp @@ -51,25 +51,11 @@ bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) { return (Flags | DataFlags) == DataFlags; } -bool verifyRangeType(uint32_t Type) { - switch (Type) { - case llvm::to_underlying(dxbc::DescriptorRangeType::CBV): - case llvm::to_underlying(dxbc::DescriptorRangeType::SRV): - case llvm::to_underlying(dxbc::DescriptorRangeType::UAV): - case llvm::to_underlying(dxbc::DescriptorRangeType::Sampler): - return true; - }; - - return false; -} - -bool verifyDescriptorRangeFlag(uint32_t Version, uint32_t Type, - uint32_t FlagsVal) { +bool verifyDescriptorRangeFlag(uint32_t Version, dxil::ResourceClass Type, + dxbc::DescriptorRangeFlags Flags) { using FlagT = dxbc::DescriptorRangeFlags; - FlagT Flags = FlagT(FlagsVal); - const bool IsSampler = - (Type == llvm::to_underlying(dxbc::DescriptorRangeType::Sampler)); + const bool IsSampler = (Type == dxil::ResourceClass::Sampler); if (Version == 1) { // Since the metadata is unversioned, we expect to explicitly see the values @@ -180,6 +166,22 @@ bool verifyBorderColor(uint32_t BorderColor) { bool verifyLOD(float LOD) { return !std::isnan(LOD); } +bool verifyBoundOffset(uint32_t Offset) { + return Offset != NumDescriptorsUnbounded; +} + +bool 
verifyNoOverflowedOffset(uint64_t Offset) { + return Offset <= std::numeric_limits<uint32_t>::max(); +} + +uint64_t computeRangeBound(uint32_t Offset, uint32_t Size) { + assert(0 < Size && "Must be a non-empty range"); + if (Size == NumDescriptorsUnbounded) + return NumDescriptorsUnbounded; + + return uint64_t(Offset) + uint64_t(Size) - 1; +} + } // namespace rootsig } // namespace hlsl } // namespace llvm diff --git a/llvm/lib/Frontend/OpenMP/OMP.cpp b/llvm/lib/Frontend/OpenMP/OMP.cpp index 9e625b809de9..f12941492547 100644 --- a/llvm/lib/Frontend/OpenMP/OMP.cpp +++ b/llvm/lib/Frontend/OpenMP/OMP.cpp @@ -9,6 +9,8 @@ #include "llvm/Frontend/OpenMP/OMP.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Demangle/Demangle.h" @@ -75,6 +77,26 @@ getFirstCompositeRange(iterator_range<ArrayRef<Directive>::iterator> Leafs) { return llvm::make_range(Begin, End); } +static void +collectPrivatizingConstructs(llvm::SmallSet<Directive, 16> &Constructs, + unsigned Version) { + llvm::SmallSet<Clause, 16> Privatizing; + for (auto C : + llvm::enum_seq_inclusive<Clause>(Clause::First_, Clause::Last_)) { + if (isPrivatizingClause(C)) + Privatizing.insert(C); + } + + for (auto D : llvm::enum_seq_inclusive<Directive>(Directive::First_, + Directive::Last_)) { + bool AllowsPrivatizing = llvm::any_of(Privatizing, [&](Clause C) { + return isAllowedClauseForDirective(D, C, Version); + }); + if (AllowsPrivatizing) + Constructs.insert(D); + } +} + namespace llvm::omp { ArrayRef<Directive> getLeafConstructs(Directive D) { auto Idx = static_cast<std::size_t>(D); @@ -194,6 +216,18 @@ ArrayRef<unsigned> getOpenMPVersions() { return Versions; } +bool isPrivatizingConstruct(Directive D, unsigned Version) { + static llvm::SmallSet<Directive, 16> Privatizing; + [[maybe_unused]] static bool Init = + (collectPrivatizingConstructs(Privatizing, Version), true); + + // As of 
OpenMP 6.0, privatizing constructs (with the test being if they + // allow a privatizing clause) are: dispatch, distribute, do, for, loop, + // parallel, scope, sections, simd, single, target, target_data, task, + // taskgroup, taskloop, and teams. + return llvm::is_contained(Privatizing, D); +} + std::string prettifyFunctionName(StringRef FunctionName) { // Internalized functions have the right name, but simply a suffix. if (FunctionName.ends_with(".internalized")) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 50ab206e2db8..3d5e487c8990 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -307,7 +307,19 @@ void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, // Move instructions to new block. BasicBlock *Old = IP.getBlock(); - New->splice(New->begin(), Old, IP.getPoint(), Old->end()); + // If the `Old` block is empty then there are no instructions to move. But in + // the new debug scheme, it could have trailing debug records which will be + // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2 + // reasons: + // 1. If `New` is also empty, `BasicBlock::splice` crashes. + // 2. Even if `New` is not empty, the rationale to move those records to `New` + // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function + // assumes that `Old` is optimized out and is going away. This is not the case + // here. The `Old` block is still being used e.g. a branch instruction is + // added to it later in this function. + // So we call `BasicBlock::splice` only when `Old` is not empty. 
+ if (!Old->empty()) + New->splice(New->begin(), Old, IP.getPoint(), Old->end()); if (CreateBranch) { auto *NewBr = BranchInst::Create(New, Old); @@ -903,6 +915,13 @@ Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, ConstantInt::get(Int32, uint32_t(LocFlags)), ConstantInt::get(Int32, Reserve2Flags), ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr}; + + size_t SrcLocStrArgIdx = 4; + if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx) + ->getPointerAddressSpace() != + IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace()) + IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast( + SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)); Constant *Initializer = ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData); @@ -943,8 +962,9 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr, GV.getInitializer() == Initializer) return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr); - SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "", - /* AddressSpace */ 0, &M); + SrcLocStr = Builder.CreateGlobalString( + LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(), + &M); } return SrcLocStr; } @@ -5581,13 +5601,13 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, // Compute the trip counts of the floor loops. Builder.SetCurrentDebugLocation(DL); Builder.restoreIP(OutermostLoop->getPreheaderIP()); - SmallVector<Value *, 4> FloorCount, FloorRems; + SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems; for (int i = 0; i < NumLoops; ++i) { Value *TileSize = TileSizes[i]; Value *OrigTripCount = OrigTripCounts[i]; Type *IVType = OrigTripCount->getType(); - Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize); + Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize); Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize); // 0 if tripcount divides the tilesize, 1 otherwise. 
@@ -5601,11 +5621,12 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0)); FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType); - FloorTripCount = - Builder.CreateAdd(FloorTripCount, FloorTripOverflow, + Value *FloorTripCount = + Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow, "omp_floor" + Twine(i) + ".tripcount", true); // Remember some values for later use. + FloorCompleteCount.push_back(FloorCompleteTripCount); FloorCount.push_back(FloorTripCount); FloorRems.push_back(FloorTripRem); } @@ -5660,7 +5681,7 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, Value *TileSize = TileSizes[i]; Value *FloorIsEpilogue = - Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]); + Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]); Value *TileTripCount = Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize); @@ -7369,9 +7390,8 @@ static void FixupDebugInfoForOutlinedFunction( // The location and scope of variable intrinsics and records still point to // the parent function of the target region. Update them. for (Instruction &I : instructions(Func)) { - if (auto *DDI = dyn_cast<llvm::DbgVariableIntrinsic>(&I)) - UpdateDebugRecord(DDI); - + assert(!isa<llvm::DbgVariableIntrinsic>(&I) && + "Unexpected debug intrinsic"); for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) UpdateDebugRecord(&DVR); } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index dc6d599fa958..094678f32af2 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -88,6 +88,8 @@ using namespace llvm; +// See https://llvm.org/docs/DebuggingLLVM.html for why these flags are useful. 
+ static cl::opt<bool> PrintInstAddrs("print-inst-addrs", cl::Hidden, cl::desc("Print addresses of instructions when dumping")); diff --git a/llvm/lib/IR/Assumptions.cpp b/llvm/lib/IR/Assumptions.cpp index 6adbbc4a63b0..f8bbcb32231c 100644 --- a/llvm/lib/IR/Assumptions.cpp +++ b/llvm/lib/IR/Assumptions.cpp @@ -101,12 +101,16 @@ bool llvm::addAssumptions(CallBase &CB, return ::addAssumptionsImpl(CB, Assumptions); } -StringSet<> llvm::KnownAssumptionStrings({ - "omp_no_openmp", // OpenMP 5.1 - "omp_no_openmp_routines", // OpenMP 5.1 - "omp_no_parallelism", // OpenMP 5.1 - "omp_no_openmp_constructs", // OpenMP 6.0 - "ompx_spmd_amenable", // OpenMPOpt extension - "ompx_no_call_asm", // OpenMPOpt extension - "ompx_aligned_barrier", // OpenMPOpt extension -}); +StringSet<> &llvm::getKnownAssumptionStrings() { + static StringSet<> Object({ + "omp_no_openmp", // OpenMP 5.1 + "omp_no_openmp_routines", // OpenMP 5.1 + "omp_no_parallelism", // OpenMP 5.1 + "omp_no_openmp_constructs", // OpenMP 6.0 + "ompx_spmd_amenable", // OpenMPOpt extension + "ompx_no_call_asm", // OpenMPOpt extension + "ompx_aligned_barrier", // OpenMPOpt extension + }); + + return Object; +} diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index d1fbcb9e893a..4ac2ebd55dca 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -954,6 +954,19 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C, return get(C, B); } +AttributeSet AttributeSet::addAttributes(LLVMContext &C, + const AttrBuilder &B) const { + if (!hasAttributes()) + return get(C, B); + + if (!B.hasAttributes()) + return *this; + + AttrBuilder Merged(C, *this); + Merged.merge(B); + return get(C, Merged); +} + AttributeSet AttributeSet::removeAttribute(LLVMContext &C, Attribute::AttrKind Kind) const { if (!hasAttribute(Kind)) return *this; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index e200f3626e69..8d8120ac9ed9 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ 
b/llvm/lib/IR/AutoUpgrade.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/NVPTXAddrSpace.h" #include "llvm/Support/Regex.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/TargetParser/Triple.h" #include <cstdint> #include <cstring> @@ -106,6 +107,24 @@ static bool upgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID, return true; } +// Upgrade the declaration of multiply and add bytes intrinsics whose input +// arguments' types have changed from vectors of i32 to vectors of i8 +static bool upgradeX86MultiplyAddBytes(Function *F, Intrinsic::ID IID, + Function *&NewFn) { + // check if input argument type is a vector of i8 + Type *Arg1Type = F->getFunctionType()->getParamType(1); + Type *Arg2Type = F->getFunctionType()->getParamType(2); + if (Arg1Type->isVectorTy() && + cast<VectorType>(Arg1Type)->getElementType()->isIntegerTy(8) && + Arg2Type->isVectorTy() && + cast<VectorType>(Arg2Type)->getElementType()->isIntegerTy(8)) + return false; + + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); + return true; +} + static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, Function *&NewFn) { if (F->getReturnType()->getScalarType()->isBFloatTy()) @@ -545,19 +564,34 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, if (ID != Intrinsic::not_intrinsic) return upgradeX86IntrinsicsWith8BitMask(F, ID, NewFn); - if (Name.consume_front("avx512.mask.cmp.")) { - // Added in 7.0 - ID = StringSwitch<Intrinsic::ID>(Name) - .Case("pd.128", Intrinsic::x86_avx512_mask_cmp_pd_128) - .Case("pd.256", Intrinsic::x86_avx512_mask_cmp_pd_256) - .Case("pd.512", Intrinsic::x86_avx512_mask_cmp_pd_512) - .Case("ps.128", Intrinsic::x86_avx512_mask_cmp_ps_128) - .Case("ps.256", Intrinsic::x86_avx512_mask_cmp_ps_256) - .Case("ps.512", Intrinsic::x86_avx512_mask_cmp_ps_512) - .Default(Intrinsic::not_intrinsic); - if (ID != Intrinsic::not_intrinsic) - return upgradeX86MaskedFPCompare(F, ID, NewFn); 
- return false; // No other 'x86.avx523.mask.cmp.*'. + if (Name.consume_front("avx512.")) { + if (Name.consume_front("mask.cmp.")) { + // Added in 7.0 + ID = StringSwitch<Intrinsic::ID>(Name) + .Case("pd.128", Intrinsic::x86_avx512_mask_cmp_pd_128) + .Case("pd.256", Intrinsic::x86_avx512_mask_cmp_pd_256) + .Case("pd.512", Intrinsic::x86_avx512_mask_cmp_pd_512) + .Case("ps.128", Intrinsic::x86_avx512_mask_cmp_ps_128) + .Case("ps.256", Intrinsic::x86_avx512_mask_cmp_ps_256) + .Case("ps.512", Intrinsic::x86_avx512_mask_cmp_ps_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MaskedFPCompare(F, ID, NewFn); + } else if (Name.starts_with("vpdpbusd.") || + Name.starts_with("vpdpbusds.")) { + // Added in 21.1 + ID = StringSwitch<Intrinsic::ID>(Name) + .Case("vpdpbusd.128", Intrinsic::x86_avx512_vpdpbusd_128) + .Case("vpdpbusd.256", Intrinsic::x86_avx512_vpdpbusd_256) + .Case("vpdpbusd.512", Intrinsic::x86_avx512_vpdpbusd_512) + .Case("vpdpbusds.128", Intrinsic::x86_avx512_vpdpbusds_128) + .Case("vpdpbusds.256", Intrinsic::x86_avx512_vpdpbusds_256) + .Case("vpdpbusds.512", Intrinsic::x86_avx512_vpdpbusds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + } + return false; // No other 'x86.avx512.*'. } if (Name.consume_front("avx512bf16.")) { @@ -4148,6 +4182,32 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; + + // Input arguments types were incorrectly set to vectors of i32 before but + // they should be vectors of i8. 
Insert bit cast when encountering the old + // types + if (Args[1]->getType()->isVectorTy() && + cast<VectorType>(Args[1]->getType()) + ->getElementType() + ->isIntegerTy(32) && + Args[2]->getType()->isVectorTy() && + cast<VectorType>(Args[2]->getType()) + ->getElementType() + ->isIntegerTy(32)) { + Type *NewArgType = nullptr; + if (VecWidth == 128) + NewArgType = VectorType::get(Builder.getInt8Ty(), 16, false); + else if (VecWidth == 256) + NewArgType = VectorType::get(Builder.getInt8Ty(), 32, false); + else if (VecWidth == 512) + NewArgType = VectorType::get(Builder.getInt8Ty(), 64, false); + else + llvm_unreachable("Unexpected vector bit width"); + + Args[1] = Builder.CreateBitCast(Args[1], NewArgType); + Args[2] = Builder.CreateBitCast(Args[2], NewArgType); + } + Rep = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); @@ -5155,6 +5215,23 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->eraseFromParent(); return; } + + case Intrinsic::x86_avx512_vpdpbusd_128: + case Intrinsic::x86_avx512_vpdpbusd_256: + case Intrinsic::x86_avx512_vpdpbusd_512: + case Intrinsic::x86_avx512_vpdpbusds_128: + case Intrinsic::x86_avx512_vpdpbusds_256: + case Intrinsic::x86_avx512_vpdpbusds_512: { + unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 8; + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Type *NewArgType = VectorType::get(Builder.getInt8Ty(), NumElts, false); + Args[1] = Builder.CreateBitCast(Args[1], NewArgType); + Args[2] = Builder.CreateBitCast(Args[2], NewArgType); + + NewCall = Builder.CreateCall(NewFn, Args); + break; + } } assert(NewCall && "Should have either set this variable or returned through " "the default case"); @@ -5256,6 +5333,7 @@ bool llvm::UpgradeDebugInfo(Module &M) { if (DisableAutoUpgradeDebugInfo) return false; + llvm::TimeTraceScope timeScope("Upgrade debug info"); // We need to get metadata 
before the module is verified (i.e., getModuleFlag // makes assumptions that we haven't verified yet). Carefully extract the flag // from the metadata. @@ -5381,6 +5459,16 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K, upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V); return true; } + if (K == "grid_constant") { + const auto Attr = Attribute::get(GV->getContext(), "nvvm.grid_constant"); + for (const auto &Op : cast<MDNode>(V)->operands()) { + // For some reason, the index is 1-based in the metadata. Good thing we're + // able to auto-upgrade it! + const auto Index = mdconst::extract<ConstantInt>(Op)->getZExtValue() - 1; + cast<Function>(GV)->addParamAttr(Index, Attr); + } + return true; + } return false; } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index dbd6d81ad2e2..ed629d4e5ea2 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -187,7 +187,6 @@ const char *DataLayout::getManglingComponent(const Triple &T) { // Default primitive type specifications. // NOTE: These arrays must be sorted by type bit width. constexpr DataLayout::PrimitiveSpec DefaultIntSpecs[] = { - {1, Align::Constant<1>(), Align::Constant<1>()}, // i1:8:8 {8, Align::Constant<1>(), Align::Constant<1>()}, // i8:8:8 {16, Align::Constant<2>(), Align::Constant<2>()}, // i16:16:16 {32, Align::Constant<4>(), Align::Constant<4>()}, // i32:32:32 @@ -694,7 +693,12 @@ void DataLayout::setPointerSpec(uint32_t AddrSpace, uint32_t BitWidth, Align DataLayout::getIntegerAlignment(uint32_t BitWidth, bool abi_or_pref) const { - auto I = lower_bound(IntSpecs, BitWidth, LessPrimitiveBitWidth()); + auto I = IntSpecs.begin(); + for (; I != IntSpecs.end(); ++I) { + if (I->BitWidth >= BitWidth) + break; + } + // If we don't have an exact match, use alignment of next larger integer // type. If there is none, use alignment of largest integer type by going // back one element. 
@@ -839,6 +843,44 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { } } +TypeSize DataLayout::getTypeAllocSize(Type *Ty) const { + switch (Ty->getTypeID()) { + case Type::ArrayTyID: { + // The alignment of the array is the alignment of the element, so there + // is no need for further adjustment. + auto *ATy = cast<ArrayType>(Ty); + return ATy->getNumElements() * getTypeAllocSize(ATy->getElementType()); + } + case Type::StructTyID: { + const StructLayout *Layout = getStructLayout(cast<StructType>(Ty)); + TypeSize Size = Layout->getSizeInBytes(); + + if (cast<StructType>(Ty)->isPacked()) + return Size; + + Align A = std::max(StructABIAlignment, Layout->getAlignment()); + return alignTo(Size, A.value()); + } + case Type::IntegerTyID: { + unsigned BitWidth = Ty->getIntegerBitWidth(); + TypeSize Size = TypeSize::getFixed(divideCeil(BitWidth, 8)); + Align A = getIntegerAlignment(BitWidth, /*ABI=*/true); + return alignTo(Size, A.value()); + } + case Type::PointerTyID: { + unsigned AS = Ty->getPointerAddressSpace(); + TypeSize Size = TypeSize::getFixed(getPointerSize(AS)); + return alignTo(Size, getPointerABIAlignment(AS).value()); + } + case Type::TargetExtTyID: { + Type *LayoutTy = cast<TargetExtType>(Ty)->getLayoutType(); + return getTypeAllocSize(LayoutTy); + } + default: + return alignTo(getTypeStoreSize(Ty), getABITypeAlign(Ty).value()); + } +} + Align DataLayout::getABITypeAlign(Type *Ty) const { return getAlignment(Ty, true); } @@ -926,12 +968,13 @@ static APInt getElementIndex(TypeSize ElemSize, APInt &Offset) { return APInt::getZero(BitWidth); } - APInt Index = Offset.sdiv(ElemSize); - Offset -= Index * ElemSize; + uint64_t FixedElemSize = ElemSize.getFixedValue(); + APInt Index = Offset.sdiv(FixedElemSize); + Offset -= Index * FixedElemSize; if (Offset.isNegative()) { // Prefer a positive remaining offset to allow struct indexing. 
--Index; - Offset += ElemSize; + Offset += FixedElemSize; assert(Offset.isNonNegative() && "Remaining offset shouldn't be negative"); } return Index; diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 8e523bcf7960..166521a27664 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/TimeProfiler.h" #include <algorithm> #include <cassert> #include <optional> @@ -563,6 +564,7 @@ bool llvm::stripDebugInfo(Function &F) { } bool llvm::StripDebugInfo(Module &M) { + llvm::TimeTraceScope timeScope("Strip debug info"); bool Changed = false; for (NamedMDNode &NMD : llvm::make_early_inc_range(M.named_metadata())) { @@ -755,7 +757,7 @@ private: return getReplacementMDNode(N); }; - // Seperate recursive doRemap and operator [] into 2 lines to avoid + // Separate recursive doRemap and operator [] into 2 lines to avoid // out-of-order evaluations since both of them can access the same memory // location in map Replacements. auto Value = doRemap(N); diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp index 79c5b896f8f2..01dafcab94ce 100644 --- a/llvm/lib/IR/DebugLoc.cpp +++ b/llvm/lib/IR/DebugLoc.cpp @@ -181,10 +181,19 @@ DebugLoc DebugLoc::getMergedLocations(ArrayRef<DebugLoc> Locs) { return Merged; } DebugLoc DebugLoc::getMergedLocation(DebugLoc LocA, DebugLoc LocB) { - if (!LocA) - return LocA; - if (!LocB) + if (!LocA || !LocB) { + // If coverage tracking is enabled, prioritize returning empty non-annotated + // locations to empty annotated locations. 
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE + if (!LocA && LocA.getKind() == DebugLocKind::Normal) + return LocA; + if (!LocB && LocB.getKind() == DebugLocKind::Normal) + return LocB; +#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE + if (!LocA) + return LocA; return LocB; + } return DILocation::getMergedLocation(LocA, LocB); } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 70d364176062..30b5e48652b2 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -44,6 +44,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/RandomNumberGenerator.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/Support/VersionTuple.h" #include <cassert> #include <cstdint> @@ -478,6 +479,7 @@ Error Module::materializeAll() { } Error Module::materializeMetadata() { + llvm::TimeTraceScope timeScope("Materialize metadata"); if (!Materializer) return Error::success(); return Materializer->materializeMetadata(); diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index d24263f8b3bd..b41256f59909 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -250,7 +250,15 @@ void setExplicitlyUnknownBranchWeights(Instruction &I) { MDB.createString(MDProfLabels::UnknownBranchWeightsMarker))); } -bool isExplicitlyUnknownBranchWeightsMetadata(const MDNode &MD) { +void setExplicitlyUnknownFunctionEntryCount(Function &F) { + MDBuilder MDB(F.getContext()); + F.setMetadata( + LLVMContext::MD_prof, + MDNode::get(F.getContext(), + MDB.createString(MDProfLabels::UnknownBranchWeightsMarker))); +} + +bool isExplicitlyUnknownProfileMetadata(const MDNode &MD) { if (MD.getNumOperands() != 1) return false; return MD.getOperand(0).equalsStr(MDProfLabels::UnknownBranchWeightsMarker); @@ -260,7 +268,7 @@ bool hasExplicitlyUnknownBranchWeights(const Instruction &I) { auto *MD = I.getMetadata(LLVMContext::MD_prof); if (!MD) return false; - return 
isExplicitlyUnknownBranchWeightsMetadata(*MD); + return isExplicitlyUnknownProfileMetadata(*MD); } void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights, diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 3c324f2fe0d8..4fe5714a74e3 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -40,13 +40,19 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, // hard-float calling convention by default. if (!TT.isWatchABI()) { if (isAAPCS_ABI(TT, ABIName)) { - setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS); - setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS); + setLibcallImplCallingConv(RTLIB::impl___truncsfhf2, + CallingConv::ARM_AAPCS); + setLibcallImplCallingConv(RTLIB::impl___truncdfhf2, + CallingConv::ARM_AAPCS); + setLibcallImplCallingConv(RTLIB::impl___extendhfsf2, + CallingConv::ARM_AAPCS); } else { - setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS); - setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS); + setLibcallImplCallingConv(RTLIB::impl___truncsfhf2, + CallingConv::ARM_APCS); + setLibcallImplCallingConv(RTLIB::impl___truncdfhf2, + CallingConv::ARM_APCS); + setLibcallImplCallingConv(RTLIB::impl___extendhfsf2, + CallingConv::ARM_APCS); } } diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 9c3466234035..9db48e8f6a96 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -1036,7 +1036,8 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) { // DirectX resources if (Name.starts_with("dx.")) return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::CanBeGlobal, - TargetExtType::CanBeLocal); + TargetExtType::CanBeLocal, + TargetExtType::IsTokenLike); // Opaque types in the AMDGPU name space. 
if (Name == "amdgcn.named.barrier") { @@ -1054,6 +1055,14 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) { return TargetTypeInfo(Type::getVoidTy(C)); } +bool Type::isTokenLikeTy() const { + if (isTokenTy()) + return true; + if (auto *TT = dyn_cast<TargetExtType>(this)) + return TT->hasProperty(TargetExtType::Property::IsTokenLike); + return false; +} + Type *TargetExtType::getLayoutType() const { return getTargetTypeInfo(this).LayoutType; } diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 5928c89029b8..4e8f359481b8 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -836,6 +836,9 @@ bool Value::canBeFreed() const { return false; } + if (isa<IntToPtrInst>(this) && getMetadata(LLVMContext::MD_nofree)) + return false; + const Function *F = nullptr; if (auto *I = dyn_cast<Instruction>(this)) F = I->getFunction(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 4eb4b58d022e..81a53722f489 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -119,6 +119,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ModRef.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> @@ -399,6 +400,7 @@ public: bool hasBrokenDebugInfo() const { return BrokenDebugInfo; } bool verify(const Function &F) { + llvm::TimeTraceScope timeScope("Verifier"); assert(F.getParent() == &M && "An instance of this class only works with a specific module!"); @@ -526,6 +528,7 @@ private: void visitRangeMetadata(Instruction &I, MDNode *Range, Type *Ty); void visitNoaliasAddrspaceMetadata(Instruction &I, MDNode *Range, Type *Ty); void visitDereferenceableMetadata(Instruction &I, MDNode *MD); + void visitNofreeMetadata(Instruction &I, MDNode *MD); void visitProfMetadata(Instruction &I, MDNode *MD); void visitCallStackMetadata(MDNode *MD); void visitMemProfMetadata(Instruction &I, MDNode *MD); @@ -1298,9 
+1301,11 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) { if (N.getTag() == dwarf::DW_TAG_set_type) { if (auto *T = N.getRawBaseType()) { auto *Enum = dyn_cast_or_null<DICompositeType>(T); + auto *Subrange = dyn_cast_or_null<DISubrangeType>(T); auto *Basic = dyn_cast_or_null<DIBasicType>(T); CheckDI( (Enum && Enum->getTag() == dwarf::DW_TAG_enumeration_type) || + (Subrange && Subrange->getTag() == dwarf::DW_TAG_subrange_type) || (Basic && (Basic->getEncoding() == dwarf::DW_ATE_unsigned || Basic->getEncoding() == dwarf::DW_ATE_signed || Basic->getEncoding() == dwarf::DW_ATE_unsigned_char || @@ -2443,16 +2448,6 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V); } - // Check EVEX512 feature. - if (TT.isX86() && MaxParameterWidth >= 512) { - Attribute TargetFeaturesAttr = Attrs.getFnAttr("target-features"); - if (TargetFeaturesAttr.isValid()) { - StringRef TF = TargetFeaturesAttr.getValueAsString(); - Check(!TF.contains("+avx512f") || !TF.contains("-evex512"), - "512-bit vector arguments require 'evex512' for AVX512", V); - } - } - checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-prefix", V); checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-entry", V); if (Attrs.hasFnAttr("patchable-function-entry-section")) @@ -2526,12 +2521,11 @@ void Verifier::verifyFunctionMetadata( for (const auto &Pair : MDs) { if (Pair.first == LLVMContext::MD_prof) { MDNode *MD = Pair.second; - if (isExplicitlyUnknownBranchWeightsMetadata(*MD)) { - CheckFailed("'unknown' !prof metadata should appear only on " - "instructions supporting the 'branch_weights' metadata", - MD); + // We may have functions that are synthesized by the compiler, e.g. in + // WPD, that we can't currently determine the entry count. 
+ if (isExplicitlyUnknownProfileMetadata(*MD)) continue; - } + Check(MD->getNumOperands() >= 2, "!prof annotations should have no less than 2 operands", MD); @@ -2830,6 +2824,7 @@ static Instruction *getSuccPad(Instruction *Terminator) { } void Verifier::verifySiblingFuncletUnwinds() { + llvm::TimeTraceScope timeScope("Verifier verify sibling funclet unwinds"); SmallPtrSet<Instruction *, 8> Visited; SmallPtrSet<Instruction *, 8> Active; for (const auto &Pair : SiblingFuncletInfo) { @@ -3006,7 +3001,7 @@ void Verifier::visitFunction(const Function &F) { if (!IsIntrinsic) { Check(!Arg.getType()->isMetadataTy(), "Function takes metadata but isn't an intrinsic", &Arg, &F); - Check(!Arg.getType()->isTokenTy(), + Check(!Arg.getType()->isTokenLikeTy(), "Function takes token but isn't an intrinsic", &Arg, &F); Check(!Arg.getType()->isX86_AMXTy(), "Function takes x86_amx but isn't an intrinsic", &Arg, &F); @@ -3020,7 +3015,7 @@ void Verifier::visitFunction(const Function &F) { } if (!IsIntrinsic) { - Check(!F.getReturnType()->isTokenTy(), + Check(!F.getReturnType()->isTokenLikeTy(), "Function returns a token but isn't an intrinsic", &F); Check(!F.getReturnType()->isX86_AMXTy(), "Function returns a x86_amx but isn't an intrinsic", &F); @@ -3190,7 +3185,7 @@ void Verifier::visitFunction(const Function &F) { // Scope and SP could be the same MDNode and we don't want to skip // validation in that case - if (SP && ((Scope != SP) && !Seen.insert(SP).second)) + if ((Scope != SP) && !Seen.insert(SP).second) return; CheckDI(SP->describes(&F), @@ -3634,7 +3629,7 @@ void Verifier::visitPHINode(PHINode &PN) { "PHI nodes not grouped at top of basic block!", &PN, PN.getParent()); // Check that a PHI doesn't yield a Token. - Check(!PN.getType()->isTokenTy(), "PHI nodes cannot have token type!"); + Check(!PN.getType()->isTokenLikeTy(), "PHI nodes cannot have token type!"); // Check that all of the values of the PHI node have the same type as the // result. 
@@ -3839,14 +3834,14 @@ void Verifier::visitCallBase(CallBase &Call) { for (Type *ParamTy : FTy->params()) { Check(!ParamTy->isMetadataTy(), "Function has metadata parameter but isn't an intrinsic", Call); - Check(!ParamTy->isTokenTy(), + Check(!ParamTy->isTokenLikeTy(), "Function has token parameter but isn't an intrinsic", Call); } } // Verify that indirect calls don't return tokens. if (!Call.getCalledFunction()) { - Check(!FTy->getReturnType()->isTokenTy(), + Check(!FTy->getReturnType()->isTokenLikeTy(), "Return type cannot be token for indirect call!"); Check(!FTy->getReturnType()->isX86_AMXTy(), "Return type cannot be x86_amx for indirect call!"); @@ -5021,6 +5016,13 @@ void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) { &I); } +void Verifier::visitNofreeMetadata(Instruction &I, MDNode *MD) { + Check(I.getType()->isPointerTy(), "nofree applies only to pointer types", &I); + Check((isa<IntToPtrInst>(I)), "nofree applies only to inttoptr instruction", + &I); + Check(MD->getNumOperands() == 0, "nofree metadata must be empty", &I); +} + void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) { auto GetBranchingTerminatorNumOperands = [&]() { unsigned ExpectedNumOperands = 0; @@ -5496,6 +5498,9 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *MD = I.getMetadata(LLVMContext::MD_dereferenceable_or_null)) visitDereferenceableMetadata(I, MD); + if (MDNode *MD = I.getMetadata(LLVMContext::MD_nofree)) + visitNofreeMetadata(I, MD); + if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa)) TBAAVerifyHelper.visitTBAAMetadata(I, TBAA); @@ -6724,7 +6729,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "invalid vector type for format", &Call, Src1, Call.getArgOperand(5)); break; } - case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: { + case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4: + case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: { Value 
*Src0 = Call.getArgOperand(1); Value *Src1 = Call.getArgOperand(3); @@ -6772,6 +6779,28 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "invalid vector type for format", &Call, Src1, Call.getArgOperand(2)); break; } + case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: + case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: + case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: + case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: { + // Check we only use this intrinsic on the FLAT or GLOBAL address spaces. + Value *PtrArg = Call.getArgOperand(0); + const unsigned AS = PtrArg->getType()->getPointerAddressSpace(); + Check(AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS, + "cooperative atomic intrinsics require a generic or global pointer", + &Call, PtrArg); + + // Last argument must be a MD string + auto *Op = cast<MetadataAsValue>(Call.getArgOperand(Call.arg_size() - 1)); + MDNode *MD = cast<MDNode>(Op->getMetadata()); + Check((MD->getNumOperands() == 1) && isa<MDString>(MD->getOperand(0)), + "cooperative atomic intrinsics require that the last argument is a " + "metadata string", + &Call, Op); + break; + } case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32: case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: { Value *V = Call.getArgOperand(0); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 35d24c17bbd9..ce9ecc35e192 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -631,6 +631,7 @@ LTO::~LTO() = default; void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms, ArrayRef<SymbolResolution> Res, unsigned Partition, bool InSummary) { + llvm::TimeTraceScope timeScope("LTO add module to global resolution"); auto *ResI = Res.begin(); auto *ResE = Res.end(); (void)ResE; @@ -731,6 +732,7 @@ static void writeToResolutionFile(raw_ostream &OS, InputFile *Input, Error 
LTO::add(std::unique_ptr<InputFile> Input, ArrayRef<SymbolResolution> Res) { + llvm::TimeTraceScope timeScope("LTO add input", Input->getName()); assert(!CalledGetMaxTasks); if (Conf.ResolutionFile) @@ -756,6 +758,7 @@ Error LTO::add(std::unique_ptr<InputFile> Input, Expected<ArrayRef<SymbolResolution>> LTO::addModule(InputFile &Input, ArrayRef<SymbolResolution> InputRes, unsigned ModI, ArrayRef<SymbolResolution> Res) { + llvm::TimeTraceScope timeScope("LTO add module", Input.getName()); Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo(); if (!LTOInfo) return LTOInfo.takeError(); @@ -850,6 +853,7 @@ Expected< LTO::addRegularLTO(InputFile &Input, ArrayRef<SymbolResolution> InputRes, BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, ArrayRef<SymbolResolution> Res) { + llvm::TimeTraceScope timeScope("LTO add regular LTO"); RegularLTOState::AddedModule Mod; Expected<std::unique_ptr<Module>> MOrErr = BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true, @@ -1024,6 +1028,7 @@ LTO::addRegularLTO(InputFile &Input, ArrayRef<SymbolResolution> InputRes, Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, bool LivenessFromIndex) { + llvm::TimeTraceScope timeScope("LTO link regular LTO"); std::vector<GlobalValue *> Keep; for (GlobalValue *GV : Mod.Keep) { if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID())) { @@ -1063,6 +1068,7 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, Expected<ArrayRef<SymbolResolution>> LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, ArrayRef<SymbolResolution> Res) { + llvm::TimeTraceScope timeScope("LTO add thin LTO"); ArrayRef<SymbolResolution> ResTmp = Res; for (const InputFile::Symbol &Sym : Syms) { assert(!ResTmp.empty()); @@ -1252,6 +1258,7 @@ Error LTO::run(AddStreamFn AddStream, FileCache Cache) { void lto::updateMemProfAttributes(Module &Mod, const ModuleSummaryIndex &Index) { + llvm::TimeTraceScope timeScope("LTO update memprof attributes"); if 
(Index.withSupportsHotColdNew()) return; @@ -1282,6 +1289,7 @@ void lto::updateMemProfAttributes(Module &Mod, } Error LTO::runRegularLTO(AddStreamFn AddStream) { + llvm::TimeTraceScope timeScope("Run regular LTO"); // Setup optimization remarks. auto DiagFileOrErr = lto::setupLLVMOptimizationRemarks( RegularLTO.CombinedModule->getContext(), Conf.RemarksFilename, @@ -1294,10 +1302,12 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { // Finalize linking of regular LTO modules containing summaries now that // we have computed liveness information. - for (auto &M : RegularLTO.ModsWithSummaries) - if (Error Err = linkRegularLTO(std::move(M), - /*LivenessFromIndex=*/true)) - return Err; + { + llvm::TimeTraceScope timeScope("Link regular LTO"); + for (auto &M : RegularLTO.ModsWithSummaries) + if (Error Err = linkRegularLTO(std::move(M), /*LivenessFromIndex=*/true)) + return Err; + } // Ensure we don't have inconsistently split LTO units with type tests. // FIXME: this checks both LTO and ThinLTO. 
It happens to work as we take @@ -1526,6 +1536,9 @@ public: const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> &ModuleMap) { + auto ModuleID = BM.getModuleIdentifier(); + llvm::TimeTraceScope timeScope("Run ThinLTO backend thread (in-process)", + ModuleID); auto RunThinBackend = [&](AddStreamFn AddStream) { LTOLLVMContext BackendContext(Conf); Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext); @@ -1536,9 +1549,6 @@ public: ImportList, DefinedGlobals, &ModuleMap, Conf.CodeGenOnly); }; - - auto ModuleID = BM.getModuleIdentifier(); - if (ShouldEmitIndexFiles) { if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str())) return E; @@ -1639,6 +1649,9 @@ public: const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto ModuleID = BM.getModuleIdentifier(); + llvm::TimeTraceScope timeScope("Run ThinLTO backend thread (first round)", + ModuleID); auto RunThinBackend = [&](AddStreamFn CGAddStream, AddStreamFn IRAddStream) { LTOLLVMContext BackendContext(Conf); @@ -1650,8 +1663,6 @@ public: ImportList, DefinedGlobals, &ModuleMap, Conf.CodeGenOnly, IRAddStream); }; - - auto ModuleID = BM.getModuleIdentifier(); // Like InProcessThinBackend, we produce index files as needed for // FirstRoundThinBackend. However, these files are not generated for // SecondRoundThinBackend. 
@@ -1735,6 +1746,9 @@ public: const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto ModuleID = BM.getModuleIdentifier(); + llvm::TimeTraceScope timeScope("Run ThinLTO backend thread (second round)", + ModuleID); auto RunThinBackend = [&](AddStreamFn AddStream) { LTOLLVMContext BackendContext(Conf); std::unique_ptr<Module> LoadedModule = @@ -1744,8 +1758,6 @@ public: ImportList, DefinedGlobals, &ModuleMap, /*CodeGenOnly=*/true); }; - - auto ModuleID = BM.getModuleIdentifier(); if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || all_of(CombinedIndex.getModuleHash(ModuleID), [](uint32_t V) { return V == 0; })) @@ -1915,6 +1927,7 @@ ThinBackend lto::createWriteIndexesThinBackend( Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) { + llvm::TimeTraceScope timeScope("Run ThinLTO"); LLVM_DEBUG(dbgs() << "Running ThinLTO\n"); ThinLTO.CombinedIndex.releaseTemporaryMemory(); timeTraceProfilerBegin("ThinLink", StringRef("")); diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 5e8cd12fe040..ce42fc526bea 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -366,6 +366,7 @@ bool lto::opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, bool IsThinLTO, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary, const std::vector<uint8_t> &CmdArgs) { + llvm::TimeTraceScope timeScope("opt"); if (EmbedBitcode == LTOBitcodeEmbedding::EmbedPostMergePreOptimized) { // FIXME: the motivation for capturing post-merge bitcode and command line // is replicating the compilation environment from bitcode, without needing @@ -399,6 +400,7 @@ bool lto::opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, static void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream, 
unsigned Task, Module &Mod, const ModuleSummaryIndex &CombinedIndex) { + llvm::TimeTraceScope timeScope("codegen"); if (Conf.PreCodeGenModuleHook && !Conf.PreCodeGenModuleHook(Task, Mod)) return; @@ -552,6 +554,7 @@ Error lto::finalizeOptimizationRemarks( Error lto::backend(const Config &C, AddStreamFn AddStream, unsigned ParallelCodeGenParallelismLevel, Module &Mod, ModuleSummaryIndex &CombinedIndex) { + llvm::TimeTraceScope timeScope("LTO backend"); Expected<const Target *> TOrErr = initAndLookupTarget(C, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -577,6 +580,7 @@ Error lto::backend(const Config &C, AddStreamFn AddStream, static void dropDeadSymbols(Module &Mod, const GVSummaryMapTy &DefinedGlobals, const ModuleSummaryIndex &Index) { + llvm::TimeTraceScope timeScope("Drop dead symbols"); std::vector<GlobalValue*> DeadGVs; for (auto &GV : Mod.global_values()) if (GlobalValueSummary *GVS = DefinedGlobals.lookup(GV.getGUID())) @@ -603,6 +607,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, MapVector<StringRef, BitcodeModule> *ModuleMap, bool CodeGenOnly, AddStreamFn IRAddStream, const std::vector<uint8_t> &CmdArgs) { + llvm::TimeTraceScope timeScope("Thin backend", Mod.getModuleIdentifier()); Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -679,6 +684,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); auto ModuleLoader = [&](StringRef Identifier) { + llvm::TimeTraceScope moduleLoaderScope("Module loader", Identifier); assert(Mod.getContext().isODRUniquingDebugTypes() && "ODR Type uniquing should be enabled on the context"); if (ModuleMap) { @@ -712,10 +718,13 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, return MOrErr; }; - FunctionImporter Importer(CombinedIndex, ModuleLoader, - ClearDSOLocalOnDeclarations); - if (Error Err 
= Importer.importFunctions(Mod, ImportList).takeError()) - return Err; + { + llvm::TimeTraceScope importScope("Import functions"); + FunctionImporter Importer(CombinedIndex, ModuleLoader, + ClearDSOLocalOnDeclarations); + if (Error Err = Importer.importFunctions(Mod, ImportList).takeError()) + return Err; + } // Do this after any importing so that imported code is updated. updateMemProfAttributes(Mod, CombinedIndex); diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 09b91d81225a..cdeab98ff6c9 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -384,7 +384,7 @@ bool LTOCodeGenerator::determineTarget() { // create target machine from info for merged modules std::string ErrMsg; - MArch = TargetRegistry::lookupTarget(TripleStr, ErrMsg); + MArch = TargetRegistry::lookupTarget(Triple, ErrMsg); if (!MArch) { emitError(ErrMsg); return false; diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index d6c15de4c4cd..1bff6cd25156 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -8,6 +8,8 @@ #include "llvm/Linker/IRMover.h" #include "LinkDiagnosticInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/IR/AutoUpgrade.h" @@ -290,6 +292,9 @@ class IRLinker { Module &DstM; std::unique_ptr<Module> SrcM; + // Lookup table to optimize IRMover::linkNamedMDNodes(). + IRMover::NamedMDNodesT &NamedMDNodes; + /// See IRMover::move(). 
IRMover::LazyCallback AddLazyFor; @@ -435,10 +440,12 @@ public: IRLinker(Module &DstM, MDMapT &SharedMDs, IRMover::IdentifiedStructTypeSet &Set, std::unique_ptr<Module> SrcM, ArrayRef<GlobalValue *> ValuesToLink, - IRMover::LazyCallback AddLazyFor, bool IsPerformingImport) - : DstM(DstM), SrcM(std::move(SrcM)), AddLazyFor(std::move(AddLazyFor)), - TypeMap(Set), GValMaterializer(*this), LValMaterializer(*this), - SharedMDs(SharedMDs), IsPerformingImport(IsPerformingImport), + IRMover::LazyCallback AddLazyFor, bool IsPerformingImport, + IRMover::NamedMDNodesT &NamedMDNodes) + : DstM(DstM), SrcM(std::move(SrcM)), NamedMDNodes(NamedMDNodes), + AddLazyFor(std::move(AddLazyFor)), TypeMap(Set), + GValMaterializer(*this), LValMaterializer(*this), SharedMDs(SharedMDs), + IsPerformingImport(IsPerformingImport), Mapper(ValueMap, RF_ReuseAndMutateDistinctMDs | RF_IgnoreMissingLocals, &TypeMap, &GValMaterializer), IndirectSymbolMCID(Mapper.registerAlternateMappingContext( @@ -1132,10 +1139,17 @@ void IRLinker::linkNamedMDNodes() { continue; NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); + + auto &Inserted = NamedMDNodes[DestNMD]; + if (Inserted.empty()) { + // Must be the first module, copy everything from DestNMD. + Inserted.insert(DestNMD->operands().begin(), DestNMD->operands().end()); + } + // Add Src elements into Dest node. 
for (const MDNode *Op : NMD.operands()) { MDNode *MD = Mapper.mapMDNode(*Op); - if (!is_contained(DestNMD->operands(), MD)) + if (Inserted.insert(MD).second) DestNMD->addOperand(MD); } } @@ -1670,6 +1684,6 @@ Error IRMover::move(std::unique_ptr<Module> Src, LazyCallback AddLazyFor, bool IsPerformingImport) { IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes, std::move(Src), ValuesToLink, std::move(AddLazyFor), - IsPerformingImport); + IsPerformingImport, NamedMDNodes); return TheIRLinker.run(); } diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 482280b5ef28..b9ebb7a9e789 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -20,49 +20,62 @@ static uint32_t writePlaceholder(raw_svector_ostream &Stream) { return Offset; } -static void rewriteOffsetToCurrentByte(raw_svector_ostream &Stream, - uint32_t Offset) { +static uint32_t rewriteOffsetToCurrentByte(raw_svector_ostream &Stream, + uint32_t Offset) { + uint32_t ByteOffset = Stream.tell(); uint32_t Value = support::endian::byte_swap<uint32_t, llvm::endianness::little>( - Stream.tell()); + ByteOffset); Stream.pwrite(reinterpret_cast<const char *>(&Value), sizeof(Value), Offset); + return ByteOffset; } size_t RootSignatureDesc::getSize() const { - size_t Size = - sizeof(dxbc::RTS0::v1::RootSignatureHeader) + - ParametersContainer.size() * sizeof(dxbc::RTS0::v1::RootParameterHeader) + + uint32_t StaticSamplersOffset = computeStaticSamplersOffset(); + size_t StaticSamplersSize = StaticSamplers.size() * sizeof(dxbc::RTS0::v1::StaticSampler); + return size_t(StaticSamplersOffset) + StaticSamplersSize; +} + +uint32_t RootSignatureDesc::computeRootParametersOffset() const { + return sizeof(dxbc::RTS0::v1::RootSignatureHeader); +} + +uint32_t RootSignatureDesc::computeStaticSamplersOffset() const { + uint32_t Offset = computeRootParametersOffset(); + for (const RootParameterInfo &I : ParametersContainer) { - 
switch (I.Header.ParameterType) { - case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): - Size += sizeof(dxbc::RTS0::v1::RootConstants); + Offset += sizeof(dxbc::RTS0::v1::RootParameterHeader); + switch (I.Type) { + case dxbc::RootParameterType::Constants32Bit: + Offset += sizeof(dxbc::RTS0::v1::RootConstants); break; - case llvm::to_underlying(dxbc::RootParameterType::CBV): - case llvm::to_underlying(dxbc::RootParameterType::SRV): - case llvm::to_underlying(dxbc::RootParameterType::UAV): + case dxbc::RootParameterType::CBV: + case dxbc::RootParameterType::SRV: + case dxbc::RootParameterType::UAV: if (Version == 1) - Size += sizeof(dxbc::RTS0::v1::RootDescriptor); + Offset += sizeof(dxbc::RTS0::v1::RootDescriptor); else - Size += sizeof(dxbc::RTS0::v2::RootDescriptor); + Offset += sizeof(dxbc::RTS0::v2::RootDescriptor); break; - case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): + case dxbc::RootParameterType::DescriptorTable: const DescriptorTable &Table = ParametersContainer.getDescriptorTable(I.Location); // 4 bytes for the number of ranges in table and // 4 bytes for the ranges offset - Size += 2 * sizeof(uint32_t); + Offset += 2 * sizeof(uint32_t); if (Version == 1) - Size += sizeof(dxbc::RTS0::v1::DescriptorRange) * Table.Ranges.size(); + Offset += sizeof(dxbc::RTS0::v1::DescriptorRange) * Table.Ranges.size(); else - Size += sizeof(dxbc::RTS0::v2::DescriptorRange) * Table.Ranges.size(); + Offset += sizeof(dxbc::RTS0::v2::DescriptorRange) * Table.Ranges.size(); break; } } - return Size; + + return Offset; } void RootSignatureDesc::write(raw_ostream &OS) const { @@ -76,19 +89,13 @@ void RootSignatureDesc::write(raw_ostream &OS) const { support::endian::write(BOS, NumParameters, llvm::endianness::little); support::endian::write(BOS, RootParameterOffset, llvm::endianness::little); support::endian::write(BOS, NumSamplers, llvm::endianness::little); - uint32_t SSO = StaticSamplersOffset; - if (NumSamplers > 0) - SSO = 
writePlaceholder(BOS); - else - support::endian::write(BOS, SSO, llvm::endianness::little); + uint32_t SSO = writePlaceholder(BOS); support::endian::write(BOS, Flags, llvm::endianness::little); SmallVector<uint32_t> ParamsOffsets; - for (const RootParameterInfo &P : ParametersContainer) { - support::endian::write(BOS, P.Header.ParameterType, - llvm::endianness::little); - support::endian::write(BOS, P.Header.ShaderVisibility, - llvm::endianness::little); + for (const RootParameterInfo &I : ParametersContainer) { + support::endian::write(BOS, I.Type, llvm::endianness::little); + support::endian::write(BOS, I.Visibility, llvm::endianness::little); ParamsOffsets.push_back(writePlaceholder(BOS)); } @@ -96,11 +103,11 @@ void RootSignatureDesc::write(raw_ostream &OS) const { assert(NumParameters == ParamsOffsets.size()); for (size_t I = 0; I < NumParameters; ++I) { rewriteOffsetToCurrentByte(BOS, ParamsOffsets[I]); - const auto &[Type, Loc] = ParametersContainer.getTypeAndLocForParameter(I); - switch (Type) { - case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): { - const dxbc::RTS0::v1::RootConstants &Constants = - ParametersContainer.getConstant(Loc); + const RootParameterInfo &Info = ParametersContainer.getInfo(I); + switch (Info.Type) { + case dxbc::RootParameterType::Constants32Bit: { + const mcdxbc::RootConstants &Constants = + ParametersContainer.getConstant(Info.Location); support::endian::write(BOS, Constants.ShaderRegister, llvm::endianness::little); support::endian::write(BOS, Constants.RegisterSpace, @@ -109,11 +116,11 @@ void RootSignatureDesc::write(raw_ostream &OS) const { llvm::endianness::little); break; } - case llvm::to_underlying(dxbc::RootParameterType::CBV): - case llvm::to_underlying(dxbc::RootParameterType::SRV): - case llvm::to_underlying(dxbc::RootParameterType::UAV): { - const dxbc::RTS0::v2::RootDescriptor &Descriptor = - ParametersContainer.getRootDescriptor(Loc); + case dxbc::RootParameterType::CBV: + case 
dxbc::RootParameterType::SRV: + case dxbc::RootParameterType::UAV: { + const mcdxbc::RootDescriptor &Descriptor = + ParametersContainer.getRootDescriptor(Info.Location); support::endian::write(BOS, Descriptor.ShaderRegister, llvm::endianness::little); @@ -123,14 +130,15 @@ void RootSignatureDesc::write(raw_ostream &OS) const { support::endian::write(BOS, Descriptor.Flags, llvm::endianness::little); break; } - case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): { + case dxbc::RootParameterType::DescriptorTable: { const DescriptorTable &Table = - ParametersContainer.getDescriptorTable(Loc); + ParametersContainer.getDescriptorTable(Info.Location); support::endian::write(BOS, (uint32_t)Table.Ranges.size(), llvm::endianness::little); rewriteOffsetToCurrentByte(BOS, writePlaceholder(BOS)); for (const auto &Range : Table) { - support::endian::write(BOS, Range.RangeType, llvm::endianness::little); + support::endian::write(BOS, static_cast<uint32_t>(Range.RangeType), + llvm::endianness::little); support::endian::write(BOS, Range.NumDescriptors, llvm::endianness::little); support::endian::write(BOS, Range.BaseShaderRegister, @@ -146,23 +154,23 @@ void RootSignatureDesc::write(raw_ostream &OS) const { } } } - if (NumSamplers > 0) { - rewriteOffsetToCurrentByte(BOS, SSO); - for (const auto &S : StaticSamplers) { - support::endian::write(BOS, S.Filter, llvm::endianness::little); - support::endian::write(BOS, S.AddressU, llvm::endianness::little); - support::endian::write(BOS, S.AddressV, llvm::endianness::little); - support::endian::write(BOS, S.AddressW, llvm::endianness::little); - support::endian::write(BOS, S.MipLODBias, llvm::endianness::little); - support::endian::write(BOS, S.MaxAnisotropy, llvm::endianness::little); - support::endian::write(BOS, S.ComparisonFunc, llvm::endianness::little); - support::endian::write(BOS, S.BorderColor, llvm::endianness::little); - support::endian::write(BOS, S.MinLOD, llvm::endianness::little); - 
support::endian::write(BOS, S.MaxLOD, llvm::endianness::little); - support::endian::write(BOS, S.ShaderRegister, llvm::endianness::little); - support::endian::write(BOS, S.RegisterSpace, llvm::endianness::little); - support::endian::write(BOS, S.ShaderVisibility, llvm::endianness::little); - } + [[maybe_unused]] uint32_t Offset = rewriteOffsetToCurrentByte(BOS, SSO); + assert(Offset == computeStaticSamplersOffset() && + "Computed offset does not match written offset"); + for (const auto &S : StaticSamplers) { + support::endian::write(BOS, S.Filter, llvm::endianness::little); + support::endian::write(BOS, S.AddressU, llvm::endianness::little); + support::endian::write(BOS, S.AddressV, llvm::endianness::little); + support::endian::write(BOS, S.AddressW, llvm::endianness::little); + support::endian::write(BOS, S.MipLODBias, llvm::endianness::little); + support::endian::write(BOS, S.MaxAnisotropy, llvm::endianness::little); + support::endian::write(BOS, S.ComparisonFunc, llvm::endianness::little); + support::endian::write(BOS, S.BorderColor, llvm::endianness::little); + support::endian::write(BOS, S.MinLOD, llvm::endianness::little); + support::endian::write(BOS, S.MaxLOD, llvm::endianness::little); + support::endian::write(BOS, S.ShaderRegister, llvm::endianness::little); + support::endian::write(BOS, S.RegisterSpace, llvm::endianness::little); + support::endian::write(BOS, S.ShaderVisibility, llvm::endianness::little); } assert(Storage.size() == getSize()); OS.write(Storage.data(), Storage.size()); diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp index 684413e1e3a5..0429227f0fec 100644 --- a/llvm/lib/MC/MCDisassembler/Disassembler.cpp +++ b/llvm/lib/MC/MCDisassembler/Disassembler.cpp @@ -17,7 +17,6 @@ #include "llvm/MC/MCDisassembler/MCSymbolizer.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" 
#include "llvm/MC/MCSchedule.h" @@ -45,20 +44,23 @@ LLVMCreateDisasmCPUFeatures(const char *TT, const char *CPU, const char *Features, void *DisInfo, int TagType, LLVMOpInfoCallback GetOpInfo, LLVMSymbolLookupCallback SymbolLookUp) { + Triple TheTriple(TT); + // Get the target. std::string Error; - const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error); + const Target *TheTarget = TargetRegistry::lookupTarget(TheTriple, Error); if (!TheTarget) return nullptr; - std::unique_ptr<const MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TT)); + std::unique_ptr<const MCRegisterInfo> MRI( + TheTarget->createMCRegInfo(TheTriple)); if (!MRI) return nullptr; MCTargetOptions MCOptions; // Get the assembler info needed to setup the MCContext. std::unique_ptr<const MCAsmInfo> MAI( - TheTarget->createMCAsmInfo(*MRI, TT, MCOptions)); + TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions)); if (!MAI) return nullptr; @@ -67,13 +69,13 @@ LLVMCreateDisasmCPUFeatures(const char *TT, const char *CPU, return nullptr; std::unique_ptr<const MCSubtargetInfo> STI( - TheTarget->createMCSubtargetInfo(TT, CPU, Features)); + TheTarget->createMCSubtargetInfo(TheTriple, CPU, Features)); if (!STI) return nullptr; // Set up the MCContext for creating symbols and MCExpr's. 
std::unique_ptr<MCContext> Ctx( - new MCContext(Triple(TT), MAI.get(), MRI.get(), STI.get())); + new MCContext(TheTriple, MAI.get(), MRI.get(), STI.get())); if (!Ctx) return nullptr; @@ -84,12 +86,13 @@ LLVMCreateDisasmCPUFeatures(const char *TT, const char *CPU, return nullptr; std::unique_ptr<MCRelocationInfo> RelInfo( - TheTarget->createMCRelocationInfo(TT, *Ctx)); + TheTarget->createMCRelocationInfo(TheTriple, *Ctx)); if (!RelInfo) return nullptr; - std::unique_ptr<MCSymbolizer> Symbolizer(TheTarget->createMCSymbolizer( - TT, GetOpInfo, SymbolLookUp, DisInfo, Ctx.get(), std::move(RelInfo))); + std::unique_ptr<MCSymbolizer> Symbolizer( + TheTarget->createMCSymbolizer(TheTriple, GetOpInfo, SymbolLookUp, DisInfo, + Ctx.get(), std::move(RelInfo))); DisAsm->setSymbolizer(std::move(Symbolizer)); // Set up the instruction printer. diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp index 1718e2a4eb2d..8b228db0e8b3 100644 --- a/llvm/lib/MC/MCGOFFStreamer.cpp +++ b/llvm/lib/MC/MCGOFFStreamer.cpp @@ -45,3 +45,9 @@ MCStreamer *llvm::createGOFFStreamer(MCContext &Context, new MCGOFFStreamer(Context, std::move(MAB), std::move(OW), std::move(CE)); return S; } +llvm::MCGOFFStreamer::MCGOFFStreamer(MCContext &Context, + std::unique_ptr<MCAsmBackend> MAB, + std::unique_ptr<MCObjectWriter> OW, + std::unique_ptr<MCCodeEmitter> Emitter) + : MCObjectStreamer(Context, std::move(MAB), std::move(OW), + std::move(Emitter)) {} diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index d505ac6dd4bf..a0cd09b11d8d 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -849,6 +849,16 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { StackMapSection = Ctx->getCOFFSection(".llvm_stackmaps", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ); + + // Set IMAGE_SCN_MEM_DISCARDABLE so that lld will not truncate section name. 
+ PseudoProbeSection = Ctx->getCOFFSection( + ".pseudo_probe", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ); + PseudoProbeDescSection = Ctx->getCOFFSection( + ".pseudo_probe_desc", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ); } void MCObjectFileInfo::initSPIRVMCObjectFileInfo(const Triple &T) { @@ -1220,44 +1230,68 @@ MCObjectFileInfo::getKCFITrapSection(const MCSection &TextSec) const { MCSection * MCObjectFileInfo::getPseudoProbeSection(const MCSection &TextSec) const { - if (Ctx->getObjectFileType() != MCContext::IsELF) - return PseudoProbeSection; - - const auto &ElfSec = static_cast<const MCSectionELF &>(TextSec); - unsigned Flags = ELF::SHF_LINK_ORDER; - StringRef GroupName; - if (const MCSymbol *Group = ElfSec.getGroup()) { - GroupName = Group->getName(); - Flags |= ELF::SHF_GROUP; + auto ObjFileType = Ctx->getObjectFileType(); + if (ObjFileType == MCContext::IsELF) { + const auto &ElfSec = static_cast<const MCSectionELF &>(TextSec); + unsigned Flags = ELF::SHF_LINK_ORDER; + StringRef GroupName; + if (const MCSymbol *Group = ElfSec.getGroup()) { + GroupName = Group->getName(); + Flags |= ELF::SHF_GROUP; + } + return Ctx->getELFSection( + PseudoProbeSection->getName(), ELF::SHT_PROGBITS, Flags, 0, GroupName, + true, ElfSec.getUniqueID(), + static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol())); + } else if (ObjFileType == MCContext::IsCOFF) { + StringRef COMDATSymName = ""; + int Selection = 0; + unsigned Characteristics = + static_cast<MCSectionCOFF *>(PseudoProbeSection)->getCharacteristics(); + const auto &COFFSec = static_cast<const MCSectionCOFF &>(TextSec); + if (const MCSymbol *COMDATSym = COFFSec.getCOMDATSymbol()) { + // Associate .pseudo_probe to its function section. 
+ COMDATSymName = COMDATSym->getName(); + Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT; + Selection = COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE; + } + return Ctx->getCOFFSection(PseudoProbeSection->getName(), Characteristics, + COMDATSymName, Selection, COFFSec.getUniqueID()); } - return Ctx->getELFSection( - PseudoProbeSection->getName(), ELF::SHT_PROGBITS, Flags, 0, GroupName, - true, ElfSec.getUniqueID(), - static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol())); + return PseudoProbeSection; } MCSection * MCObjectFileInfo::getPseudoProbeDescSection(StringRef FuncName) const { - if (Ctx->getObjectFileType() == MCContext::IsELF) { - // Create a separate comdat group for each function's descriptor in order - // for the linker to deduplicate. The duplication, must be from different - // tranlation unit, can come from: - // 1. Inline functions defined in header files; - // 2. ThinLTO imported funcions; - // 3. Weak-linkage definitions. - // Use a concatenation of the section name and the function name as the - // group name so that descriptor-only groups won't be folded with groups of - // code. - if (Ctx->getTargetTriple().supportsCOMDAT() && !FuncName.empty()) { - auto *S = static_cast<MCSectionELF *>(PseudoProbeDescSection); - auto Flags = S->getFlags() | ELF::SHF_GROUP; - return Ctx->getELFSection(S->getName(), S->getType(), Flags, - S->getEntrySize(), - S->getName() + "_" + FuncName, - /*IsComdat=*/true); - } + if (!Ctx->getTargetTriple().supportsCOMDAT() || FuncName.empty()) + return PseudoProbeDescSection; + + // Create a separate comdat group for each function's descriptor in order + // for the linker to deduplicate. The duplication, must be from different + // tranlation unit, can come from: + // 1. Inline functions defined in header files; + // 2. ThinLTO imported funcions; + // 3. Weak-linkage definitions. 
+ // Use a concatenation of the section name and the function name as the + // group name so that descriptor-only groups won't be folded with groups of + // code. + auto ObjFileType = Ctx->getObjectFileType(); + if (ObjFileType == MCContext::IsELF) { + auto *S = static_cast<MCSectionELF *>(PseudoProbeDescSection); + auto Flags = S->getFlags() | ELF::SHF_GROUP; + return Ctx->getELFSection(S->getName(), S->getType(), Flags, + S->getEntrySize(), S->getName() + "_" + FuncName, + /*IsComdat=*/true); + } else if (ObjFileType == MCContext::IsCOFF) { + auto *S = static_cast<MCSectionCOFF *>(PseudoProbeDescSection); + unsigned Characteristics = + S->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT; + std::string COMDATSymName = (S->getName() + "_" + FuncName).str(); + return Ctx->getCOFFSection(S->getName(), Characteristics, COMDATSymName, + COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH); } + return PseudoProbeDescSection; } diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp index ee17b774e474..a0d6c80ab72e 100644 --- a/llvm/lib/MC/MCSFrame.cpp +++ b/llvm/lib/MC/MCSFrame.cpp @@ -21,6 +21,24 @@ using namespace sframe; namespace { +// High-level structure to track info needed to emit a +// sframe_frame_row_entry_addrX. On disk these have both a fixed portion of type +// sframe_frame_row_entry_addrX and trailing data of X * S bytes, where X is the +// datum size, and S is 1, 2, or 3 depending on which of CFA, SP, and FP are +// being tracked. +struct SFrameFRE { + // An FRE describes how to find the registers when the PC is at this + // Label from function start. + const MCSymbol *Label = nullptr; + size_t CFAOffset = 0; + size_t FPOffset = 0; + size_t RAOffset = 0; + bool FromFP = false; + bool CFARegSet = false; + + SFrameFRE(const MCSymbol *Start) : Label(Start) {} +}; + // High-level structure to track info needed to emit a sframe_func_desc_entry // and its associated FREs. 
struct SFrameFDE { @@ -28,6 +46,8 @@ struct SFrameFDE { const MCDwarfFrameInfo &DFrame; // Label where this FDE's FREs start. MCSymbol *FREStart; + // Unwinding fres + SmallVector<SFrameFRE> FREs; SFrameFDE(const MCDwarfFrameInfo &DF, MCSymbol *FRES) : DFrame(DF), FREStart(FRES) {} @@ -53,7 +73,8 @@ struct SFrameFDE { MCFixup::getDataKindForSize(4))); S.emitInt32(0); - // sfde_func_start_num_fres + // sfde_func_num_fres + // TODO: When we actually emit fres, replace 0 with FREs.size() S.emitInt32(0); // sfde_func_info word @@ -76,10 +97,90 @@ class SFrameEmitterImpl { MCObjectStreamer &Streamer; SmallVector<SFrameFDE> FDEs; ABI SFrameABI; + // Target-specific convenience variables to detect when a CFI instruction + // references these registers. Unlike in dwarf frame descriptions, they never + // escape into the sframe section itself. + unsigned SPReg; + unsigned FPReg; + unsigned RAReg; MCSymbol *FDESubSectionStart; MCSymbol *FRESubSectionStart; MCSymbol *FRESubSectionEnd; + bool setCFARegister(SFrameFRE &FRE, const MCCFIInstruction &I) { + if (I.getRegister() == SPReg) { + FRE.CFARegSet = true; + FRE.FromFP = false; + return true; + } + if (I.getRegister() == FPReg) { + FRE.CFARegSet = true; + FRE.FromFP = true; + return true; + } + Streamer.getContext().reportWarning( + I.getLoc(), "canonical Frame Address not in stack- or frame-pointer. " + "Omitting SFrame unwind info for this function"); + return false; + } + + bool setCFAOffset(SFrameFRE &FRE, const SMLoc &Loc, size_t Offset) { + if (!FRE.CFARegSet) { + Streamer.getContext().reportWarning( + Loc, "adjusting CFA offset without a base register. " + "Omitting SFrame unwind info for this function"); + return false; + } + FRE.CFAOffset = Offset; + return true; + } + + // Add the effects of CFI to the current FDE, creating a new FRE when + // necessary. 
+ bool handleCFI(SFrameFDE &FDE, SFrameFRE &FRE, const MCCFIInstruction &CFI) { + switch (CFI.getOperation()) { + case MCCFIInstruction::OpDefCfaRegister: + return setCFARegister(FRE, CFI); + case MCCFIInstruction::OpDefCfa: + case MCCFIInstruction::OpLLVMDefAspaceCfa: + if (!setCFARegister(FRE, CFI)) + return false; + return setCFAOffset(FRE, CFI.getLoc(), CFI.getOffset()); + case MCCFIInstruction::OpOffset: + if (CFI.getRegister() == FPReg) + FRE.FPOffset = CFI.getOffset(); + else if (CFI.getRegister() == RAReg) + FRE.RAOffset = CFI.getOffset(); + return true; + case MCCFIInstruction::OpRelOffset: + if (CFI.getRegister() == FPReg) + FRE.FPOffset += CFI.getOffset(); + else if (CFI.getRegister() == RAReg) + FRE.RAOffset += CFI.getOffset(); + return true; + case MCCFIInstruction::OpDefCfaOffset: + return setCFAOffset(FRE, CFI.getLoc(), CFI.getOffset()); + case MCCFIInstruction::OpAdjustCfaOffset: + return setCFAOffset(FRE, CFI.getLoc(), FRE.CFAOffset + CFI.getOffset()); + case MCCFIInstruction::OpRememberState: + // TODO: Implement. Will use FDE. + return true; + case MCCFIInstruction::OpRestore: + // TODO: Implement. Will use FDE. + return true; + case MCCFIInstruction::OpRestoreState: + // TODO: Implement. Will use FDE. + return true; + case MCCFIInstruction::OpEscape: + // TODO: Implement. Will use FDE. + return true; + default: + // Instructions that don't affect the CFA, RA, and SP can be safely + // ignored. + return true; + } + } + public: SFrameEmitterImpl(MCObjectStreamer &Streamer) : Streamer(Streamer) { assert(Streamer.getContext() @@ -88,13 +189,96 @@ public: .has_value()); FDEs.reserve(Streamer.getDwarfFrameInfos().size()); SFrameABI = *Streamer.getContext().getObjectFileInfo()->getSFrameABIArch(); + switch (SFrameABI) { + case ABI::AArch64EndianBig: + case ABI::AArch64EndianLittle: + SPReg = 31; + RAReg = 29; + FPReg = 30; + break; + case ABI::AMD64EndianLittle: + SPReg = 7; + // RARegister untracked in this abi. 
Value chosen to match + // MCDwarfFrameInfo constructor. + RAReg = static_cast<unsigned>(INT_MAX); + FPReg = 6; + break; + } + FDESubSectionStart = Streamer.getContext().createTempSymbol(); FRESubSectionStart = Streamer.getContext().createTempSymbol(); FRESubSectionEnd = Streamer.getContext().createTempSymbol(); } - void BuildSFDE(const MCDwarfFrameInfo &DF) { - FDEs.emplace_back(DF, Streamer.getContext().createTempSymbol()); + bool atSameLocation(const MCSymbol *Left, const MCSymbol *Right) { + return Left != nullptr && Right != nullptr && + Left->getFragment() == Right->getFragment() && + Left->getOffset() == Right->getOffset(); + } + + bool equalIgnoringLocation(const SFrameFRE &Left, const SFrameFRE &Right) { + return Left.CFAOffset == Right.CFAOffset && + Left.FPOffset == Right.FPOffset && Left.RAOffset == Right.RAOffset && + Left.FromFP == Right.FromFP && Left.CFARegSet == Right.CFARegSet; + } + + void buildSFDE(const MCDwarfFrameInfo &DF) { + bool Valid = true; + SFrameFDE FDE(DF, Streamer.getContext().createTempSymbol()); + // This would have been set via ".cfi_return_column", but + // MCObjectStreamer doesn't emit an MCCFIInstruction for that. It just + // sets the DF.RAReg. + // FIXME: This also prevents providing a proper location for the error. + // LLVM doesn't change the return column itself, so this was + // hand-written assembly. + if (DF.RAReg != RAReg) { + Streamer.getContext().reportWarning( + SMLoc(), "non-default RA register in .cfi_return_column " + + Twine(DF.RAReg) + + ". 
Omitting SFrame unwind info for this function"); + Valid = false; + } + MCSymbol *LastLabel = DF.Begin; + SFrameFRE BaseFRE(LastLabel); + if (!DF.IsSimple) { + for (const auto &CFI : + Streamer.getContext().getAsmInfo()->getInitialFrameState()) + if (!handleCFI(FDE, BaseFRE, CFI)) + Valid = false; + } + FDE.FREs.push_back(BaseFRE); + + for (const auto &CFI : DF.Instructions) { + // Instructions from InitialFrameState may not have a label, but if these + // instructions don't, then they are in dead code or otherwise unused. + // TODO: This check follows MCDwarf.cpp + // FrameEmitterImplementation::emitCFIInstructions, but nothing in the + // testsuite triggers it. We should see if it can be removed in both + // places, or alternately, add a test to exercise it. + auto *L = CFI.getLabel(); + if (L && !L->isDefined()) + continue; + + SFrameFRE FRE = FDE.FREs.back(); + if (!handleCFI(FDE, FRE, CFI)) + Valid = false; + + // If nothing relevant but the location changed, don't add the FRE. + if (equalIgnoringLocation(FRE, FDE.FREs.back())) + continue; + + // If the location stayed the same, then update the current + // row. Otherwise, add a new one. + if (atSameLocation(LastLabel, L)) + FDE.FREs.back() = FRE; + else { + FDE.FREs.push_back(FRE); + FDE.FREs.back().Label = L; + LastLabel = L; + } + } + if (Valid) + FDEs.push_back(FDE); } void emitPreamble() { @@ -116,7 +300,9 @@ public: // shf_num_fdes Streamer.emitInt32(FDEs.size()); // shf_num_fres - Streamer.emitInt32(0); + uint32_t TotalFREs = 0; + Streamer.emitInt32(TotalFREs); + // shf_fre_len Streamer.emitAbsoluteSymbolDiff(FRESubSectionEnd, FRESubSectionStart, sizeof(int32_t)); @@ -161,7 +347,7 @@ void MCSFrameEmitter::emit(MCObjectStreamer &Streamer) { // Both the header itself and the FDEs include various offsets and counts. // Therefore, all of this must be precomputed. 
for (const auto &DFrame : FrameArray) - Emitter.BuildSFDE(DFrame); + Emitter.buildSFDE(DFrame); MCSection *Section = Context.getObjectFileInfo()->getSFrameSection(); // Not strictly necessary, but gas always aligns to 8, so match that. diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index a87648afde7d..8111ccb8bc69 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -22,6 +22,7 @@ class MCSection; /// MCExpr that represents the epilog unwind code in an unwind table. class MCUnwindV2EpilogTargetExpr final : public MCTargetExpr { + const MCSymbol *Function; const MCSymbol *FunctionEnd; const MCSymbol *UnwindV2Start; const MCSymbol *EpilogEnd; @@ -31,7 +32,7 @@ class MCUnwindV2EpilogTargetExpr final : public MCTargetExpr { MCUnwindV2EpilogTargetExpr(const WinEH::FrameInfo &FrameInfo, const WinEH::FrameInfo::Epilog &Epilog, uint8_t EpilogSize_) - : FunctionEnd(FrameInfo.FuncletOrFuncEnd), + : Function(FrameInfo.Function), FunctionEnd(FrameInfo.FuncletOrFuncEnd), UnwindV2Start(Epilog.UnwindV2Start), EpilogEnd(Epilog.End), EpilogSize(EpilogSize_), Loc(Epilog.Loc) {} @@ -253,13 +254,15 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { OS->getAssembler(), LastEpilog.End, LastEpilog.UnwindV2Start); if (!MaybeSize) { context.reportError(LastEpilog.Loc, - "Failed to evaluate epilog size for Unwind v2"); + "Failed to evaluate epilog size for Unwind v2 in " + + info->Function->getName()); return; } assert(*MaybeSize >= 0); if (*MaybeSize >= (int64_t)UINT8_MAX) { context.reportError(LastEpilog.Loc, - "Epilog size is too large for Unwind v2"); + "Epilog size is too large for Unwind v2 in " + + info->Function->getName()); return; } EpilogSize = *MaybeSize + 1; @@ -282,7 +285,8 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // Too many epilogs to handle. 
if ((size_t)numCodes + numEpilogCodes > UINT8_MAX) { context.reportError(info->FunctionLoc, - "Too many unwind codes with Unwind v2 enabled"); + "Too many unwind codes with Unwind v2 enabled in " + + info->Function->getName()); return; } @@ -383,14 +387,16 @@ bool MCUnwindV2EpilogTargetExpr::evaluateAsRelocatableImpl( auto Offset = GetOptionalAbsDifference(*Asm, FunctionEnd, UnwindV2Start); if (!Offset) { Asm->getContext().reportError( - Loc, "Failed to evaluate epilog offset for Unwind v2"); + Loc, "Failed to evaluate epilog offset for Unwind v2 in " + + Function->getName()); return false; } assert(*Offset > 0); constexpr uint16_t MaxEpilogOffset = 0x0fff; if (*Offset > MaxEpilogOffset) { - Asm->getContext().reportError(Loc, - "Epilog offset is too large for Unwind v2"); + Asm->getContext().reportError( + Loc, + "Epilog offset is too large for Unwind v2 in " + Function->getName()); return false; } @@ -398,8 +404,8 @@ bool MCUnwindV2EpilogTargetExpr::evaluateAsRelocatableImpl( auto Size = GetOptionalAbsDifference(*Asm, EpilogEnd, UnwindV2Start); if (Size != (EpilogSize - 1)) { Asm->getContext().reportError( - Loc, - "Size of this epilog does not match size of last epilog in function"); + Loc, "Size of this epilog does not match size of last epilog in " + + Function->getName()); return false; } diff --git a/llvm/lib/ObjCopy/COFF/COFFObject.cpp b/llvm/lib/ObjCopy/COFF/COFFObject.cpp index 5fa13391c908..91cf7e32a739 100644 --- a/llvm/lib/ObjCopy/COFF/COFFObject.cpp +++ b/llvm/lib/ObjCopy/COFF/COFFObject.cpp @@ -18,6 +18,8 @@ using namespace object; void Object::addSymbols(ArrayRef<Symbol> NewSymbols) { for (Symbol S : NewSymbols) { S.UniqueId = NextSymbolUniqueId++; + S.OriginalRawIndex = NextSymbolOriginalIndex; + NextSymbolOriginalIndex += 1 + S.Sym.NumberOfAuxSymbols; Symbols.emplace_back(S); } updateSymbols(); diff --git a/llvm/lib/ObjCopy/COFF/COFFObject.h b/llvm/lib/ObjCopy/COFF/COFFObject.h index cdd1f17fc605..6b70add1bb1b 100644 --- 
a/llvm/lib/ObjCopy/COFF/COFFObject.h +++ b/llvm/lib/ObjCopy/COFF/COFFObject.h @@ -89,6 +89,7 @@ struct Symbol { std::optional<size_t> WeakTargetSymbolId; size_t UniqueId; size_t RawIndex; + size_t OriginalRawIndex; bool Referenced; }; @@ -140,6 +141,7 @@ private: DenseMap<size_t, Symbol *> SymbolMap; size_t NextSymbolUniqueId = 0; + size_t NextSymbolOriginalIndex = 0; std::vector<Section> Sections; DenseMap<ssize_t, Section *> SectionMap; diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.cpp b/llvm/lib/ObjCopy/COFF/COFFWriter.cpp index 350c4aec572c..fed67d67f13a 100644 --- a/llvm/lib/ObjCopy/COFF/COFFWriter.cpp +++ b/llvm/lib/ObjCopy/COFF/COFFWriter.cpp @@ -12,6 +12,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/Object/COFF.h" +#include "llvm/Support/CRC.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" #include <cstddef> @@ -92,6 +94,77 @@ Error COFFWriter::finalizeSymbolContents() { return Error::success(); } +Error COFFWriter::finalizeSymIdxContents() { + // CFGuards shouldn't be present in PE. + if (Obj.IsPE) + return Error::success(); + + // Currently handle only sections consisting only of .symidx. + // TODO: other sections such as .impcall and .hybmp$x require more complex + // handling as they have more complex layout. + auto IsSymIdxSection = [](StringRef Name) { + return Name == ".gljmp$y" || Name == ".giats$y" || Name == ".gfids$y" || + Name == ".gehcont$y"; + }; + + DenseMap<size_t, size_t> SymIdMap; + SmallDenseMap<ssize_t, coff_aux_section_definition *, 4> SecIdMap; + for (Symbol &Sym : Obj.getMutableSymbols()) { + SymIdMap[Sym.OriginalRawIndex] = Sym.RawIndex; + + // We collect only definition symbols of the sections to update the + // checksums. 
+ if (Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC && + Sym.Sym.NumberOfAuxSymbols == 1 && Sym.Sym.Value == 0 && + IsSymIdxSection(Sym.Name)) + SecIdMap[Sym.TargetSectionId] = + reinterpret_cast<coff_aux_section_definition *>( + Sym.AuxData[0].Opaque); + } + + for (Section &Sec : Obj.getMutableSections()) { + if (!IsSymIdxSection(Sec.Name)) + continue; + + ArrayRef<uint8_t> RawIds = Sec.getContents(); + // Nothing to do and also the checksum will be -1 instead of 0 if we + // recalculate it on empty input. + if (RawIds.size() == 0) + continue; + + auto SecDefIt = SecIdMap.find(Sec.UniqueId); + if (SecDefIt == SecIdMap.end()) + return createStringError(object_error::invalid_symbol_index, + "section '%s' does not have the corresponding " + "symbol or the symbol has unexpected format", + Sec.Name.str().c_str()); + + // Create updated content. + ArrayRef<support::ulittle32_t> Ids( + reinterpret_cast<const support::ulittle32_t *>(RawIds.data()), + RawIds.size() / 4); + std::vector<support::ulittle32_t> NewIds; + for (support::ulittle32_t Id : Ids) { + auto SymIdIt = SymIdMap.find(Id); + if (SymIdIt == SymIdMap.end()) + return createStringError(object_error::invalid_symbol_index, + "section '%s' contains a .symidx (%d) that is " + "incorrect or was stripped", + Sec.Name.str().c_str(), Id.value()); + NewIds.push_back(support::ulittle32_t(SymIdIt->getSecond())); + } + ArrayRef<uint8_t> NewRawIds(reinterpret_cast<uint8_t *>(NewIds.data()), + RawIds.size()); + // Update the checksum. + JamCRC JC(/*Init=*/0); + JC.update(NewRawIds); + SecDefIt->getSecond()->CheckSum = JC.getCRC(); + // Set new content. 
+ Sec.setOwnedContents(NewRawIds.vec()); + } + return Error::success(); +} + void COFFWriter::layoutSections() { for (auto &S : Obj.getMutableSections()) { if (S.Header.SizeOfRawData > 0) @@ -183,6 +256,8 @@ Error COFFWriter::finalize(bool IsBigObj) { return E; if (Error E = finalizeSymbolContents()) return E; + if (Error E = finalizeSymIdxContents()) + return E; size_t SizeOfHeaders = 0; FileAlignment = 1; diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h index b7dca69e9a81..66d7f01c87f1 100644 --- a/llvm/lib/ObjCopy/COFF/COFFWriter.h +++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h @@ -34,6 +34,7 @@ class COFFWriter { template <class SymbolTy> std::pair<size_t, size_t> finalizeSymbolTable(); Error finalizeRelocTargets(); Error finalizeSymbolContents(); + Error finalizeSymIdxContents(); void layoutSections(); Expected<size_t> finalizeStringTable(); diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp index 2b17d728aad3..eef8a2190c4d 100644 --- a/llvm/lib/ObjCopy/ConfigManager.cpp +++ b/llvm/lib/ObjCopy/ConfigManager.cpp @@ -13,6 +13,13 @@ using namespace llvm; using namespace llvm::objcopy; +Expected<const ELFConfig &> ConfigManager::getELFConfig() const { + if (!Common.ExtractSection.empty()) + return createStringError(llvm::errc::invalid_argument, + "option is not supported for ELF"); + return ELF; +} + Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const { if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || !Common.SymbolsPrefixRemove.empty() || !Common.SymbolsToSkip.empty() || @@ -27,7 +34,7 @@ Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const { Common.DiscardMode == DiscardType::Locals || !Common.SymbolsToAdd.empty() || Common.GapFill != 0 || Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 || - !Common.ChangeSectionAddress.empty()) + !Common.ChangeSectionAddress.empty() || !Common.ExtractSection.empty()) return 
createStringError(llvm::errc::invalid_argument, "option is not supported for COFF"); @@ -48,7 +55,7 @@ Expected<const MachOConfig &> ConfigManager::getMachOConfig() const { Common.DiscardMode == DiscardType::Locals || !Common.SymbolsToAdd.empty() || Common.GapFill != 0 || Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 || - !Common.ChangeSectionAddress.empty()) + !Common.ChangeSectionAddress.empty() || !Common.ExtractSection.empty()) return createStringError(llvm::errc::invalid_argument, "option is not supported for MachO"); @@ -69,7 +76,7 @@ Expected<const WasmConfig &> ConfigManager::getWasmConfig() const { !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() || !Common.SymbolsToRename.empty() || Common.GapFill != 0 || Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 || - !Common.ChangeSectionAddress.empty()) + !Common.ChangeSectionAddress.empty() || !Common.ExtractSection.empty()) return createStringError(llvm::errc::invalid_argument, "only flags for section dumping, removal, and " "addition are supported"); @@ -99,7 +106,7 @@ Expected<const XCOFFConfig &> ConfigManager::getXCOFFConfig() const { Common.Weaken || Common.StripUnneeded || Common.DecompressDebugSections || Common.GapFill != 0 || Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 || - !Common.ChangeSectionAddress.empty()) { + !Common.ChangeSectionAddress.empty() || !Common.ExtractSection.empty()) { return createStringError( llvm::errc::invalid_argument, "no flags are supported yet, only basic copying is allowed"); @@ -116,18 +123,16 @@ ConfigManager::getDXContainerConfig() const { !Common.AllocSectionsPrefix.empty() || Common.DiscardMode != DiscardType::None || !Common.AddSection.empty() || !Common.DumpSection.empty() || !Common.KeepSection.empty() || - !Common.OnlySection.empty() || !Common.SectionsToRename.empty() || - !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() || - !Common.SetSectionType.empty() || Common.ExtractDWO || - 
Common.OnlyKeepDebug || Common.StripAllGNU || Common.StripDWO || - Common.StripDebug || Common.StripNonAlloc || Common.StripSections || - Common.StripUnneeded || Common.DecompressDebugSections || - Common.GapFill != 0 || Common.PadTo != 0 || - Common.ChangeSectionLMAValAll != 0 || + !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || + !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() || + Common.ExtractDWO || Common.OnlyKeepDebug || Common.StripAllGNU || + Common.StripDWO || Common.StripDebug || Common.StripNonAlloc || + Common.StripSections || Common.StripUnneeded || + Common.DecompressDebugSections || Common.GapFill != 0 || + Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 || !Common.ChangeSectionAddress.empty()) { - return createStringError( - llvm::errc::invalid_argument, - "no flags are supported yet, only basic copying is allowed"); + return createStringError(llvm::errc::invalid_argument, + "option is not supported for DXContainer"); } return DXContainer; } diff --git a/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp b/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp index 375e382ddb04..d7f3c0d1f7b3 100644 --- a/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp +++ b/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp @@ -11,6 +11,7 @@ #include "DXContainerWriter.h" #include "llvm/ObjCopy/CommonConfig.h" #include "llvm/ObjCopy/DXContainer/DXContainerConfig.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { namespace objcopy { @@ -18,7 +19,40 @@ namespace dxbc { using namespace object; +static Error extractPartAsObject(StringRef PartName, StringRef OutFilename, + StringRef InputFilename, const Object &Obj) { + for (const Part &P : Obj.Parts) + if (P.Name == PartName) { + Object PartObj; + PartObj.Header = Obj.Header; + PartObj.Parts.push_back({P.Name, P.Data}); + PartObj.recomputeHeader(); + + auto Write = [&OutFilename, &PartObj](raw_ostream &Out) -> Error { + DXContainerWriter Writer(PartObj, 
Out); + if (Error E = Writer.write()) + return createFileError(OutFilename, std::move(E)); + return Error::success(); + }; + + return writeToOutput(OutFilename, Write); + } + + return createFileError(InputFilename, object_error::parse_failed, + "part '%s' not found", PartName.str().c_str()); +} + static Error handleArgs(const CommonConfig &Config, Object &Obj) { + // Extract all sections before any modifications. + for (StringRef Flag : Config.ExtractSection) { + StringRef SectionName; + StringRef FileName; + std::tie(SectionName, FileName) = Flag.split('='); + if (Error E = extractPartAsObject(SectionName, FileName, + Config.InputFilename, Obj)) + return E; + } + std::function<bool(const Part &)> RemovePred = [](const Part &) { return false; }; @@ -28,6 +62,13 @@ static Error handleArgs(const CommonConfig &Config, Object &Obj) { return Config.ToRemove.matches(P.Name); }; + if (!Config.OnlySection.empty()) + RemovePred = [&Config](const Part &P) { + // Explicitly keep these sections regardless of previous removes and + // remove everything else. 
+ return !Config.OnlySection.matches(P.Name); + }; + if (auto E = Obj.removeParts(RemovePred)) return E; diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp index e5de17e093df..78b674c5fa34 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp @@ -1307,6 +1307,9 @@ Error BasicELFBuilder::initSections() { return Error::success(); } +BasicELFBuilder::BasicELFBuilder() : Obj(std::make_unique<Object>()) {} +BasicELFBuilder::~BasicELFBuilder() = default; + void BinaryELFBuilder::addData(SymbolTableSection *SymTab) { auto Data = ArrayRef<uint8_t>( reinterpret_cast<const uint8_t *>(MemBuf->getBufferStart()), diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index d8f79a4b1a3c..7ec0e9be3dda 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -1059,7 +1059,8 @@ protected: Error initSections(); public: - BasicELFBuilder() : Obj(std::make_unique<Object>()) {} + BasicELFBuilder(); + ~BasicELFBuilder(); }; class BinaryELFBuilder : public BasicELFBuilder { diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 788c6020a7f9..53699ce0d4fc 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -847,7 +847,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, if (!FeatEnableOrErr) return FeatEnableOrErr.takeError(); FeatEnable = *FeatEnableOrErr; - if (FeatEnable.CallsiteOffsets && Version < 3) + if (FeatEnable.CallsiteEndOffsets && Version < 3) return createError("version should be >= 3 for SHT_LLVM_BB_ADDR_MAP when " "callsite offsets feature is enabled: version = " + Twine(static_cast<int>(Version)) + @@ -890,22 +890,22 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, uint32_t ID = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); uint32_t Offset = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); // Read the callsite offsets. 
- uint32_t LastCallsiteOffset = 0; - SmallVector<uint32_t, 1> CallsiteOffsets; - if (FeatEnable.CallsiteOffsets) { + uint32_t LastCallsiteEndOffset = 0; + SmallVector<uint32_t, 1> CallsiteEndOffsets; + if (FeatEnable.CallsiteEndOffsets) { uint32_t NumCallsites = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); - CallsiteOffsets.reserve(NumCallsites); + CallsiteEndOffsets.reserve(NumCallsites); for (uint32_t CallsiteIndex = 0; !ULEBSizeErr && Cur && (CallsiteIndex < NumCallsites); ++CallsiteIndex) { - LastCallsiteOffset += + LastCallsiteEndOffset += readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); - CallsiteOffsets.push_back(LastCallsiteOffset); + CallsiteEndOffsets.push_back(LastCallsiteEndOffset); } } uint32_t Size = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) + - LastCallsiteOffset; + LastCallsiteEndOffset; uint32_t MD = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); Expected<BBAddrMap::BBEntry::Metadata> MetadataOrErr = BBAddrMap::BBEntry::Metadata::decode(MD); @@ -914,7 +914,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, break; } BBEntries.push_back({ID, Offset + PrevBBEndOffset, Size, - *MetadataOrErr, CallsiteOffsets}); + *MetadataOrErr, CallsiteEndOffsets}); PrevBBEndOffset += Offset + Size; } TotalNumBlocks += BBEntries.size(); diff --git a/llvm/lib/Object/ModuleSymbolTable.cpp b/llvm/lib/Object/ModuleSymbolTable.cpp index 170677291277..9442becdb7d3 100644 --- a/llvm/lib/Object/ModuleSymbolTable.cpp +++ b/llvm/lib/Object/ModuleSymbolTable.cpp @@ -81,17 +81,16 @@ initializeRecordStreamer(const Module &M, const Target *T = TargetRegistry::lookupTarget(TT, Err); assert(T && T->hasMCAsmParser()); - std::unique_ptr<MCRegisterInfo> MRI(T->createMCRegInfo(TT.str())); + std::unique_ptr<MCRegisterInfo> MRI(T->createMCRegInfo(TT)); if (!MRI) return; MCTargetOptions MCOptions; - std::unique_ptr<MCAsmInfo> MAI(T->createMCAsmInfo(*MRI, TT.str(), MCOptions)); + std::unique_ptr<MCAsmInfo> MAI(T->createMCAsmInfo(*MRI, TT, MCOptions)); if (!MAI) return; - 
std::unique_ptr<MCSubtargetInfo> STI( - T->createMCSubtargetInfo(TT.str(), "", "")); + std::unique_ptr<MCSubtargetInfo> STI(T->createMCSubtargetInfo(TT, "", "")); if (!STI) return; diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 043b575a43b1..1078b1188bb6 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -38,7 +38,7 @@ private: Error validateSize(uint32_t Computed); void writeHeader(raw_ostream &OS); - void writeParts(raw_ostream &OS); + Error writeParts(raw_ostream &OS); }; } // namespace @@ -107,7 +107,7 @@ void DXContainerWriter::writeHeader(raw_ostream &OS) { Offsets.size() * sizeof(uint32_t)); } -void DXContainerWriter::writeParts(raw_ostream &OS) { +Error DXContainerWriter::writeParts(raw_ostream &OS) { uint32_t RollingOffset = sizeof(dxbc::Header) + (ObjectFile.Header.PartCount * sizeof(uint32_t)); for (auto I : llvm::zip(ObjectFile.Parts, *ObjectFile.Header.PartOffsets)) { @@ -269,65 +269,68 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { mcdxbc::RootSignatureDesc RS; RS.Flags = P.RootSignature->getEncodedFlags(); RS.Version = P.RootSignature->Version; - RS.RootParameterOffset = P.RootSignature->RootParametersOffset; RS.NumStaticSamplers = P.RootSignature->NumStaticSamplers; - RS.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; for (DXContainerYAML::RootParameterLocationYaml &L : P.RootSignature->Parameters.Locations) { - dxbc::RTS0::v1::RootParameterHeader Header{L.Header.Type, L.Header.Visibility, - L.Header.Offset}; - switch (L.Header.Type) { - case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): { + assert(dxbc::isValidParameterType(L.Header.Type) && + "invalid DXContainer YAML"); + assert(dxbc::isValidShaderVisibility(L.Header.Visibility) && + "invalid DXContainer YAML"); + dxbc::RootParameterType Type = dxbc::RootParameterType(L.Header.Type); + dxbc::ShaderVisibility Visibility = + 
dxbc::ShaderVisibility(L.Header.Visibility); + + switch (Type) { + case dxbc::RootParameterType::Constants32Bit: { const DXContainerYAML::RootConstantsYaml &ConstantYaml = P.RootSignature->Parameters.getOrInsertConstants(L); - dxbc::RTS0::v1::RootConstants Constants; + mcdxbc::RootConstants Constants; + Constants.Num32BitValues = ConstantYaml.Num32BitValues; Constants.RegisterSpace = ConstantYaml.RegisterSpace; Constants.ShaderRegister = ConstantYaml.ShaderRegister; - RS.ParametersContainer.addParameter(Header, Constants); + RS.ParametersContainer.addParameter(Type, Visibility, Constants); break; } - case llvm::to_underlying(dxbc::RootParameterType::CBV): - case llvm::to_underlying(dxbc::RootParameterType::SRV): - case llvm::to_underlying(dxbc::RootParameterType::UAV): { + case dxbc::RootParameterType::CBV: + case dxbc::RootParameterType::SRV: + case dxbc::RootParameterType::UAV: { const DXContainerYAML::RootDescriptorYaml &DescriptorYaml = P.RootSignature->Parameters.getOrInsertDescriptor(L); - dxbc::RTS0::v2::RootDescriptor Descriptor; + mcdxbc::RootDescriptor Descriptor; Descriptor.RegisterSpace = DescriptorYaml.RegisterSpace; Descriptor.ShaderRegister = DescriptorYaml.ShaderRegister; if (RS.Version > 1) Descriptor.Flags = DescriptorYaml.getEncodedFlags(); - RS.ParametersContainer.addParameter(Header, Descriptor); + RS.ParametersContainer.addParameter(Type, Visibility, Descriptor); break; } - case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): { + case dxbc::RootParameterType::DescriptorTable: { const DXContainerYAML::DescriptorTableYaml &TableYaml = P.RootSignature->Parameters.getOrInsertTable(L); mcdxbc::DescriptorTable Table; for (const auto &R : TableYaml.Ranges) { - - dxbc::RTS0::v2::DescriptorRange Range; - Range.RangeType = R.RangeType; + assert(dxbc::isValidRangeType(R.RangeType) && + "Invalid Descriptor Range Type"); + mcdxbc::DescriptorRange Range; + Range.RangeType = dxil::ResourceClass(R.RangeType); Range.NumDescriptors = 
R.NumDescriptors; Range.BaseShaderRegister = R.BaseShaderRegister; Range.RegisterSpace = R.RegisterSpace; Range.OffsetInDescriptorsFromTableStart = R.OffsetInDescriptorsFromTableStart; + if (RS.Version > 1) Range.Flags = R.getEncodedFlags(); + Table.Ranges.push_back(Range); } - RS.ParametersContainer.addParameter(Header, Table); + RS.ParametersContainer.addParameter(Type, Visibility, Table); break; } - default: - // Handling invalid parameter type edge case. We intentionally let - // obj2yaml/yaml2obj parse and emit invalid dxcontainer data, in order - // for that to be used as a testing tool more effectively. - RS.ParametersContainer.addInvalidParameter(Header); } } @@ -350,6 +353,27 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { RS.StaticSamplers.push_back(NewSampler); } + // Handling of offsets + RS.RootParameterOffset = RS.computeRootParametersOffset(); + if (P.RootSignature->RootParametersOffset && + P.RootSignature->RootParametersOffset.value() != + RS.RootParameterOffset) { + return createStringError( + errc::invalid_argument, + "Specified RootParametersOffset does not match required value: %d.", + RS.RootParameterOffset); + } + + RS.StaticSamplersOffset = RS.computeStaticSamplersOffset(); + if (P.RootSignature->StaticSamplersOffset && + P.RootSignature->StaticSamplersOffset.value() != + RS.StaticSamplersOffset) { + return createStringError( + errc::invalid_argument, + "Specified StaticSamplersOffset does not match computed value: %d.", + RS.StaticSamplersOffset); + } + RS.write(OS); break; } @@ -359,14 +383,15 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { OS.write_zeros(PartSize - BytesWritten); RollingOffset += PartSize; } + + return Error::success(); } Error DXContainerWriter::write(raw_ostream &OS) { if (Error Err = computePartOffsets()) return Err; writeHeader(OS); - writeParts(OS); - return Error::success(); + return writeParts(OS); } namespace llvm { diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp 
b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 263f7bdf37bc..32b502ed4e21 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -376,9 +376,9 @@ void MappingTraits<DXContainerYAML::RootSignatureYamlDesc>::mapping( IO &IO, DXContainerYAML::RootSignatureYamlDesc &S) { IO.mapRequired("Version", S.Version); IO.mapRequired("NumRootParameters", S.NumRootParameters); - IO.mapRequired("RootParametersOffset", S.RootParametersOffset); + IO.mapOptional("RootParametersOffset", S.RootParametersOffset, std::nullopt); IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers); - IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset); + IO.mapOptional("StaticSamplersOffset", S.StaticSamplersOffset, std::nullopt); IO.mapRequired("Parameters", S.Parameters.Locations, S); IO.mapOptional("Samplers", S.StaticSamplers); #define ROOT_SIGNATURE_FLAG(Num, Val) IO.mapOptional(#Val, S.Val, false); diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index bc5c68d08d11..2386a2e3e6c4 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1487,8 +1487,8 @@ void ELFState<ELFT>::writeSectionContent( if (!E.BBRanges) continue; uint64_t TotalNumBlocks = 0; - bool EmitCallsiteOffsets = - FeatureOrErr->CallsiteOffsets || E.hasAnyCallsiteOffsets(); + bool EmitCallsiteEndOffsets = + FeatureOrErr->CallsiteEndOffsets || E.hasAnyCallsiteEndOffsets(); for (const ELFYAML::BBAddrMapEntry::BBRangeEntry &BBR : *E.BBRanges) { // Write the base address of the range. CBA.write<uintX_t>(BBR.BaseAddress, ELFT::Endianness); @@ -1506,12 +1506,12 @@ void ELFState<ELFT>::writeSectionContent( if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP && E.Version > 1) SHeader.sh_size += CBA.writeULEB128(BBE.ID); SHeader.sh_size += CBA.writeULEB128(BBE.AddressOffset); - if (EmitCallsiteOffsets) { - size_t NumCallsiteOffsets = - BBE.CallsiteOffsets ? 
BBE.CallsiteOffsets->size() : 0; - SHeader.sh_size += CBA.writeULEB128(NumCallsiteOffsets); - if (BBE.CallsiteOffsets) { - for (uint32_t Offset : *BBE.CallsiteOffsets) + if (EmitCallsiteEndOffsets) { + size_t NumCallsiteEndOffsets = + BBE.CallsiteEndOffsets ? BBE.CallsiteEndOffsets->size() : 0; + SHeader.sh_size += CBA.writeULEB128(NumCallsiteEndOffsets); + if (BBE.CallsiteEndOffsets) { + for (uint32_t Offset : *BBE.CallsiteEndOffsets) SHeader.sh_size += CBA.writeULEB128(Offset); } } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index c27339de67ef..62d80a24f478 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1884,7 +1884,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry::BBEntry>::mapping( IO.mapRequired("AddressOffset", E.AddressOffset); IO.mapRequired("Size", E.Size); IO.mapRequired("Metadata", E.Metadata); - IO.mapOptional("CallsiteOffsets", E.CallsiteOffsets); + IO.mapOptional("CallsiteEndOffsets", E.CallsiteEndOffsets); } void MappingTraits<ELFYAML::PGOAnalysisMapEntry>::mapping( diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index b7edeea08276..8cf277657a54 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -69,6 +69,7 @@ #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolutionDivision.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/StackLifetime.h" #include "llvm/Analysis/StackSafetyAnalysis.h" @@ -184,6 +185,7 @@ #include "llvm/IR/Verifier.h" #include "llvm/IRPrinter/IRPrintingPasses.h" #include "llvm/Passes/OptimizationLevel.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -1491,6 +1493,27 @@ parseBoundsCheckingOptions(StringRef Params) { return Options; } 
+Expected<CodeGenOptLevel> parseExpandFpOptions(StringRef Param) { + if (Param.empty()) + return CodeGenOptLevel::None; + + // Parse a CodeGenOptLevel, e.g. "O1", "O2", "O3". + auto [Prefix, Digit] = Param.split('O'); + + uint8_t N; + if (!Prefix.empty() || Digit.getAsInteger(10, N)) + return createStringError("invalid expand-fp pass parameter '%s'", + Param.str().c_str()); + + std::optional<CodeGenOptLevel> Level = CodeGenOpt::getLevel(N); + if (!Level.has_value()) + return createStringError( + "invalid optimization level for expand-fp pass: %s", + Digit.str().c_str()); + + return *Level; +} + Expected<RAGreedyPass::Options> parseRegAllocGreedyFilterFunc(PassBuilder &PB, StringRef Params) { if (Params.empty() || Params == "all") diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 1b111dc20d35..1d015971dfbd 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -426,7 +426,6 @@ FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter()) FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM)) FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM)) -FUNCTION_PASS("expand-fp", ExpandFpPass(TM)) FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM)) FUNCTION_PASS("expand-reductions", ExpandReductionsPass()) FUNCTION_PASS("extra-vector-passes", @@ -518,6 +517,7 @@ FUNCTION_PASS("print<phi-values>", PhiValuesPrinterPass(errs())) FUNCTION_PASS("print<postdomtree>", PostDominatorTreePrinterPass(errs())) FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(errs())) FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(errs())) +FUNCTION_PASS("print<scev-division>", SCEVDivisionPrinterPass(errs())) FUNCTION_PASS("print<stack-safety-local>", StackSafetyPrinterPass(errs())) FUNCTION_PASS("print<uniformity>", UniformityInfoPrinterPass(errs())) FUNCTION_PASS("prof-inject", ProfileInjectorPass()) @@ -719,6 +719,13 @@ FUNCTION_PASS_WITH_PARAMS( }, 
parseBoundsCheckingOptions, "trap;rt;rt-abort;min-rt;min-rt-abort;merge;guard=N") +FUNCTION_PASS_WITH_PARAMS( + "expand-fp", "ExpandFpPass", + [TM = TM](CodeGenOptLevel OL) { + return ExpandFpPass(TM, OL); + }, + parseExpandFpOptions, "O0;O1;O2;O3") + #undef FUNCTION_PASS_WITH_PARAMS #ifndef LOOPNEST_PASS diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index f165e85baf61..de293308ae69 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -118,15 +118,15 @@ static cl::opt<bool> PrintPassNumbers( "print-pass-numbers", cl::init(false), cl::Hidden, cl::desc("Print pass names and their ordinals")); -static cl::opt<unsigned> PrintBeforePassNumber( - "print-before-pass-number", cl::init(0), cl::Hidden, - cl::desc("Print IR before the pass with this number as " +static cl::list<unsigned> PrintBeforePassNumber( + "print-before-pass-number", cl::CommaSeparated, cl::Hidden, + cl::desc("Print IR before the passes with specified numbers as " "reported by print-pass-numbers")); -static cl::opt<unsigned> - PrintAfterPassNumber("print-after-pass-number", cl::init(0), cl::Hidden, - cl::desc("Print IR after the pass with this number as " - "reported by print-pass-numbers")); +static cl::list<unsigned> PrintAfterPassNumber( + "print-after-pass-number", cl::CommaSeparated, cl::Hidden, + cl::desc("Print IR after the passes with specified numbers as " + "reported by print-pass-numbers")); static cl::opt<std::string> IRDumpDirectory( "ir-dump-directory", @@ -984,12 +984,12 @@ bool PrintIRInstrumentation::shouldPrintAfterPass(StringRef PassID) { bool PrintIRInstrumentation::shouldPrintBeforeCurrentPassNumber() { return shouldPrintBeforeSomePassNumber() && - (CurrentPassNumber == PrintBeforePassNumber); + (is_contained(PrintBeforePassNumber, CurrentPassNumber)); } bool PrintIRInstrumentation::shouldPrintAfterCurrentPassNumber() { return shouldPrintAfterSomePassNumber() && - 
(CurrentPassNumber == PrintAfterPassNumber); + (is_contained(PrintAfterPassNumber, CurrentPassNumber)); } bool PrintIRInstrumentation::shouldPrintPassNumbers() { @@ -997,11 +997,11 @@ bool PrintIRInstrumentation::shouldPrintPassNumbers() { } bool PrintIRInstrumentation::shouldPrintBeforeSomePassNumber() { - return PrintBeforePassNumber > 0; + return !PrintBeforePassNumber.empty(); } bool PrintIRInstrumentation::shouldPrintAfterSomePassNumber() { - return PrintAfterPassNumber > 0; + return !PrintAfterPassNumber.empty(); } void PrintIRInstrumentation::registerCallbacks( diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.cpp b/llvm/lib/Remarks/BitstreamRemarkParser.cpp index 312886013598..86a6c6dffb18 100644 --- a/llvm/lib/Remarks/BitstreamRemarkParser.cpp +++ b/llvm/lib/Remarks/BitstreamRemarkParser.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Remarks/BitstreamRemarkParser.h" #include "BitstreamRemarkParser.h" #include "llvm/Remarks/Remark.h" #include "llvm/Support/MemoryBuffer.h" @@ -600,3 +599,5 @@ BitstreamRemarkParser::processRemark(BitstreamRemarkParserHelper &Helper) { return std::move(Result); } +llvm::remarks::BitstreamRemarkParser::BitstreamRemarkParser(StringRef Buf) + : RemarkParser(Format::Bitstream), ParserHelper(Buf) {} diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.h b/llvm/lib/Remarks/BitstreamRemarkParser.h index f6f79ef199f7..cba805dc24b5 100644 --- a/llvm/lib/Remarks/BitstreamRemarkParser.h +++ b/llvm/lib/Remarks/BitstreamRemarkParser.h @@ -13,10 +13,14 @@ #ifndef LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H #define LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Remarks/BitstreamRemarkContainer.h" -#include "llvm/Remarks/BitstreamRemarkParser.h" #include "llvm/Remarks/RemarkFormat.h" #include "llvm/Remarks/RemarkParser.h" +#include 
"llvm/Support/Error.h" +#include <array> #include <cstdint> #include <memory> #include <optional> @@ -26,6 +30,91 @@ namespace remarks { struct Remark; +/// Helper to parse a META_BLOCK for a bitstream remark container. +struct BitstreamMetaParserHelper { + /// The Bitstream reader. + BitstreamCursor &Stream; + /// Reference to the storage for the block info. + BitstreamBlockInfo &BlockInfo; + /// The parsed content: depending on the container type, some fields might be + /// empty. + std::optional<uint64_t> ContainerVersion; + std::optional<uint8_t> ContainerType; + std::optional<StringRef> StrTabBuf; + std::optional<StringRef> ExternalFilePath; + std::optional<uint64_t> RemarkVersion; + + /// Continue parsing with \p Stream. \p Stream is expected to contain a + /// ENTER_SUBBLOCK to the META_BLOCK at the current position. + /// \p Stream is expected to have a BLOCKINFO_BLOCK set. + BitstreamMetaParserHelper(BitstreamCursor &Stream, + BitstreamBlockInfo &BlockInfo); + + /// Parse the META_BLOCK and fill the available entries. + /// This helper does not check for the validity of the fields. + Error parse(); +}; + +/// Helper to parse a REMARK_BLOCK for a bitstream remark container. +struct BitstreamRemarkParserHelper { + /// The Bitstream reader. + BitstreamCursor &Stream; + /// The parsed content: depending on the remark, some fields might be empty. + std::optional<uint8_t> Type; + std::optional<uint64_t> RemarkNameIdx; + std::optional<uint64_t> PassNameIdx; + std::optional<uint64_t> FunctionNameIdx; + std::optional<uint64_t> SourceFileNameIdx; + std::optional<uint32_t> SourceLine; + std::optional<uint32_t> SourceColumn; + std::optional<uint64_t> Hotness; + struct Argument { + std::optional<uint64_t> KeyIdx; + std::optional<uint64_t> ValueIdx; + std::optional<uint64_t> SourceFileNameIdx; + std::optional<uint32_t> SourceLine; + std::optional<uint32_t> SourceColumn; + }; + std::optional<ArrayRef<Argument>> Args; + /// Avoid re-allocating a vector every time. 
+ SmallVector<Argument, 8> TmpArgs; + + /// Continue parsing with \p Stream. \p Stream is expected to contain a + /// ENTER_SUBBLOCK to the REMARK_BLOCK at the current position. + /// \p Stream is expected to have a BLOCKINFO_BLOCK set and to have already + /// parsed the META_BLOCK. + BitstreamRemarkParserHelper(BitstreamCursor &Stream); + + /// Parse the REMARK_BLOCK and fill the available entries. + /// This helper does not check for the validity of the fields. + Error parse(); +}; + +/// Helper to parse any bitstream remark container. +struct BitstreamParserHelper { + /// The Bitstream reader. + BitstreamCursor Stream; + /// The block info block. + BitstreamBlockInfo BlockInfo; + /// Start parsing at \p Buffer. + BitstreamParserHelper(StringRef Buffer); + /// Parse the magic number. + Expected<std::array<char, 4>> parseMagic(); + /// Parse the block info block containing all the abbrevs. + /// This needs to be called before calling any other parsing function. + Error parseBlockInfoBlock(); + /// Return true if the next block is a META_BLOCK. This function does not move + /// the cursor. + Expected<bool> isMetaBlock(); + /// Return true if the next block is a REMARK_BLOCK. This function does not + /// move the cursor. + Expected<bool> isRemarkBlock(); + /// Return true if the parser reached the end of the stream. + bool atEndOfStream() { return Stream.AtEndOfStream(); } + /// Jump to the end of the stream, skipping everything. + void skipToEnd() { return Stream.skipToEnd(); } +}; + /// Parses and holds the state of the latest parsed remark. struct BitstreamRemarkParser : public RemarkParser { /// The buffer to parse. @@ -45,8 +134,7 @@ struct BitstreamRemarkParser : public RemarkParser { /// Create a parser that expects to find a string table embedded in the /// stream. 
- explicit BitstreamRemarkParser(StringRef Buf) - : RemarkParser(Format::Bitstream), ParserHelper(Buf) {} + explicit BitstreamRemarkParser(StringRef Buf); Expected<std::unique_ptr<Remark>> next() override; diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index aa5b3c78ea5f..d14abb4bd05b 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -2927,51 +2927,6 @@ APFloat::opStatus IEEEFloat::convertFromAPInt(const APInt &Val, bool isSigned, return convertFromUnsignedParts(api.getRawData(), partCount, rounding_mode); } -/* Convert a two's complement integer SRC to a floating point number, - rounding according to ROUNDING_MODE. ISSIGNED is true if the - integer is signed, in which case it must be sign-extended. */ -APFloat::opStatus -IEEEFloat::convertFromSignExtendedInteger(const integerPart *src, - unsigned int srcCount, bool isSigned, - roundingMode rounding_mode) { - opStatus status; - - if (isSigned && - APInt::tcExtractBit(src, srcCount * integerPartWidth - 1)) { - integerPart *copy; - - /* If we're signed and negative negate a copy. */ - sign = true; - copy = new integerPart[srcCount]; - APInt::tcAssign(copy, src, srcCount); - APInt::tcNegate(copy, srcCount); - status = convertFromUnsignedParts(copy, srcCount, rounding_mode); - delete [] copy; - } else { - sign = false; - status = convertFromUnsignedParts(src, srcCount, rounding_mode); - } - - return status; -} - -/* FIXME: should this just take a const APInt reference? 
*/ -APFloat::opStatus -IEEEFloat::convertFromZeroExtendedInteger(const integerPart *parts, - unsigned int width, bool isSigned, - roundingMode rounding_mode) { - unsigned int partCount = partCountForBits(width); - APInt api = APInt(width, ArrayRef(parts, partCount)); - - sign = false; - if (isSigned && APInt::tcExtractBit(parts, width - 1)) { - sign = true; - api = -api; - } - - return convertFromUnsignedParts(api.getRawData(), partCount, rounding_mode); -} - Expected<APFloat::opStatus> IEEEFloat::convertFromHexadecimalString(StringRef s, roundingMode rounding_mode) { @@ -5648,36 +5603,158 @@ DoubleAPFloat::convertToInteger(MutableArrayRef<integerPart> Input, return FS; } -APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input, - bool IsSigned, - roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy); - auto Ret = Tmp.convertFromAPInt(Input, IsSigned, RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); - return Ret; -} +APFloat::opStatus DoubleAPFloat::handleOverflow(roundingMode RM) { + switch (RM) { + case APFloat::rmTowardZero: + makeLargest(/*Neg=*/isNegative()); + break; + case APFloat::rmTowardNegative: + if (isNegative()) + makeInf(/*Neg=*/true); + else + makeLargest(/*Neg=*/false); + break; + case APFloat::rmTowardPositive: + if (isNegative()) + makeLargest(/*Neg=*/true); + else + makeInf(/*Neg=*/false); + break; + case APFloat::rmNearestTiesToAway: + case APFloat::rmNearestTiesToEven: + makeInf(/*Neg=*/isNegative()); + break; + default: + llvm_unreachable("Invalid rounding mode found"); + } + opStatus S = opInexact; + if (!getFirst().isFinite()) + S = static_cast<opStatus>(S | opOverflow); + return S; +} + +APFloat::opStatus DoubleAPFloat::convertFromUnsignedParts( + const integerPart *Src, unsigned int SrcCount, roundingMode RM) { + // Find the most significant bit of the source integer. APInt::tcMSB returns + // UINT_MAX for a zero value. 
+ const unsigned SrcMSB = APInt::tcMSB(Src, SrcCount); + if (SrcMSB == UINT_MAX) { + // The source integer is 0. + makeZero(/*Neg=*/false); + return opOK; + } -APFloat::opStatus -DoubleAPFloat::convertFromSignExtendedInteger(const integerPart *Input, - unsigned int InputSize, - bool IsSigned, roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy); - auto Ret = Tmp.convertFromSignExtendedInteger(Input, InputSize, IsSigned, RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); - return Ret; + // Create a minimally-sized APInt to represent the source value. + const unsigned SrcBitWidth = SrcMSB + 1; + APSInt SrcInt{APInt{/*numBits=*/SrcBitWidth, + /*numWords=*/SrcCount, Src}, + /*isUnsigned=*/true}; + + // Stage 1: Initial Approximation. + // Convert the source integer SrcInt to the Hi part of the DoubleAPFloat. + // We use round-to-nearest because it minimizes the initial error, which is + // crucial for the subsequent steps. + APFloat Hi{getFirst().getSemantics()}; + Hi.convertFromAPInt(SrcInt, /*IsSigned=*/false, rmNearestTiesToEven); + + // If the first approximation already overflows, the number is too large. + // NOTE: The underlying semantics are *more* conservative when choosing to + // overflow because their notion of ULP is much larger. As such, it is always + // safe to overflow at the DoubleAPFloat level if the APFloat overflows. + if (!Hi.isFinite()) + return handleOverflow(RM); + + // Stage 2: Exact Error Calculation. + // Calculate the exact error of the first approximation: Error = SrcInt - Hi. + // This is done by converting Hi back to an integer and subtracting it from + // the original source. + bool HiAsIntIsExact; + // Create an integer representation of Hi. Its width is determined by the + // exponent of Hi, ensuring it's just large enough. This width can exceed + // SrcBitWidth if the conversion to Hi rounded up to a power of two. 
+ // The chosen width represents Hi accurately when converted back to an integer. + APSInt HiAsInt{static_cast<uint32_t>(ilogb(Hi) + 1), /*isUnsigned=*/true}; + Hi.convertToInteger(HiAsInt, rmNearestTiesToEven, &HiAsIntIsExact); + const APInt Error = SrcInt.zext(HiAsInt.getBitWidth()) - HiAsInt; + + // Stage 3: Error Approximation and Rounding. + // Convert the integer error into the Lo part of the DoubleAPFloat. This step + // captures the remainder of the original number. The rounding mode for this + // conversion (LoRM) may need to be adjusted from the user-requested RM to + // ensure the final sum (Hi + Lo) rounds correctly. + roundingMode LoRM = RM; + // Adjustments are only necessary when the initial approximation Hi was an + // overestimate, making the Error negative. + if (Error.isNegative()) { + if (RM == rmNearestTiesToAway) { + // For rmNearestTiesToAway, a tie should round away from zero. Since + // SrcInt is positive, this means rounding toward +infinity. + // A standard conversion of a negative Error would round ties toward + // -infinity, causing the final sum Hi + Lo to be smaller. To + // counteract this, we detect the tie case and override the rounding + // mode for Lo to rmTowardPositive. + const unsigned ErrorActiveBits = Error.getSignificantBits() - 1; + const unsigned LoPrecision = getSecond().getSemantics().precision; + if (ErrorActiveBits > LoPrecision) { + const unsigned RoundingBoundary = ErrorActiveBits - LoPrecision; + // A tie occurs when the bits to be truncated are of the form 100...0. + // This is detected by checking if the number of trailing zeros is + // exactly one less than the number of bits being truncated. + if (Error.countTrailingZeros() == RoundingBoundary - 1) + LoRM = rmTowardPositive; + } + } else if (RM == rmTowardZero) { + // For rmTowardZero, the final positive result must be truncated (rounded + // down). When Hi is an overestimate, Error is negative.
A standard + // rmTowardZero conversion of Error would make it *less* negative, + // effectively rounding the final sum Hi + Lo *up*. To ensure the sum + // rounds down correctly, we force Lo to round toward -infinity. + LoRM = rmTowardNegative; + } + } + + APFloat Lo{getSecond().getSemantics()}; + opStatus Status = Lo.convertFromAPInt(Error, /*IsSigned=*/true, LoRM); + + // Renormalize the pair (Hi, Lo) into a canonical DoubleAPFloat form where the + // components do not overlap. fastTwoSum performs this operation. + std::tie(Hi, Lo) = fastTwoSum(Hi, Lo); + Floats[0] = std::move(Hi); + Floats[1] = std::move(Lo); + + // A final check for overflow is needed because fastTwoSum can cause a + // carry-out from Lo that pushes Hi to infinity. + if (!getFirst().isFinite()) + return handleOverflow(RM); + + // The largest DoubleAPFloat must be canonical. Values which are larger are + // not canonical and are equivalent to overflow. + if (getFirst().isFiniteNonZero() && Floats[0].isLargest()) { + DoubleAPFloat Largest{*Semantics}; + Largest.makeLargest(/*Neg=*/false); + if (compare(Largest) == APFloat::cmpGreaterThan) + return handleOverflow(RM); + } + + // The final status of the operation is determined by the conversion of the + // error term. If Lo could represent Error exactly, the entire conversion + // is exact. Otherwise, it's inexact. 
+ return Status; } -APFloat::opStatus -DoubleAPFloat::convertFromZeroExtendedInteger(const integerPart *Input, - unsigned int InputSize, - bool IsSigned, roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy); - auto Ret = Tmp.convertFromZeroExtendedInteger(Input, InputSize, IsSigned, RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); - return Ret; +APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input, + bool IsSigned, + roundingMode RM) { + const bool NegateInput = IsSigned && Input.isNegative(); + APInt API = Input; + if (NegateInput) + API.negate(); + + const APFloat::opStatus Status = + convertFromUnsignedParts(API.getRawData(), API.getNumWords(), RM); + if (NegateInput) + changeSign(); + return Status; } unsigned int DoubleAPFloat::convertToHexString(char *DST, diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 6646af6db5d3..2528e8bd1142 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -265,18 +265,23 @@ add_llvm_component_library(LLVMSupport ToolOutputFile.cpp TrieRawHashMap.cpp Twine.cpp - TypeSize.cpp Unicode.cpp UnicodeCaseFold.cpp UnicodeNameToCodepoint.cpp UnicodeNameToCodepointGenerated.cpp VersionTuple.cpp VirtualFileSystem.cpp + VirtualOutputBackend.cpp + VirtualOutputBackends.cpp + VirtualOutputConfig.cpp + VirtualOutputError.cpp + VirtualOutputFile.cpp WithColor.cpp YAMLParser.cpp YAMLTraits.cpp raw_os_ostream.cpp raw_ostream.cpp + raw_ostream_proxy.cpp raw_socket_stream.cpp regcomp.c regerror.c diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 8491633df97e..be232f5bff58 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -2671,7 +2671,6 @@ static void initCommonOptions() { initSignalsOptions(); initStatisticOptions(); initTimerOptions(); - initTypeSizeOptions(); initWithColorOptions(); 
initDebugOptions(); initRandomSeedOptions(); diff --git a/llvm/lib/Support/DebugOptions.h b/llvm/lib/Support/DebugOptions.h index db727d5a584c..6c3382e8f858 100644 --- a/llvm/lib/Support/DebugOptions.h +++ b/llvm/lib/Support/DebugOptions.h @@ -24,7 +24,6 @@ void initGraphWriterOptions(); void initSignalsOptions(); void initStatisticOptions(); void initTimerOptions(); -void initTypeSizeOptions(); void initWithColorOptions(); void initDebugOptions(); void initRandomSeedOptions(); diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp index d8662340cb3e..4652c0740dc4 100644 --- a/llvm/lib/Support/JSON.cpp +++ b/llvm/lib/Support/JSON.cpp @@ -84,16 +84,7 @@ json::Array *Object::getArray(StringRef K) { return V->getAsArray(); return nullptr; } -bool operator==(const Object &LHS, const Object &RHS) { - if (LHS.size() != RHS.size()) - return false; - for (const auto &L : LHS) { - auto R = RHS.find(L.first); - if (R == RHS.end() || L.second != R->second) - return false; - } - return true; -} +bool operator==(const Object &LHS, const Object &RHS) { return LHS.M == RHS.M; } Array::Array(std::initializer_list<Value> Elements) { V.reserve(Elements.size()); diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index bd08365a3fcd..8d91f0e95d22 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -372,8 +372,7 @@ KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS, unsigned BitWidth = LHS.getBitWidth(); auto ShiftByConst = [&](const KnownBits &LHS, unsigned ShiftAmt) { KnownBits Known = LHS; - Known.Zero.lshrInPlace(ShiftAmt); - Known.One.lshrInPlace(ShiftAmt); + Known >>= ShiftAmt; // High bits are known zero. 
Known.Zero.setHighBits(ShiftAmt); return Known; diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp index 2ba02b73dd8f..3ac6fc74fd3e 100644 --- a/llvm/lib/Support/Parallel.cpp +++ b/llvm/lib/Support/Parallel.cpp @@ -60,7 +60,7 @@ public: auto &Thread0 = Threads[0]; Thread0 = std::thread([this, S] { for (unsigned I = 1; I < ThreadCount; ++I) { - Threads.emplace_back([=] { work(S, I); }); + Threads.emplace_back([this, S, I] { work(S, I); }); if (Stop) break; } diff --git a/llvm/lib/Support/PrettyStackTrace.cpp b/llvm/lib/Support/PrettyStackTrace.cpp index 26e22161b605..82b0e6ac513e 100644 --- a/llvm/lib/Support/PrettyStackTrace.cpp +++ b/llvm/lib/Support/PrettyStackTrace.cpp @@ -39,7 +39,7 @@ using namespace llvm; static const char *BugReportMsg = "PLEASE submit a bug report to " BUG_REPORT_URL - " and include the crash backtrace.\n"; + " and include the crash backtrace and instructions to reproduce the bug.\n"; // If backtrace support is not enabled, compile out support for pretty stack // traces. 
This has the secondary effect of not requiring thread local storage diff --git a/llvm/lib/Support/Twine.cpp b/llvm/lib/Support/Twine.cpp index 495b9cf2dbd6..d6b48166fb0f 100644 --- a/llvm/lib/Support/Twine.cpp +++ b/llvm/lib/Support/Twine.cpp @@ -56,11 +56,12 @@ StringRef Twine::toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const { return StringRef(Out.data(), Out.size()); } -void Twine::printOneChild(raw_ostream &OS, Child Ptr, - NodeKind Kind) const { +void Twine::printOneChild(raw_ostream &OS, Child Ptr, NodeKind Kind) const { switch (Kind) { - case Twine::NullKind: break; - case Twine::EmptyKind: break; + case Twine::NullKind: + break; + case Twine::EmptyKind: + break; case Twine::TwineKind: Ptr.twine->print(OS); break; @@ -104,24 +105,23 @@ void Twine::printOneChild(raw_ostream &OS, Child Ptr, } } -void Twine::printOneChildRepr(raw_ostream &OS, Child Ptr, - NodeKind Kind) const { +void Twine::printOneChildRepr(raw_ostream &OS, Child Ptr, NodeKind Kind) const { switch (Kind) { case Twine::NullKind: - OS << "null"; break; + OS << "null"; + break; case Twine::EmptyKind: - OS << "empty"; break; + OS << "empty"; + break; case Twine::TwineKind: OS << "rope:"; Ptr.twine->printRepr(OS); break; case Twine::CStringKind: - OS << "cstring:\"" - << Ptr.cString << "\""; + OS << "cstring:\"" << Ptr.cString << "\""; break; case Twine::StdStringKind: - OS << "std::string:\"" - << Ptr.stdString << "\""; + OS << "std::string:\"" << Ptr.stdString << "\""; break; case Twine::PtrAndLengthKind: OS << "ptrAndLength:\"" @@ -175,11 +175,7 @@ void Twine::printRepr(raw_ostream &OS) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void Twine::dump() const { - print(dbgs()); -} +LLVM_DUMP_METHOD void Twine::dump() const { print(dbgs()); } -LLVM_DUMP_METHOD void Twine::dumpRepr() const { - printRepr(dbgs()); -} +LLVM_DUMP_METHOD void Twine::dumpRepr() const { printRepr(dbgs()); } #endif diff --git a/llvm/lib/Support/TypeSize.cpp 
b/llvm/lib/Support/TypeSize.cpp deleted file mode 100644 index 43346b81cd67..000000000000 --- a/llvm/lib/Support/TypeSize.cpp +++ /dev/null @@ -1,58 +0,0 @@ -//===- TypeSize.cpp - Wrapper around type sizes------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/TypeSize.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/WithColor.h" - -#include "DebugOptions.h" - -using namespace llvm; - -#ifndef STRICT_FIXED_SIZE_VECTORS -namespace { -struct CreateScalableErrorAsWarning { - /// The ScalableErrorAsWarning is a temporary measure to suppress errors from - /// using the wrong interface on a scalable vector. - static void *call() { - return new cl::opt<bool>( - "treat-scalable-fixed-error-as-warning", cl::Hidden, - cl::desc( - "Treat issues where a fixed-width property is requested from a " - "scalable type as a warning, instead of an error")); - } -}; -} // namespace -static ManagedStatic<cl::opt<bool>, CreateScalableErrorAsWarning> - ScalableErrorAsWarning; -void llvm::initTypeSizeOptions() { *ScalableErrorAsWarning; } -#else -void llvm::initTypeSizeOptions() {} -#endif - -void llvm::reportInvalidSizeRequest(const char *Msg) { -#ifndef STRICT_FIXED_SIZE_VECTORS - if (*ScalableErrorAsWarning) { - WithColor::warning() << "Invalid size request on a scalable vector; " << Msg - << "\n"; - return; - } -#endif - report_fatal_error("Invalid size request on a scalable vector."); -} - -TypeSize::operator TypeSize::ScalarTy() const { - if (isScalable()) { - reportInvalidSizeRequest( - "Cannot implicitly convert a scalable size to a fixed-width size in " - "`TypeSize::operator ScalarTy()`"); - return getKnownMinValue(); - } - return 
getFixedValue(); -} diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index 6cd38aabc734..573ad82f2dea 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -883,8 +883,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) { } else { const char *name = strrchr(dlinfo.dli_fname, '/'); if (!name) - OS << format(" %-*s", width, - static_cast<const char *>(dlinfo.dli_fname)); + OS << format(" %-*s", width, dlinfo.dli_fname); else OS << format(" %-*s", width, name + 1); } diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc index 7854d6d22915..f016ed693752 100644 --- a/llvm/lib/Support/Unix/Threading.inc +++ b/llvm/lib/Support/Unix/Threading.inc @@ -194,9 +194,9 @@ void llvm::set_thread_name(const Twine &Name) { if (get_max_thread_name_length() > 0) NameStr = NameStr.take_back(get_max_thread_name_length() - 1); (void)NameStr; -#if defined(HAVE_PTHREAD_SET_NAME_NP) +#if defined(HAVE_PTHREAD_SET_NAME_NP) && HAVE_PTHREAD_SET_NAME_NP ::pthread_set_name_np(::pthread_self(), NameStr.data()); -#elif defined(HAVE_PTHREAD_SETNAME_NP) +#elif defined(HAVE_PTHREAD_SETNAME_NP) && HAVE_PTHREAD_SETNAME_NP #if defined(__NetBSD__) ::pthread_setname_np(::pthread_self(), "%s", const_cast<char *>(NameStr.data())); diff --git a/llvm/lib/Support/VirtualOutputBackend.cpp b/llvm/lib/Support/VirtualOutputBackend.cpp new file mode 100644 index 000000000000..97dab054dfa0 --- /dev/null +++ b/llvm/lib/Support/VirtualOutputBackend.cpp @@ -0,0 +1,40 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements \c vfs::OutputBackend class methods. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/VirtualOutputBackend.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/VirtualOutputError.h" + +using namespace llvm; +using namespace llvm::vfs; + +void OutputBackend::anchor() {} + +Expected<OutputFile> +OutputBackend::createFile(const Twine &Path, + std::optional<OutputConfig> Config) { + SmallString<128> PathStorage; + Path.toVector(PathStorage); + + if (Config) { + // Check for invalid configs. + if (!Config->getText() && Config->getCRLF()) + return make_error<OutputConfigError>(*Config, PathStorage); + } + + std::unique_ptr<OutputFileImpl> Impl; + if (Error E = createFileImpl(PathStorage, Config).moveInto(Impl)) + return std::move(E); + assert(Impl && "Expected valid Impl or Error"); + return OutputFile(PathStorage, std::move(Impl)); +} diff --git a/llvm/lib/Support/VirtualOutputBackends.cpp b/llvm/lib/Support/VirtualOutputBackends.cpp new file mode 100644 index 000000000000..d6d7b8715bd4 --- /dev/null +++ b/llvm/lib/Support/VirtualOutputBackends.cpp @@ -0,0 +1,598 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the VirtualOutputBackend types, including: +/// * NullOutputBackend: Outputs to NullOutputBackend are discarded. +/// * FilteringOutputBackend: Filter paths from output. 
+/// * MirroringOutputBackend: Mirror the output into two different backends. +/// * OnDiskOutputBackend: Write output files to disk. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/VirtualOutputBackends.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/LockFileManager.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/VirtualOutputConfig.h" +#include "llvm/Support/VirtualOutputError.h" + +using namespace llvm; +using namespace llvm::vfs; + +void ProxyOutputBackend::anchor() {} +void OnDiskOutputBackend::anchor() {} + +IntrusiveRefCntPtr<OutputBackend> vfs::makeNullOutputBackend() { + struct NullOutputBackend : public OutputBackend { + IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override { + return const_cast<NullOutputBackend *>(this); + } + Expected<std::unique_ptr<OutputFileImpl>> + createFileImpl(StringRef Path, std::optional<OutputConfig>) override { + return std::make_unique<NullOutputFileImpl>(); + } + }; + + return makeIntrusiveRefCnt<NullOutputBackend>(); +} + +IntrusiveRefCntPtr<OutputBackend> vfs::makeFilteringOutputBackend( + IntrusiveRefCntPtr<OutputBackend> UnderlyingBackend, + std::function<bool(StringRef, std::optional<OutputConfig>)> Filter) { + struct FilteringOutputBackend : public ProxyOutputBackend { + Expected<std::unique_ptr<OutputFileImpl>> + createFileImpl(StringRef Path, + std::optional<OutputConfig> Config) override { + if (Filter(Path, Config)) + return ProxyOutputBackend::createFileImpl(Path, Config); + return std::make_unique<NullOutputFileImpl>(); + } + + IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override { + return makeIntrusiveRefCnt<FilteringOutputBackend>( + getUnderlyingBackend().clone(), Filter); + } + + FilteringOutputBackend( + IntrusiveRefCntPtr<OutputBackend> UnderlyingBackend, +
std::function<bool(StringRef, std::optional<OutputConfig>)> Filter) + : ProxyOutputBackend(std::move(UnderlyingBackend)), + Filter(std::move(Filter)) { + assert(this->Filter && "Expected a non-null function"); + } + std::function<bool(StringRef, std::optional<OutputConfig>)> Filter; + }; + + return makeIntrusiveRefCnt<FilteringOutputBackend>( + std::move(UnderlyingBackend), std::move(Filter)); +} + +IntrusiveRefCntPtr<OutputBackend> +vfs::makeMirroringOutputBackend(IntrusiveRefCntPtr<OutputBackend> Backend1, + IntrusiveRefCntPtr<OutputBackend> Backend2) { + struct ProxyOutputBackend1 : public ProxyOutputBackend { + using ProxyOutputBackend::ProxyOutputBackend; + }; + struct ProxyOutputBackend2 : public ProxyOutputBackend { + using ProxyOutputBackend::ProxyOutputBackend; + }; + struct MirroringOutput final : public OutputFileImpl, raw_pwrite_stream { + Error keep() final { + flush(); + return joinErrors(F1->keep(), F2->keep()); + } + Error discard() final { + flush(); + return joinErrors(F1->discard(), F2->discard()); + } + raw_pwrite_stream &getOS() final { return *this; } + + void write_impl(const char *Ptr, size_t Size) override { + F1->getOS().write(Ptr, Size); + F2->getOS().write(Ptr, Size); + } + void pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) override { + this->flush(); + F1->getOS().pwrite(Ptr, Size, Offset); + F2->getOS().pwrite(Ptr, Size, Offset); + } + uint64_t current_pos() const override { return F1->getOS().tell(); } + size_t preferred_buffer_size() const override { + return PreferredBufferSize; + } + void reserveExtraSpace(uint64_t ExtraSize) override { + F1->getOS().reserveExtraSpace(ExtraSize); + F2->getOS().reserveExtraSpace(ExtraSize); + } + bool is_displayed() const override { + return F1->getOS().is_displayed() && F2->getOS().is_displayed(); + } + bool has_colors() const override { + return F1->getOS().has_colors() && F2->getOS().has_colors(); + } + void enable_colors(bool enable) override { + 
raw_pwrite_stream::enable_colors(enable); + F1->getOS().enable_colors(enable); + F2->getOS().enable_colors(enable); + } + + // Take the larger of the two files' buffer sizes as the preferred size. + MirroringOutput(std::unique_ptr<OutputFileImpl> F1, + std::unique_ptr<OutputFileImpl> F2) + : PreferredBufferSize(std::max(F1->getOS().GetBufferSize(), + F2->getOS().GetBufferSize())), + F1(std::move(F1)), F2(std::move(F2)) { + // Don't double buffer. + this->F1->getOS().SetUnbuffered(); + this->F2->getOS().SetUnbuffered(); + } + size_t PreferredBufferSize; + std::unique_ptr<OutputFileImpl> F1; + std::unique_ptr<OutputFileImpl> F2; + }; + struct MirroringOutputBackend : public ProxyOutputBackend1, + public ProxyOutputBackend2 { + Expected<std::unique_ptr<OutputFileImpl>> + createFileImpl(StringRef Path, + std::optional<OutputConfig> Config) override { + std::unique_ptr<OutputFileImpl> File1; + std::unique_ptr<OutputFileImpl> File2; + if (Error E = + ProxyOutputBackend1::createFileImpl(Path, Config).moveInto(File1)) + return std::move(E); + if (Error E = + ProxyOutputBackend2::createFileImpl(Path, Config).moveInto(File2)) + return joinErrors(std::move(E), File1->discard()); + + // Skip the extra indirection if one of these is a null output.
+ if (isa<NullOutputFileImpl>(*File1)) { + consumeError(File1->discard()); + return std::move(File2); + } + if (isa<NullOutputFileImpl>(*File2)) { + consumeError(File2->discard()); + return std::move(File1); + } + return std::make_unique<MirroringOutput>(std::move(File1), + std::move(File2)); + } + + IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override { + return IntrusiveRefCntPtr<ProxyOutputBackend1>( + makeIntrusiveRefCnt<MirroringOutputBackend>( + ProxyOutputBackend1::getUnderlyingBackend().clone(), + ProxyOutputBackend2::getUnderlyingBackend().clone())); + } + void Retain() const { ProxyOutputBackend1::Retain(); } + void Release() const { ProxyOutputBackend1::Release(); } + + MirroringOutputBackend(IntrusiveRefCntPtr<OutputBackend> Backend1, + IntrusiveRefCntPtr<OutputBackend> Backend2) + : ProxyOutputBackend1(std::move(Backend1)), + ProxyOutputBackend2(std::move(Backend2)) {} + }; + + assert(Backend1 && "Expected actual backend"); + assert(Backend2 && "Expected actual backend"); + return IntrusiveRefCntPtr<ProxyOutputBackend1>( + makeIntrusiveRefCnt<MirroringOutputBackend>(std::move(Backend1), + std::move(Backend2))); +} + +static OutputConfig +applySettings(std::optional<OutputConfig> &&Config, + const OnDiskOutputBackend::OutputSettings &Settings) { + if (!Config) + Config = Settings.DefaultConfig; + if (!Settings.UseTemporaries) + Config->setNoAtomicWrite(); + if (!Settings.RemoveOnSignal) + Config->setNoDiscardOnSignal(); + return *Config; +} + +namespace { +class OnDiskOutputFile final : public OutputFileImpl { +public: + Error keep() override; + Error discard() override; + raw_pwrite_stream &getOS() override { + assert(FileOS && "Expected valid file"); + if (BufferOS) + return *BufferOS; + return *FileOS; + } + + /// Attempt to open a temporary file for \p OutputPath. 
+ /// + /// This tries to open a uniquely-named temporary file for \p OutputPath, + /// possibly also creating any missing directories if \a + /// OnDiskOutputConfig::UseTemporaryCreateMissingDirectories is set in \a + /// Config. + /// + /// \post FD and \a TempPath are initialized if this is successful. + Error tryToCreateTemporary(std::optional<int> &FD); + + Error initializeFile(std::optional<int> &FD); + Error initializeStream(); + Error reset(); + + OnDiskOutputFile(StringRef OutputPath, std::optional<OutputConfig> Config, + const OnDiskOutputBackend::OutputSettings &Settings) + : Config(applySettings(std::move(Config), Settings)), + OutputPath(OutputPath.str()) {} + + OutputConfig Config; + const std::string OutputPath; + std::optional<std::string> TempPath; + std::optional<raw_fd_ostream> FileOS; + std::optional<buffer_ostream> BufferOS; +}; +} // end namespace + +static Error createDirectoriesOnDemand(StringRef OutputPath, + OutputConfig Config, + llvm::function_ref<Error()> CreateFile) { + return handleErrors(CreateFile(), [&](std::unique_ptr<ECError> EC) { + if (EC->convertToErrorCode() != std::errc::no_such_file_or_directory || + Config.getNoImplyCreateDirectories()) + return Error(std::move(EC)); + + StringRef ParentPath = sys::path::parent_path(OutputPath); + if (std::error_code EC = sys::fs::create_directories(ParentPath)) + return make_error<OutputError>(ParentPath, EC); + return CreateFile(); + }); +} + +Error OnDiskOutputFile::tryToCreateTemporary(std::optional<int> &FD) { + // Create a temporary file. + // Insert -%%%%%%%% before the extension (if any), and because some tools + // (notably, clang's own GlobalModuleIndex.cpp) glob for build + // artifacts, also append .tmp.
+ StringRef OutputExtension = sys::path::extension(OutputPath); + SmallString<128> ModelPath = + StringRef(OutputPath).drop_back(OutputExtension.size()); + ModelPath += "-%%%%%%%%"; + ModelPath += OutputExtension; + ModelPath += ".tmp"; + + return createDirectoriesOnDemand(OutputPath, Config, [&]() -> Error { + int NewFD; + SmallString<128> UniquePath; + if (std::error_code EC = + sys::fs::createUniqueFile(ModelPath, NewFD, UniquePath)) + return make_error<TempFileOutputError>(ModelPath, OutputPath, EC); + + if (Config.getDiscardOnSignal()) + sys::RemoveFileOnSignal(UniquePath); + + TempPath = UniquePath.str().str(); + FD.emplace(NewFD); + return Error::success(); + }); +} + +Error OnDiskOutputFile::initializeFile(std::optional<int> &FD) { + assert(OutputPath != "-" && "Unexpected request for FD of stdout"); + + // If the destination already exists, disable the temporary file when it is + // not a regular file, and check up front that we are able to write to the + // final destination. + if (Config.getAtomicWrite()) { + sys::fs::file_status Status; + sys::fs::status(OutputPath, Status); + if (sys::fs::exists(Status)) { + if (!sys::fs::is_regular_file(Status)) + Config.setNoAtomicWrite(); + + // Fail now if we can't write to the final destination. + if (!sys::fs::can_write(OutputPath)) + return make_error<OutputError>( + OutputPath, + std::make_error_code(std::errc::operation_not_permitted)); + } + } + + // If (still) using a temporary file, try to create it (and return success if + // that works). Otherwise fall through to opening the final output file. + if (Config.getAtomicWrite()) + if (!errorToBool(tryToCreateTemporary(FD))) + return Error::success(); + + // Not using a temporary file. Open the final output file. 
+ return createDirectoriesOnDemand(OutputPath, Config, [&]() -> Error { + int NewFD; + sys::fs::OpenFlags OF = sys::fs::OF_None; + if (Config.getTextWithCRLF()) + OF |= sys::fs::OF_TextWithCRLF; + else if (Config.getText()) + OF |= sys::fs::OF_Text; + if (Config.getAppend()) + OF |= sys::fs::OF_Append; + if (std::error_code EC = sys::fs::openFileForWrite( + OutputPath, NewFD, sys::fs::CD_CreateAlways, OF)) + return convertToOutputError(OutputPath, EC); + FD.emplace(NewFD); + + if (Config.getDiscardOnSignal()) + sys::RemoveFileOnSignal(OutputPath); + return Error::success(); + }); +} + +Error OnDiskOutputFile::initializeStream() { + // Open the file stream. + if (OutputPath == "-") { + std::error_code EC; + FileOS.emplace(OutputPath, EC); + if (EC) + return make_error<OutputError>(OutputPath, EC); + } else { + std::optional<int> FD; + if (Error E = initializeFile(FD)) + return E; + FileOS.emplace(*FD, /*shouldClose=*/true); + } + + // Buffer the stream if necessary. + if (!FileOS->supportsSeeking() && !Config.getText()) + BufferOS.emplace(*FileOS); + + return Error::success(); +} + +namespace { +class OpenFileRAII { + static const int InvalidFd = -1; + +public: + int Fd = InvalidFd; + + ~OpenFileRAII() { + if (Fd != InvalidFd) + llvm::sys::Process::SafelyCloseFileDescriptor(Fd); + } +}; + +enum class FileDifference : uint8_t { + /// The source and destination paths refer to the exact same file. + IdenticalFile, + /// The source and destination paths refer to separate files with identical + /// contents. + SameContents, + /// The source and destination paths refer to separate files with different + /// contents. 
+ DifferentContents +}; +} // end anonymous namespace + +/// Classify how \p Source and \p Destination differ. Failures to open, stat or +/// map \p Source are returned as errors; the same failures on \p Destination +/// are reported as FileDifference::DifferentContents. +static Expected<FileDifference> +areFilesDifferent(const llvm::Twine &Source, const llvm::Twine &Destination) { + if (sys::fs::equivalent(Source, Destination)) + return FileDifference::IdenticalFile; + + OpenFileRAII SourceFile; + sys::fs::file_status SourceStatus; + // If we can't open the source file, fail. + if (std::error_code EC = sys::fs::openFileForRead(Source, SourceFile.Fd)) + return convertToOutputError(Source, EC); + + // If we can't stat the source file, fail. + if (std::error_code EC = sys::fs::status(SourceFile.Fd, SourceStatus)) + return convertToOutputError(Source, EC); + + OpenFileRAII DestFile; + sys::fs::file_status DestStatus; + // If we can't open the destination file, report different. + if (std::error_code Error = + sys::fs::openFileForRead(Destination, DestFile.Fd)) + return FileDifference::DifferentContents; + + // If we can't stat the destination file, report different. + if (std::error_code Error = sys::fs::status(DestFile.Fd, DestStatus)) + return FileDifference::DifferentContents; + + // If the files are different sizes, they must be different. + uint64_t Size = SourceStatus.getSize(); + if (Size != DestStatus.getSize()) + return FileDifference::DifferentContents; + + // If both files are zero size, they must be the same. + if (Size == 0) + return FileDifference::SameContents; + + // The two files match in size, so we have to compare the bytes to determine + // if they're the same. 
+ std::error_code SourceRegionErr; + sys::fs::mapped_file_region SourceRegion( + sys::fs::convertFDToNativeFile(SourceFile.Fd), + sys::fs::mapped_file_region::readonly, Size, 0, SourceRegionErr); + if (SourceRegionErr) + return convertToOutputError(Source, SourceRegionErr); + + std::error_code DestRegionErr; + sys::fs::mapped_file_region DestRegion( + sys::fs::convertFDToNativeFile(DestFile.Fd), + sys::fs::mapped_file_region::readonly, Size, 0, DestRegionErr); + + if (DestRegionErr) + return FileDifference::DifferentContents; + + if (memcmp(SourceRegion.const_data(), DestRegion.const_data(), Size) != 0) + return FileDifference::DifferentContents; + + return FileDifference::SameContents; +} + +Error OnDiskOutputFile::reset() { + // Destroy the streams to flush them. + BufferOS.reset(); + if (!FileOS) + return Error::success(); + + // Remember the error in raw_fd_ostream to be reported later. + std::error_code EC = FileOS->error(); + // Clear the error to avoid fatal error when reset. + FileOS->clear_error(); + FileOS.reset(); + return errorCodeToError(EC); +} + +Error OnDiskOutputFile::keep() { + if (auto E = reset()) + return E; + + // Close the file descriptor and remove crash cleanup before exit. + auto RemoveDiscardOnSignal = make_scope_exit([&]() { + if (Config.getDiscardOnSignal()) + sys::DontRemoveFileOnSignal(TempPath ? *TempPath : OutputPath); + }); + + if (!TempPath) + return Error::success(); + + // See if we should append instead of move. + if (Config.getAppend() && OutputPath != "-") { + // Read TempFile for the content to append. + auto Content = MemoryBuffer::getFile(*TempPath); + if (!Content) + return convertToTempFileOutputError(*TempPath, OutputPath, + Content.getError()); + while (1) { + // Attempt to lock the output file. + // Only one process is allowed to append to this file at a time. 
+ llvm::LockFileManager Lock(OutputPath); + bool Owned; + if (Error Err = Lock.tryLock().moveInto(Owned)) { + // If we error acquiring a lock, we cannot ensure appends + // to the trace file are atomic - cannot ensure output correctness. + Lock.unsafeMaybeUnlock(); + return convertToOutputError( + OutputPath, std::make_error_code(std::errc::no_lock_available)); + } + if (Owned) { + // Lock acquired, perform the write and release the lock. + std::error_code EC; + llvm::raw_fd_ostream Out(OutputPath, EC, llvm::sys::fs::OF_Append); + if (EC) + return convertToOutputError(OutputPath, EC); + Out << (*Content)->getBuffer(); + Out.close(); + Lock.unsafeMaybeUnlock(); + if (Out.has_error()) + return convertToOutputError(OutputPath, Out.error()); + // Remove temp file and done. + (void)sys::fs::remove(*TempPath); + return Error::success(); + } + // Someone else owns the lock on this file, wait. + switch (Lock.waitForUnlockFor(std::chrono::seconds(256))) { + case WaitForUnlockResult::Success: + LLVM_FALLTHROUGH; + case WaitForUnlockResult::OwnerDied: { + continue; // try again to get the lock. + } + case WaitForUnlockResult::Timeout: { + // We could error on timeout to avoid potentially hanging forever, but + // it may be more likely that an interrupted process failed to clear + // the lock, causing other waiting processes to time-out. Let's clear + // the lock and try again right away. If we do start seeing compiler + // hangs in this location, we will need to re-consider. + Lock.unsafeMaybeUnlock(); + continue; + } + } + break; + } + } + + if (Config.getOnlyIfDifferent()) { + auto Result = areFilesDifferent(*TempPath, OutputPath); + if (!Result) + return Result.takeError(); + switch (*Result) { + case FileDifference::IdenticalFile: + // Do nothing for a self-move. + return Error::success(); + + case FileDifference::SameContents: + // Files are identical; remove the source file. 
+ (void)sys::fs::remove(*TempPath); + return Error::success(); + + case FileDifference::DifferentContents: + break; // Rename the file. + } + } + + // Move temporary to the final output path. If the rename fails, fall back to + // copying the contents below; the temporary is removed after the copy + // attempt whether or not the copy succeeded. + std::error_code RenameEC = sys::fs::rename(*TempPath, OutputPath); + if (!RenameEC) + return Error::success(); + + // FIXME: TempPath should be in the same directory as OutputPath but try to + // copy the output to see if it makes any difference. If this path is used, + // investigate why we need to copy. + RenameEC = sys::fs::copy_file(*TempPath, OutputPath); + (void)sys::fs::remove(*TempPath); + + if (!RenameEC) + return Error::success(); + + return make_error<TempFileOutputError>(*TempPath, OutputPath, RenameEC); +} + +Error OnDiskOutputFile::discard() { + // Destroy the streams to flush them. + // NOTE(review): if reset() reports a stream error we return here without + // removing the in-progress file from disk - confirm that is intended. + if (auto E = reset()) + return E; + + // Nothing on the filesystem to remove for stdout. + if (OutputPath == "-") + return Error::success(); + + auto discardPath = [&](StringRef Path) { + std::error_code EC = sys::fs::remove(Path); + sys::DontRemoveFileOnSignal(Path); + return EC; + }; + + // Clean up the file that's in-progress. 
+ if (!TempPath) + return convertToOutputError(OutputPath, discardPath(OutputPath)); + return convertToTempFileOutputError(*TempPath, OutputPath, + discardPath(*TempPath)); +} + +Error OnDiskOutputBackend::makeAbsolute(SmallVectorImpl<char> &Path) const { + return convertToOutputError(StringRef(Path.data(), Path.size()), + sys::fs::make_absolute(Path)); +} + +Expected<std::unique_ptr<OutputFileImpl>> +OnDiskOutputBackend::createFileImpl(StringRef Path, + std::optional<OutputConfig> Config) { + SmallString<256> AbsPath; + if (Path != "-") { + AbsPath = Path; + if (Error E = makeAbsolute(AbsPath)) + return std::move(E); + Path = AbsPath; + } + + auto File = std::make_unique<OnDiskOutputFile>(Path, Config, Settings); + if (Error E = File->initializeStream()) + return std::move(E); + + return std::move(File); +} diff --git a/llvm/lib/Support/VirtualOutputConfig.cpp b/llvm/lib/Support/VirtualOutputConfig.cpp new file mode 100644 index 000000000000..4672a0dad65d --- /dev/null +++ b/llvm/lib/Support/VirtualOutputConfig.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements \c OutputConfig class methods. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/VirtualOutputConfig.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::vfs; + +OutputConfig &OutputConfig::setOpenFlags(const sys::fs::OpenFlags &Flags) { + // Ignore CRLF on its own as invalid. + using namespace llvm::sys::fs; + return Flags & OF_Text + ? 
setText().setCRLF(Flags & OF_CRLF).setAppend(Flags & OF_Append) + : setBinary().setAppend(Flags & OF_Append); +} + +void OutputConfig::print(raw_ostream &OS) const { + OS << "{"; + bool IsFirst = true; + auto printFlag = [&](StringRef FlagName, bool Value) { + if (IsFirst) + IsFirst = false; + else + OS << ","; + if (!Value) + OS << "No"; + OS << FlagName; + }; + +#define HANDLE_OUTPUT_CONFIG_FLAG(NAME, DEFAULT) \ + if (get##NAME() != DEFAULT) \ + printFlag(#NAME, get##NAME()); +#include "llvm/Support/VirtualOutputConfig.def" + OS << "}"; +} + +LLVM_DUMP_METHOD void OutputConfig::dump() const { print(dbgs()); } + +raw_ostream &llvm::operator<<(raw_ostream &OS, OutputConfig Config) { + Config.print(OS); + return OS; +} diff --git a/llvm/lib/Support/VirtualOutputError.cpp b/llvm/lib/Support/VirtualOutputError.cpp new file mode 100644 index 000000000000..c899c621205f --- /dev/null +++ b/llvm/lib/Support/VirtualOutputError.cpp @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the errors for output virtualization. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/VirtualOutputError.h" + +using namespace llvm; +using namespace llvm::vfs; + +void OutputError::anchor() {} +void OutputConfigError::anchor() {} +void TempFileOutputError::anchor() {} + +char OutputError::ID = 0; +char OutputConfigError::ID = 0; +char TempFileOutputError::ID = 0; + +void OutputError::log(raw_ostream &OS) const { + OS << getOutputPath() << ": "; + ECError::log(OS); +} + +void OutputConfigError::log(raw_ostream &OS) const { + OutputError::log(OS); + OS << ": " << Config; +} + +void TempFileOutputError::log(raw_ostream &OS) const { + OS << getTempPath() << " => "; + OutputError::log(OS); +} + +namespace { +class OutputErrorCategory : public std::error_category { +public: + const char *name() const noexcept override; + std::string message(int EV) const override; +}; +} // end namespace + +const std::error_category &vfs::output_category() { + static OutputErrorCategory ErrorCategory; + return ErrorCategory; +} + +const char *OutputErrorCategory::name() const noexcept { + return "llvm.vfs.output"; +} + +std::string OutputErrorCategory::message(int EV) const { + OutputErrorCode E = static_cast<OutputErrorCode>(EV); + switch (E) { + case OutputErrorCode::invalid_config: + return "invalid config"; + case OutputErrorCode::not_closed: + return "output not closed"; + case OutputErrorCode::already_closed: + return "output already closed"; + case OutputErrorCode::has_open_proxy: + return "output has open proxy"; + } + llvm_unreachable( + "An enumerator of OutputErrorCode does not have a message defined."); +} diff --git a/llvm/lib/Support/VirtualOutputFile.cpp b/llvm/lib/Support/VirtualOutputFile.cpp new file mode 100644 index 000000000000..62f54266d3be --- /dev/null +++ b/llvm/lib/Support/VirtualOutputFile.cpp @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under 
the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements \c OutputFile class methods. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/VirtualOutputFile.h" +#include "llvm/Support/VirtualOutputError.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/raw_ostream_proxy.h" + +using namespace llvm; +using namespace llvm::vfs; + +char OutputFileImpl::ID = 0; +char NullOutputFileImpl::ID = 0; + +void OutputFileImpl::anchor() {} +void NullOutputFileImpl::anchor() {} + +class OutputFile::TrackedProxy : public raw_pwrite_stream_proxy { +public: + void resetProxy() { + TrackingPointer = nullptr; + resetProxiedOS(); + } + + explicit TrackedProxy(TrackedProxy *&TrackingPointer, raw_pwrite_stream &OS) + : raw_pwrite_stream_proxy(OS), TrackingPointer(TrackingPointer) { + assert(!TrackingPointer && "Expected to add a proxy"); + TrackingPointer = this; + } + + ~TrackedProxy() override { resetProxy(); } + + TrackedProxy *&TrackingPointer; +}; + +Expected<std::unique_ptr<raw_pwrite_stream>> OutputFile::createProxy() { + if (OpenProxy) + return make_error<OutputError>(getPath(), OutputErrorCode::has_open_proxy); + + return std::make_unique<TrackedProxy>(OpenProxy, getOS()); +} + +Error OutputFile::keep() { + // Catch double-closing logic bugs. + if (LLVM_UNLIKELY(!Impl)) + report_fatal_error( + make_error<OutputError>(getPath(), OutputErrorCode::already_closed)); + + // Report a fatal error if there's an open proxy and the file is being kept. + // This is safer than relying on clients to remember to flush(). Also call + // OutputFile::discard() to give the backend a chance to clean up any + // side effects (such as temporaries). 
+ if (LLVM_UNLIKELY(OpenProxy)) + report_fatal_error(joinErrors( + make_error<OutputError>(getPath(), OutputErrorCode::has_open_proxy), + discard())); + + Error E = Impl->keep(); + Impl = nullptr; + DiscardOnDestroyHandler = nullptr; + return E; +} + +Error OutputFile::discard() { + // Catch double-closing logic bugs. + if (LLVM_UNLIKELY(!Impl)) + report_fatal_error( + make_error<OutputError>(getPath(), OutputErrorCode::already_closed)); + + // Be lenient about open proxies since client teardown paths won't + // necessarily clean up in the right order. Reset the proxy to flush any + // current content; if there is another write, there should be a quick crash + // on null dereference. + if (OpenProxy) + OpenProxy->resetProxy(); + + Error E = Impl->discard(); + Impl = nullptr; + DiscardOnDestroyHandler = nullptr; + return E; +} + +void OutputFile::destroy() { + if (!Impl) + return; + + // Clean up the file. Move the discard handler into a local since discard + // will reset it. + auto DiscardHandler = std::move(DiscardOnDestroyHandler); + Error E = discard(); + assert(!Impl && "Expected discard to destroy Impl"); + + // If there's no handler, report a fatal error. 
+ if (LLVM_UNLIKELY(!DiscardHandler)) + llvm::report_fatal_error(joinErrors( + make_error<OutputError>(getPath(), OutputErrorCode::not_closed), + std::move(E))); + else if (E) + DiscardHandler(std::move(E)); +} diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc index b11f216adeba..968423b98486 100644 --- a/llvm/lib/Support/Windows/Threading.inc +++ b/llvm/lib/Support/Windows/Threading.inc @@ -31,23 +31,22 @@ llvm_execute_on_thread_impl(unsigned(__stdcall *ThreadFunc)(void *), void *Arg, HANDLE hThread = (HANDLE)::_beginthreadex(NULL, StackSizeInBytes.value_or(0), ThreadFunc, Arg, 0, NULL); - if (!hThread) { + if (!hThread) ReportLastErrorFatal("_beginthreadex failed"); - } return hThread; } void llvm_thread_join_impl(HANDLE hThread) { - if (::WaitForSingleObject(hThread, INFINITE) == WAIT_FAILED) { + if (::WaitForSingleObject(hThread, INFINITE) == WAIT_FAILED) ReportLastErrorFatal("WaitForSingleObject failed"); - } + if (::CloseHandle(hThread) == FALSE) + ReportLastErrorFatal("CloseHandle failed"); } void llvm_thread_detach_impl(HANDLE hThread) { - if (::CloseHandle(hThread) == FALSE) { + if (::CloseHandle(hThread) == FALSE) ReportLastErrorFatal("CloseHandle failed"); - } } DWORD llvm_thread_get_id_impl(HANDLE hThread) { return ::GetThreadId(hThread); } @@ -202,9 +201,9 @@ template <typename F> static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) { DWORD Len = 0; BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len); - if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) return false; - } + auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len); R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len); if (R) { diff --git a/llvm/lib/Support/raw_ostream_proxy.cpp b/llvm/lib/Support/raw_ostream_proxy.cpp new file mode 100644 index 000000000000..2bbaa82f4afa --- /dev/null +++ 
b/llvm/lib/Support/raw_ostream_proxy.cpp @@ -0,0 +1,15 @@ +//===- raw_ostream_proxy.cpp - Implement the raw_ostream proxies ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/raw_ostream_proxy.h" + +using namespace llvm; + +void raw_ostream_proxy::anchor() {} + +void raw_pwrite_stream_proxy::anchor() {} diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 3657a15ab198..051a896cfd1b 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -525,6 +525,14 @@ std::optional<int64_t> BitsInit::convertInitializerToInt() const { return Result; } +uint64_t BitsInit::convertKnownBitsToInt() const { + uint64_t Result = 0; + for (auto [Idx, InitV] : enumerate(getBits())) + if (auto *Bit = dyn_cast<BitInit>(InitV)) + Result |= static_cast<int64_t>(Bit->getValue()) << Idx; + return Result; +} + const Init * BitsInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const { SmallVector<const Init *, 16> NewBits(Bits.size()); diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 0c6add59cb28..f928ded16186 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -33,6 +33,14 @@ using namespace llvm; namespace llvm { +RecordsEntry::RecordsEntry(std::unique_ptr<Record> Rec) : Rec(std::move(Rec)) {} +RecordsEntry::RecordsEntry(std::unique_ptr<ForeachLoop> Loop) + : Loop(std::move(Loop)) {} +RecordsEntry::RecordsEntry(std::unique_ptr<Record::AssertionInfo> Assertion) + : Assertion(std::move(Assertion)) {} +RecordsEntry::RecordsEntry(std::unique_ptr<Record::DumpInfo> Dump) + : Dump(std::move(Dump)) {} + struct SubClassReference { SMRange RefRange; const Record *Rec = nullptr; diff --git 
a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h index 7edb6c7a9aac..09b7d5380695 100644 --- a/llvm/lib/TableGen/TGParser.h +++ b/llvm/lib/TableGen/TGParser.h @@ -46,12 +46,10 @@ struct RecordsEntry { void dump() const; RecordsEntry() = default; - RecordsEntry(std::unique_ptr<Record> Rec) : Rec(std::move(Rec)) {} - RecordsEntry(std::unique_ptr<ForeachLoop> Loop) : Loop(std::move(Loop)) {} - RecordsEntry(std::unique_ptr<Record::AssertionInfo> Assertion) - : Assertion(std::move(Assertion)) {} - RecordsEntry(std::unique_ptr<Record::DumpInfo> Dump) - : Dump(std::move(Dump)) {} + RecordsEntry(std::unique_ptr<Record> Rec); + RecordsEntry(std::unique_ptr<ForeachLoop> Loop); + RecordsEntry(std::unique_ptr<Record::AssertionInfo> Assertion); + RecordsEntry(std::unique_ptr<Record::DumpInfo> Dump); }; /// ForeachLoop - Record the iteration state associated with a for loop. diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index c52487ab8a79..c31a090bba77 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -307,6 +307,7 @@ private: /// Emit instruction to set float register to zero. 
void emitFMov0(const MachineInstr &MI); + void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg); using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>; @@ -734,7 +735,7 @@ void AArch64AsmPrinter::emitHwasanMemaccessSymbols(Module &M) { const Triple &TT = TM.getTargetTriple(); assert(TT.isOSBinFormatELF()); std::unique_ptr<MCSubtargetInfo> STI( - TM.getTarget().createMCSubtargetInfo(TT.str(), "", "")); + TM.getTarget().createMCSubtargetInfo(TT, "", "")); assert(STI && "Unable to create subtarget info"); this->STI = static_cast<const AArch64Subtarget *>(&*STI); @@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) { void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); - if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() && - STI->isNeonAvailable()) { - // Convert H/S register to corresponding D register - if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) - DestReg = AArch64::D0 + (DestReg - AArch64::H0); - else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) - DestReg = AArch64::D0 + (DestReg - AArch64::S0); - else - assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); + if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) { + if (STI->hasZeroCycleZeroingFPR64()) { + // Convert H/S register to corresponding D register + const AArch64RegisterInfo *TRI = STI->getRegisterInfo(); + if (AArch64::FPR16RegClass.contains(DestReg)) + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR64RegClass); + else if (AArch64::FPR32RegClass.contains(DestReg)) + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR64RegClass); + else + assert(AArch64::FPR64RegClass.contains(DestReg)); + + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVID); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MOVI); 
+ } else if (STI->hasZeroCycleZeroingFPR128()) { + // Convert H/S/D register to corresponding Q register + const AArch64RegisterInfo *TRI = STI->getRegisterInfo(); + if (AArch64::FPR16RegClass.contains(DestReg)) { + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR128RegClass); + } else if (AArch64::FPR32RegClass.contains(DestReg)) { + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR128RegClass); + } else { + assert(AArch64::FPR64RegClass.contains(DestReg)); + DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub, + &AArch64::FPR128RegClass); + } - MCInst MOVI; - MOVI.setOpcode(AArch64::MOVID); - MOVI.addOperand(MCOperand::createReg(DestReg)); - MOVI.addOperand(MCOperand::createImm(0)); - EmitToStreamer(*OutStreamer, MOVI); - } else { - MCInst FMov; - switch (MI.getOpcode()) { - default: llvm_unreachable("Unexpected opcode"); - case AArch64::FMOVH0: - FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr); - if (!STI->hasFullFP16()) - DestReg = (AArch64::S0 + (DestReg - AArch64::H0)); - FMov.addOperand(MCOperand::createReg(DestReg)); - FMov.addOperand(MCOperand::createReg(AArch64::WZR)); - break; - case AArch64::FMOVS0: - FMov.setOpcode(AArch64::FMOVWSr); - FMov.addOperand(MCOperand::createReg(DestReg)); - FMov.addOperand(MCOperand::createReg(AArch64::WZR)); - break; - case AArch64::FMOVD0: - FMov.setOpcode(AArch64::FMOVXDr); - FMov.addOperand(MCOperand::createReg(DestReg)); - FMov.addOperand(MCOperand::createReg(AArch64::XZR)); - break; + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVIv2d_ns); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MOVI); + } else { + emitFMov0AsFMov(MI, DestReg); } - EmitToStreamer(*OutStreamer, FMov); + } else { + emitFMov0AsFMov(MI, DestReg); + } +} + +void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI, + Register DestReg) { + MCInst FMov; + switch (MI.getOpcode()) { + default: + 
llvm_unreachable("Unexpected opcode"); + case AArch64::FMOVH0: + FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr); + if (!STI->hasFullFP16()) + DestReg = (AArch64::S0 + (DestReg - AArch64::H0)); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVS0: + FMov.setOpcode(AArch64::FMOVWSr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVD0: + FMov.setOpcode(AArch64::FMOVXDr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::XZR)); + break; } + EmitToStreamer(*OutStreamer, FMov); } Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc, @@ -2229,13 +2262,24 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) { if (BrTarget == AddrDisc) report_fatal_error("Branch target is signed with its own value"); - // If we are printing BLRA pseudo instruction, then x16 and x17 are - // implicit-def'ed by the MI and AddrDisc is not used as any other input, so - // try to save one MOV by setting MayUseAddrAsScratch. + // If we are printing BLRA pseudo, try to save one MOV by making use of the + // fact that x16 and x17 are described as clobbered by the MI instruction and + // AddrDisc is not used as any other input. + // + // Back in the day, emitPtrauthDiscriminator was restricted to only returning + // either x16 or x17, meaning the returned register is always among the + // implicit-def'ed registers of BLRA pseudo. Now this property can be violated + // if isX16X17Safer predicate is false, thus manually check if AddrDisc is + // among x16 and x17 to prevent clobbering unexpected registers. + // // Unlike BLRA, BRA pseudo is used to perform computed goto, and thus not // declared as clobbering x16/x17. + // + // FIXME: Make use of `killed` flags and register masks instead. 
+ bool AddrDiscIsImplicitDef = + IsCall && (AddrDisc == AArch64::X16 || AddrDisc == AArch64::X17); Register DiscReg = emitPtrauthDiscriminator(Disc, AddrDisc, AArch64::X17, - /*MayUseAddrAsScratch=*/IsCall); + AddrDiscIsImplicitDef); bool IsZeroDisc = DiscReg == AArch64::XZR; unsigned Opc; @@ -2862,7 +2906,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { MCInst TmpInst; TmpInst.setOpcode(AArch64::MOVIv16b_ns); TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); - TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm())); + TmpInst.addOperand(MCOperand::createImm(0)); EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -2968,8 +3012,15 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { // See the comments in emitPtrauthBranch. if (Callee == AddrDisc) report_fatal_error("Call target is signed with its own value"); + + // After isX16X17Safer predicate was introduced, emitPtrauthDiscriminator is + // no longer restricted to only reusing AddrDisc when it is X16 or X17 + // (which are implicit-def'ed by AUTH_TCRETURN pseudos), thus impose this + // restriction manually not to clobber an unexpected register. + bool AddrDiscIsImplicitDef = + AddrDisc == AArch64::X16 || AddrDisc == AArch64::X17; Register DiscReg = emitPtrauthDiscriminator(Disc, AddrDisc, ScratchReg, - /*MayUseAddrAsScratch=*/true); + AddrDiscIsImplicitDef); const bool IsZero = DiscReg == AArch64::XZR; const unsigned Opcodes[2][2] = {{AArch64::BRAA, AArch64::BRAAZ}, diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp index 3436dc9ef452..137ff898e86a 100644 --- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp +++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp @@ -30,6 +30,14 @@ using namespace llvm; #define AARCH64_BRANCH_TARGETS_NAME "AArch64 Branch Targets" namespace { +// BTI HINT encoding: base (32) plus 'c' (2) and/or 'j' (4). 
+enum : unsigned { + BTIBase = 32, // Base immediate for BTI HINT + BTIC = 1u << 1, // 2 + BTIJ = 1u << 2, // 4 + BTIMask = BTIC | BTIJ, +}; + class AArch64BranchTargets : public MachineFunctionPass { public: static char ID; @@ -42,6 +50,7 @@ private: void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump, bool NeedsWinCFI); }; + } // end anonymous namespace char AArch64BranchTargets::ID = 0; @@ -62,9 +71,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { if (!MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) return false; - LLVM_DEBUG( - dbgs() << "********** AArch64 Branch Targets **********\n" - << "********** Function: " << MF.getName() << '\n'); + LLVM_DEBUG(dbgs() << "********** AArch64 Branch Targets **********\n" + << "********** Function: " << MF.getName() << '\n'); const Function &F = MF.getFunction(); // LLVM does not consider basic blocks which are the targets of jump tables @@ -103,6 +111,12 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { JumpTableTargets.count(&MBB)) CouldJump = true; + if (MBB.isEHPad()) { + if (HasWinCFI && (MBB.isEHFuncletEntry() || MBB.isCleanupFuncletEntry())) + CouldCall = true; + else + CouldJump = true; + } if (CouldCall || CouldJump) { addBTI(MBB, CouldCall, CouldJump, HasWinCFI); MadeChange = true; @@ -130,7 +144,12 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall, auto MBBI = MBB.begin(); - // Skip the meta instructions, those will be removed anyway. + // If the block starts with EH_LABEL(s), skip them first. + while (MBBI != MBB.end() && MBBI->isEHLabel()) { + ++MBBI; + } + + // Skip meta/CFI/etc. (and EMITBKEY) to reach the first executable insn. 
for (; MBBI != MBB.end() && (MBBI->isMetaInstruction() || MBBI->getOpcode() == AArch64::EMITBKEY); ++MBBI) @@ -138,16 +157,21 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall, // SCTLR_EL1.BT[01] is set to 0 by default which means // PACI[AB]SP are implicitly BTI C so no BTI C instruction is needed there. - if (MBBI != MBB.end() && HintNum == 34 && + if (MBBI != MBB.end() && ((HintNum & BTIMask) == BTIC) && (MBBI->getOpcode() == AArch64::PACIASP || MBBI->getOpcode() == AArch64::PACIBSP)) return; - if (HasWinCFI && MBBI->getFlag(MachineInstr::FrameSetup)) { - BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), - TII->get(AArch64::SEH_Nop)); + // Insert BTI exactly at the first executable instruction. + const DebugLoc DL = MBB.findDebugLoc(MBBI); + MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT)) + .addImm(HintNum) + .getInstr(); + + // WinEH: put .seh_nop after BTI when the first real insn is FrameSetup. + if (HasWinCFI && MBBI != MBB.end() && + MBBI->getFlag(MachineInstr::FrameSetup)) { + auto AfterBTI = std::next(MachineBasicBlock::iterator(BTI)); + BuildMI(MBB, AfterBTI, DL, TII->get(AArch64::SEH_Nop)); } - BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), - TII->get(AArch64::HINT)) - .addImm(HintNum); } diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 5f499e5e9700..076a6235eef0 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -369,5 +369,5 @@ def AArch64PostLegalizerCombiner commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, combine_mul_cmlt, combine_use_vector_truncate, - extmultomull, truncsat_combines]> { + extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> { } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 57dcd68595ff..79655e1c9529 100644 --- 
a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } case AArch64::InOutZAUsePseudo: case AArch64::RequiresZASavePseudo: + case AArch64::SMEStateAllocPseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: case AArch64::COALESCER_BARRIER_FPR64: diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index c1c1f0a1024d..46f5f0c1ca9d 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -621,25 +621,30 @@ def FeatureZCRegMoveGPR64 : SubtargetFeature<"zcm-gpr64", "HasZeroCycleRegMoveGP def FeatureZCRegMoveGPR32 : SubtargetFeature<"zcm-gpr32", "HasZeroCycleRegMoveGPR32", "true", "Has zero-cycle register moves for GPR32 registers">; +def FeatureZCRegMoveFPR128 : SubtargetFeature<"zcm-fpr128", "HasZeroCycleRegMoveFPR128", "true", + "Has zero-cycle register moves for FPR128 registers">; + def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true", "Has zero-cycle register moves for FPR64 registers">; def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true", "Has zero-cycle register moves for FPR32 registers">; -def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", - "Has zero-cycle zeroing instructions for generic registers">; +def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true", + "Has zero-cycle zeroing instructions for GPR64 registers">; + +def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true", + "Has zero-cycle zeroing instructions for GPR32 registers">; + +def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true", + "Has zero-cycle zeroing instructions for FPR128 registers">; // It is 
generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0". // as movi is more efficient across all cores. Newer cores can eliminate // fmovs early and there is no difference with movi, but this not true for // all implementations. -def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false", - "Has no zero-cycle zeroing instructions for FP registers">; - -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions", - [FeatureZCZeroingGP]>; +def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false", + "Has no zero-cycle zeroing instructions for FPR64 registers">; /// ... but the floating-point version doesn't quite work in rare cases on older /// CPUs. @@ -730,9 +735,13 @@ def FeatureFuseArithmeticLogic : SubtargetFeature< "fuse-arith-logic", "HasFuseArithmeticLogic", "true", "CPU fuses arithmetic and logic operations">; -def FeatureFuseCCSelect : SubtargetFeature< - "fuse-csel", "HasFuseCCSelect", "true", - "CPU fuses conditional select operations">; +def FeatureFuseCmpCSel : SubtargetFeature< + "fuse-csel", "HasFuseCmpCSel", "true", + "CPU can fuse CMP and CSEL operations">; + +def FeatureFuseCmpCSet : SubtargetFeature< + "fuse-cset", "HasFuseCmpCSet", "true", + "CPU can fuse CMP and CSET operations">; def FeatureFuseCryptoEOR : SubtargetFeature< "fuse-crypto-eor", "HasFuseCryptoEOR", "true", diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 7725fa4f1ccb..175b5e04d82f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -211,6 +211,7 @@ #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64PrologueEpilogue.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -218,7 
+219,6 @@ #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -293,8 +293,6 @@ static cl::opt<bool> DisableMultiVectorSpillFill( cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(false), cl::Hidden); -STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); - /// Returns how much of the incoming argument stack area (in bytes) we should /// clean up in an epilogue. For the C calling convention this will be 0, for /// guaranteed tail call conventions it can be positive (a normal return or a @@ -328,23 +326,20 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, return ArgumentPopSize; } -static bool produceCompactUnwindFrame(MachineFunction &MF); -static bool needsWinCFI(const MachineFunction &MF); -static StackOffset getSVEStackSize(const MachineFunction &MF); -static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, - bool HasCall = false); -static bool requiresSaveVG(const MachineFunction &MF); +static bool produceCompactUnwindFrame(const AArch64FrameLowering &, + MachineFunction &MF); // Conservatively, returns true if the function is likely to have an SVE vectors // on the stack. This function is safe to be called before callee-saves or // object offsets have been determined. 
-static bool isLikelyToHaveSVEStack(const MachineFunction &MF) { +static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL, + const MachineFunction &MF) { auto *AFI = MF.getInfo<AArch64FunctionInfo>(); if (AFI->isSVECC()) return true; if (AFI->hasCalculatedStackSizeSVE()) - return bool(getSVEStackSize(MF)); + return bool(AFL.getSVEStackSize(MF)); const MachineFrameInfo &MFI = MF.getFrameInfo(); for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) { @@ -372,7 +367,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog( return false; // TODO: SVE is not supported yet. - if (isLikelyToHaveSVEStack(MF)) + if (isLikelyToHaveSVEStack(*this, MF)) return false; // Bail on stack adjustment needed on return for simplicity. @@ -409,7 +404,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog( /// Returns true if CSRs should be paired. bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const { - return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF); + return produceCompactUnwindFrame(*this, MF) || homogeneousPrologEpilog(MF); } /// This is the biggest offset to the stack pointer we can encode in aarch64 @@ -451,11 +446,10 @@ AArch64FrameLowering::getStackIDForScalableVectors() const { return TargetStackID::ScalableVector; } -/// Returns the size of the fixed object area (allocated next to sp on entry) -/// On Win64 this may include a var args area and an UnwindHelp object for EH. 
-static unsigned getFixedObjectSize(const MachineFunction &MF, - const AArch64FunctionInfo *AFI, bool IsWin64, - bool IsFunclet) { +unsigned +AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF, + const AArch64FunctionInfo *AFI, + bool IsWin64, bool IsFunclet) const { assert(AFI->getTailCallReservedStack() % 16 == 0 && "Tail call reserved stack must be aligned to 16 bytes"); if (!IsWin64 || IsFunclet) { @@ -494,7 +488,8 @@ static unsigned getFixedObjectSize(const MachineFunction &MF, } /// Returns the size of the entire SVE stackframe (calleesaves + spills). -static StackOffset getSVEStackSize(const MachineFunction &MF) { +StackOffset +AArch64FrameLowering::getSVEStackSize(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE()); } @@ -683,70 +678,6 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(I); } -void AArch64FrameLowering::emitCalleeSavedGPRLocations( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - - const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - if (CSI.empty()) - return; - - CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); - for (const auto &Info : CSI) { - unsigned FrameIdx = Info.getFrameIdx(); - if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) - continue; - - assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); - int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea(); - CFIBuilder.buildOffset(Info.getReg(), Offset); - } -} - -void AArch64FrameLowering::emitCalleeSavedSVELocations( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - - // Add callee saved registers to move list. 
- const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - if (CSI.empty()) - return; - - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); - CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); - - std::optional<int64_t> IncomingVGOffsetFromDefCFA; - if (requiresSaveVG(MF)) { - auto IncomingVG = *find_if( - reverse(CSI), [](auto &Info) { return Info.getReg() == AArch64::VG; }); - IncomingVGOffsetFromDefCFA = - MFI.getObjectOffset(IncomingVG.getFrameIdx()) - getOffsetOfLocalArea(); - } - - for (const auto &Info : CSI) { - if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector) - continue; - - // Not all unwinders may know about SVE registers, so assume the lowest - // common denominator. - assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); - MCRegister Reg = Info.getReg(); - if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg)) - continue; - - StackOffset Offset = - StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - - StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI)); - - CFIBuilder.insertCFIInst( - createCFAOffset(TRI, Reg, Offset, IncomingVGOffsetFromDefCFA)); - } -} - void AArch64FrameLowering::resetCFIToInitialState( MachineBasicBlock &MBB) const { @@ -1088,8 +1019,8 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, } } -static bool windowsRequiresStackProbe(const MachineFunction &MF, - uint64_t StackSizeInBytes) { +bool AArch64FrameLowering::windowsRequiresStackProbe( + const MachineFunction &MF, uint64_t StackSizeInBytes) const { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>(); // TODO: When implementing stack protectors, take that into account @@ -1108,19 +1039,9 @@ static void getLiveRegsForEntryMBB(LivePhysRegs 
&LiveRegs, LiveRegs.addReg(CSRegs[i]); } -// Find a scratch register that we can use at the start of the prologue to -// re-align the stack pointer. We avoid using callee-save registers since they -// may appear to be free when this is called from canUseAsPrologue (during -// shrink wrapping), but then no longer be free when this is called from -// emitPrologue. -// -// FIXME: This is a bit conservative, since in the above case we could use one -// of the callee-save registers as a scratch temp to re-align the stack pointer, -// but we would then have to make sure that we were in fact saving at least one -// callee-save register in the prologue, which is additional complexity that -// doesn't seem worth the benefit. -static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, - bool HasCall) { +Register +AArch64FrameLowering::findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, + bool HasCall) const { MachineFunction *MF = MBB->getParent(); // If MBB is an entry block, use X9 as the scratch register @@ -1193,13 +1114,14 @@ bool AArch64FrameLowering::canUseAsPrologue( return true; } -static bool needsWinCFI(const MachineFunction &MF) { +bool AArch64FrameLowering::needsWinCFI(const MachineFunction &MF) const { const Function &F = MF.getFunction(); return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && F.needsUnwindTableEntry(); } -static bool shouldSignReturnAddressEverywhere(const MachineFunction &MF) { +bool AArch64FrameLowering::shouldSignReturnAddressEverywhere( + const MachineFunction &MF) const { // FIXME: With WinCFI, extra care should be taken to place SEH_PACSignLR // and SEH_EpilogEnd instructions in the correct order. 
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) @@ -1475,13 +1397,13 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize); } -bool requiresGetVGCall(MachineFunction &MF) { - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); +bool AArch64FrameLowering::requiresGetVGCall(const MachineFunction &MF) const { + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); return AFI->hasStreamingModeChanges() && !MF.getSubtarget<AArch64Subtarget>().hasSVE(); } -static bool requiresSaveVG(const MachineFunction &MF) { +bool AArch64FrameLowering::requiresSaveVG(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); if (!AFI->needsDwarfUnwindInfo(MF) || !AFI->hasStreamingModeChanges()) return false; @@ -1499,8 +1421,8 @@ static bool matchLibcall(const TargetLowering &TLI, const MachineOperand &MO, StringRef(TLI.getLibcallName(LC)) == MO.getSymbolName(); } -bool isVGInstruction(MachineBasicBlock::iterator MBBI, - const TargetLowering &TLI) { +bool AArch64FrameLowering::isVGInstruction(MachineBasicBlock::iterator MBBI, + const TargetLowering &TLI) const { unsigned Opc = MBBI->getOpcode(); if (Opc == AArch64::CNTD_XPiI) return true; @@ -1514,15 +1436,12 @@ bool isVGInstruction(MachineBasicBlock::iterator MBBI, return Opc == TargetOpcode::COPY; } -// Convert callee-save register save/restore instruction to do stack pointer -// decrement/increment to allocate/deallocate the callee-save stack area by -// converting store/load to use pre/post increment version. 
-static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( +MachineBasicBlock::iterator +AArch64FrameLowering::convertCalleeSaveRestoreToSPPrePostIncDec( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, - MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup, - int CFAOffset = 0) { + MachineInstr::MIFlag FrameFlag, int CFAOffset) const { unsigned NewOpc; // If the function contains streaming mode changes, we expect instructions @@ -1643,12 +1562,9 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( return std::prev(MBB.erase(MBBI)); } -// Fixup callee-save register save/restore instructions to take into account -// combined SP bump by adding the local stack size to the stack offsets. -static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, - uint64_t LocalStackSize, - bool NeedsWinCFI, - bool *HasWinCFI) { +void AArch64FrameLowering::fixupCalleeSaveRestoreStackOffset( + MachineInstr &MI, uint64_t LocalStackSize, bool NeedsWinCFI, + bool *HasWinCFI) const { if (AArch64InstrInfo::isSEHInstruction(MI)) return; @@ -1703,7 +1619,8 @@ static unsigned getStackHazardSize(const MachineFunction &MF) { } // Convenience function to determine whether I is an SVE callee save. 
-static bool IsSVECalleeSave(MachineBasicBlock::iterator I) { +bool AArch64FrameLowering::isSVECalleeSave( + MachineBasicBlock::iterator I) const { switch (I->getOpcode()) { default: return false; @@ -1725,42 +1642,6 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) { } } -static void emitShadowCallStackPrologue(const TargetInstrInfo &TII, - MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool NeedsWinCFI, - bool NeedsUnwindInfo) { - // Shadow call stack prolog: str x30, [x18], #8 - BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost)) - .addReg(AArch64::X18, RegState::Define) - .addReg(AArch64::LR) - .addReg(AArch64::X18) - .addImm(8) - .setMIFlag(MachineInstr::FrameSetup); - - // This instruction also makes x18 live-in to the entry block. - MBB.addLiveIn(AArch64::X18); - - if (NeedsWinCFI) - BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - - if (NeedsUnwindInfo) { - // Emit a CFI instruction that causes 8 to be subtracted from the value of - // x18 when unwinding past this frame. - static const char CFIInst[] = { - dwarf::DW_CFA_val_expression, - 18, // register - 2, // length - static_cast<char>(unsigned(dwarf::DW_OP_breg18)), - static_cast<char>(-8) & 0x7f, // addend (sleb128) - }; - CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup) - .buildEscape(StringRef(CFIInst, sizeof(CFIInst))); - } -} - static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII, MachineFunction &MF, MachineBasicBlock &MBB, @@ -1783,36 +1664,6 @@ static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII, .buildRestore(AArch64::X18); } -// Define the current CFA rule to use the provided FP. 
-static void emitDefineCFAWithFP(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FixedObject) { - const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); - const AArch64RegisterInfo *TRI = STI.getRegisterInfo(); - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - - const int OffsetToFirstCalleeSaveFromFP = - AFI->getCalleeSaveBaseToFrameRecordOffset() - - AFI->getCalleeSavedStackSize(); - Register FramePtr = TRI->getFrameRegister(MF); - CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup) - .buildDefCFA(FramePtr, FixedObject - OffsetToFirstCalleeSaveFromFP); -} - -#ifndef NDEBUG -/// Collect live registers from the end of \p MI's parent up to (including) \p -/// MI in \p LiveRegs. -static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI, - LivePhysRegs &LiveRegs) { - - MachineBasicBlock &MBB = *MI.getParent(); - LiveRegs.addLiveOuts(MBB); - for (const MachineInstr &MI : - reverse(make_range(MI.getIterator(), MBB.instr_end()))) - LiveRegs.stepBackward(MI); -} -#endif - void AArch64FrameLowering::emitPacRetPlusLeafHardening( MachineFunction &MF) const { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); @@ -1848,616 +1699,8 @@ void AArch64FrameLowering::emitPacRetPlusLeafHardening( void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.begin(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - const Function &F = MF.getFunction(); - const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - bool EmitCFI = AFI->needsDwarfUnwindInfo(MF); - bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF); - bool HasFP = hasFP(MF); - bool NeedsWinCFI = needsWinCFI(MF); - bool HasWinCFI 
= false; - auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); }); - - MachineBasicBlock::iterator End = MBB.end(); -#ifndef NDEBUG - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - // Collect live register from the end of MBB up to the start of the existing - // frame setup instructions. - MachineBasicBlock::iterator NonFrameStart = MBB.begin(); - while (NonFrameStart != End && - NonFrameStart->getFlag(MachineInstr::FrameSetup)) - ++NonFrameStart; - - LivePhysRegs LiveRegs(*TRI); - if (NonFrameStart != MBB.end()) { - getLivePhysRegsUpTo(*NonFrameStart, *TRI, LiveRegs); - // Ignore registers used for stack management for now. - LiveRegs.removeReg(AArch64::SP); - LiveRegs.removeReg(AArch64::X19); - LiveRegs.removeReg(AArch64::FP); - LiveRegs.removeReg(AArch64::LR); - - // X0 will be clobbered by a call to __arm_get_current_vg in the prologue. - // This is necessary to spill VG if required where SVE is unavailable, but - // X0 is preserved around this call. - if (requiresGetVGCall(MF)) - LiveRegs.removeReg(AArch64::X0); - } - - auto VerifyClobberOnExit = make_scope_exit([&]() { - if (NonFrameStart == MBB.end()) - return; - // Check if any of the newly instructions clobber any of the live registers. - for (MachineInstr &MI : - make_range(MBB.instr_begin(), NonFrameStart->getIterator())) { - for (auto &Op : MI.operands()) - if (Op.isReg() && Op.isDef()) - assert(!LiveRegs.contains(Op.getReg()) && - "live register clobbered by inserted prologue instructions"); - } - }); -#endif - - bool IsFunclet = MBB.isEHFuncletEntry(); - - // At this point, we're going to decide whether or not the function uses a - // redzone. In most cases, the function doesn't have a redzone so let's - // assume that's false and set it to true in the case that there's a redzone. - AFI->setHasRedZone(false); - - // Debug location must be unknown since the first debug location is used - // to determine the end of the prologue. 
- DebugLoc DL; - - const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>(); - if (MFnI.shouldSignReturnAddress(MF)) { - // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions - // are inserted by emitPacRetPlusLeafHardening(). - if (!shouldSignReturnAddressEverywhere(MF)) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE)) - .setMIFlag(MachineInstr::FrameSetup); - } - // AArch64PointerAuth pass will insert SEH_PACSignLR - HasWinCFI |= NeedsWinCFI; - } - - if (MFnI.needsShadowCallStackPrologueEpilogue(MF)) { - emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI, - MFnI.needsDwarfUnwindInfo(MF)); - HasWinCFI |= NeedsWinCFI; - } - - if (EmitCFI && MFnI.isMTETagged()) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED)) - .setMIFlag(MachineInstr::FrameSetup); - } - - // We signal the presence of a Swift extended frame to external tools by - // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple - // ORR is sufficient, it is assumed a Swift kernel would initialize the TBI - // bits so that is still true. - if (HasFP && AFI->hasSwiftAsyncContext()) { - switch (MF.getTarget().Options.SwiftAsyncFramePointer) { - case SwiftAsyncFramePointerMode::DeploymentBased: - if (Subtarget.swiftAsyncContextIsDynamicallySet()) { - // The special symbol below is absolute and has a *value* that can be - // combined with the frame pointer to signal an extended frame. - BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16) - .addExternalSymbol("swift_async_extendedFramePointerFlags", - AArch64II::MO_GOT); - if (NeedsWinCFI) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlags(MachineInstr::FrameSetup); - HasWinCFI = true; - } - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP) - .addUse(AArch64::FP) - .addUse(AArch64::X16) - .addImm(Subtarget.isTargetILP32() ? 
32 : 0); - if (NeedsWinCFI) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlags(MachineInstr::FrameSetup); - HasWinCFI = true; - } - break; - } - [[fallthrough]]; - - case SwiftAsyncFramePointerMode::Always: - // ORR x29, x29, #0x1000_0000_0000_0000 - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP) - .addUse(AArch64::FP) - .addImm(0x1100) - .setMIFlag(MachineInstr::FrameSetup); - if (NeedsWinCFI) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlags(MachineInstr::FrameSetup); - HasWinCFI = true; - } - break; - - case SwiftAsyncFramePointerMode::Never: - break; - } - } - - // All calls are tail calls in GHC calling conv, and functions have no - // prologue/epilogue. - if (MF.getFunction().getCallingConv() == CallingConv::GHC) - return; - - // Set tagged base pointer to the requested stack slot. - // Ideally it should match SP value after prologue. - std::optional<int> TBPI = AFI->getTaggedBasePointerIndex(); - if (TBPI) - AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI)); - else - AFI->setTaggedBasePointerOffset(MFI.getStackSize()); - - const StackOffset &SVEStackSize = getSVEStackSize(MF); - - // getStackSize() includes all the locals in its size calculation. We don't - // include these locals when computing the stack size of a funclet, as they - // are allocated in the parent's stack frame and accessed via the frame - // pointer from the funclet. We only save the callee saved registers in the - // funclet, which are really the callee saved registers of the parent - // function, including the funclet. - int64_t NumBytes = - IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize(); - if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { - assert(!HasFP && "unexpected function without stack frame but with FP"); - assert(!SVEStackSize && - "unexpected function without stack frame but with SVE objects"); - // All of the stack allocation is for locals. 
- AFI->setLocalStackSize(NumBytes); - if (!NumBytes) { - if (NeedsWinCFI && HasWinCFI) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) - .setMIFlag(MachineInstr::FrameSetup); - } - return; - } - // REDZONE: If the stack size is less than 128 bytes, we don't need - // to actually allocate. - if (canUseRedZone(MF)) { - AFI->setHasRedZone(true); - ++NumRedZoneFunctions; - } else { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); - if (EmitCFI) { - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MF.getContext().createTempSymbol(); - // Encode the stack size of the leaf function. - CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup) - .buildDefCFAOffset(NumBytes, FrameLabel); - } - } - - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) - .setMIFlag(MachineInstr::FrameSetup); - } - - return; - } - - bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg()); - unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); - - // Windows unwind can't represent the required stack adjustments if we have - // both SVE callee-saves and dynamic stack allocations, and the frame - // pointer is before the SVE spills. The allocation of the frame pointer - // must be the last instruction in the prologue so the unwinder can restore - // the stack pointer correctly. (And there isn't any unwind opcode for - // `addvl sp, x29, -17`.) - // - // Because of this, we do spills in the opposite order on Windows: first SVE, - // then GPRs. The main side-effect of this is that it makes accessing - // parameters passed on the stack more expensive. - // - // We could consider rearranging the spills for simpler cases. 
- bool FPAfterSVECalleeSaves = - Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); - - if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex()) - reportFatalUsageError("SME hazard padding is not supported on Windows"); - - auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes - PrologueSaveSize); - bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); - bool HomPrologEpilog = homogeneousPrologEpilog(MF); - if (FPAfterSVECalleeSaves) { - // If we're doing SVE saves first, we need to immediately allocate space - // for fixed objects, then space for the SVE callee saves. - // - // Windows unwind requires that the scalable size is a multiple of 16; - // that's handled when the callee-saved size is computed. - auto SaveSize = - StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) + - StackOffset::getFixed(FixedObject); - allocateStackSpace(MBB, MBBI, 0, SaveSize, NeedsWinCFI, &HasWinCFI, - /*EmitCFI=*/false, StackOffset{}, - /*FollowupAllocs=*/true); - NumBytes -= FixedObject; - - // Now allocate space for the GPR callee saves. - while (MBBI != End && IsSVECalleeSave(MBBI)) - ++MBBI; - MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( - MBB, MBBI, DL, TII, -AFI->getCalleeSavedStackSize(), NeedsWinCFI, - &HasWinCFI, EmitAsyncCFI); - NumBytes -= AFI->getCalleeSavedStackSize(); - } else if (CombineSPBump) { - assert(!SVEStackSize && "Cannot combine SP bump with SVE"); - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI, - EmitAsyncCFI); - NumBytes = 0; - } else if (HomPrologEpilog) { - // Stack has been already adjusted. 
- NumBytes -= PrologueSaveSize; - } else if (PrologueSaveSize != 0) { - MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( - MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI, - EmitAsyncCFI); - NumBytes -= PrologueSaveSize; - } - assert(NumBytes >= 0 && "Negative stack allocation size!?"); - - // Move past the saves of the callee-saved registers, fixing up the offsets - // and pre-inc if we decided to combine the callee-save and local stack - // pointer bump above. - auto &TLI = *MF.getSubtarget().getTargetLowering(); - while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) && - !IsSVECalleeSave(MBBI)) { - if (CombineSPBump && - // Only fix-up frame-setup load/store instructions. - (!requiresSaveVG(MF) || !isVGInstruction(MBBI, TLI))) - fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), - NeedsWinCFI, &HasWinCFI); - ++MBBI; - } - - // For funclets the FP belongs to the containing function. - if (!IsFunclet && HasFP) { - // Only set up FP if we actually need to. - int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset(); - - if (CombineSPBump) - FPOffset += AFI->getLocalStackSize(); - - if (AFI->hasSwiftAsyncContext()) { - // Before we update the live FP we have to ensure there's a valid (or - // null) asynchronous context in its slot just before FP in the frame - // record, so store it now. - const auto &Attrs = MF.getFunction().getAttributes(); - bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync); - if (HaveInitialContext) - MBB.addLiveIn(AArch64::X22); - Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext)) - .addUse(Reg) - .addUse(AArch64::SP) - .addImm(FPOffset - 8) - .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) { - // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded - // to multiple instructions, should be mutually-exclusive. 
- assert(Subtarget.getTargetTriple().getArchName() != "arm64e"); - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlags(MachineInstr::FrameSetup); - HasWinCFI = true; - } - } - - if (HomPrologEpilog) { - auto Prolog = MBBI; - --Prolog; - assert(Prolog->getOpcode() == AArch64::HOM_Prolog); - Prolog->addOperand(MachineOperand::CreateImm(FPOffset)); - } else { - // Issue sub fp, sp, FPOffset or - // mov fp,sp when FPOffset is zero. - // Note: All stores of callee-saved registers are marked as "FrameSetup". - // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, - StackOffset::getFixed(FPOffset), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); - if (NeedsWinCFI && HasWinCFI) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) - .setMIFlag(MachineInstr::FrameSetup); - // After setting up the FP, the rest of the prolog doesn't need to be - // included in the SEH unwind info. - NeedsWinCFI = false; - } - } - if (EmitAsyncCFI) - emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject); - } - - // Now emit the moves for whatever callee saved regs we have (including FP, - // LR if those are saved). Frame instructions for SVE register are emitted - // later, after the instruction which actually save SVE regs. - if (EmitAsyncCFI) - emitCalleeSavedGPRLocations(MBB, MBBI); - - // Alignment is required for the parent frame, not the funclet - const bool NeedsRealignment = - NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF); - const int64_t RealignmentPadding = - (NeedsRealignment && MFI.getMaxAlign() > Align(16)) - ? MFI.getMaxAlign().value() - 16 - : 0; - - if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) { - if (AFI->getSVECalleeSavedStackSize()) - report_fatal_error( - "SVE callee saves not yet supported with stack probing"); - - // Find an available register to spill the value of X15 to, if X15 is being - // used already for nest. 
- unsigned X15Scratch = AArch64::NoRegister; - const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); - if (llvm::any_of(MBB.liveins(), - [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { - return STI.getRegisterInfo()->isSuperOrSubRegisterEq( - AArch64::X15, LiveIn.PhysReg); - })) { - X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true); - assert(X15Scratch != AArch64::NoRegister && - (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17)); -#ifndef NDEBUG - LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it -#endif - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch) - .addReg(AArch64::XZR) - .addReg(AArch64::X15, RegState::Undef) - .addReg(AArch64::X15, RegState::Implicit) - .setMIFlag(MachineInstr::FrameSetup); - } - - uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4; - if (NeedsWinCFI) { - HasWinCFI = true; - // alloc_l can hold at most 256MB, so assume that NumBytes doesn't - // exceed this amount. We need to move at most 2^24 - 1 into x15. - // This is at most two instructions, MOVZ followed by MOVK. - // TODO: Fix to use multiple stack alloc unwind codes for stacks - // exceeding 256MB in size. 
- if (NumBytes >= (1 << 28)) - report_fatal_error("Stack size cannot exceed 256MB for stack " - "unwinding purposes"); - - uint32_t LowNumWords = NumWords & 0xFFFF; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15) - .addImm(LowNumWords) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) - .setMIFlag(MachineInstr::FrameSetup); - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - if ((NumWords & 0xFFFF0000) != 0) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15) - .addReg(AArch64::X15) - .addImm((NumWords & 0xFFFF0000) >> 16) // High half - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16)) - .setMIFlag(MachineInstr::FrameSetup); - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - } - } else { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15) - .addImm(NumWords) - .setMIFlags(MachineInstr::FrameSetup); - } - - const char *ChkStk = Subtarget.getChkStkName(); - switch (MF.getTarget().getCodeModel()) { - case CodeModel::Tiny: - case CodeModel::Small: - case CodeModel::Medium: - case CodeModel::Kernel: - BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) - .addExternalSymbol(ChkStk) - .addReg(AArch64::X15, RegState::Implicit) - .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead) - .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead) - .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead) - .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - } - break; - case CodeModel::Large: - BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT)) - .addReg(AArch64::X16, RegState::Define) - .addExternalSymbol(ChkStk) - .addExternalSymbol(ChkStk) - .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, 
TII->get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - } - - BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF))) - .addReg(AArch64::X16, RegState::Kill) - .addReg(AArch64::X15, RegState::Implicit | RegState::Define) - .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead) - .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead) - .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead) - .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - } - break; - } - - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP) - .addReg(AArch64::SP, RegState::Kill) - .addReg(AArch64::X15, RegState::Kill) - .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4)) - .setMIFlags(MachineInstr::FrameSetup); - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); - } - NumBytes = 0; - - if (RealignmentPadding > 0) { - if (RealignmentPadding >= 4096) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm)) - .addReg(AArch64::X16, RegState::Define) - .addImm(RealignmentPadding) - .setMIFlags(MachineInstr::FrameSetup); - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15) - .addReg(AArch64::SP) - .addReg(AArch64::X16, RegState::Kill) - .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) - .setMIFlag(MachineInstr::FrameSetup); - } else { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15) - .addReg(AArch64::SP) - .addImm(RealignmentPadding) - .addImm(0) - .setMIFlag(MachineInstr::FrameSetup); - } - - uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1); - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(AArch64::X15, RegState::Kill) - .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)); - 
AFI->setStackRealigned(true); - - // No need for SEH instructions here; if we're realigning the stack, - // we've set a frame pointer and already finished the SEH prologue. - assert(!NeedsWinCFI); - } - if (X15Scratch != AArch64::NoRegister) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15) - .addReg(AArch64::XZR) - .addReg(X15Scratch, RegState::Undef) - .addReg(X15Scratch, RegState::Implicit) - .setMIFlag(MachineInstr::FrameSetup); - } - } - - StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; - MachineBasicBlock::iterator CalleeSavesEnd = MBBI; - - StackOffset CFAOffset = - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); - - // Process the SVE callee-saves to determine what space needs to be - // allocated. - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { - LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize - << "\n"); - SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); - SVELocalsSize = SVEStackSize - SVECalleeSavesSize; - // Find callee save instructions in frame. - // Note: With FPAfterSVECalleeSaves the callee saves have already been - // allocated. - if (!FPAfterSVECalleeSaves) { - MachineBasicBlock::iterator CalleeSavesBegin = MBBI; - assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); - while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator()) - ++MBBI; - CalleeSavesEnd = MBBI; - - StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); - // Allocate space for the callee saves (if any). - allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false, - nullptr, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); - } - } - CFAOffset += SVECalleeSavesSize; - - if (EmitAsyncCFI) - emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); - - // Allocate space for the rest of the frame including SVE locals. Align the - // stack as necessary. 
- assert(!(canUseRedZone(MF) && NeedsRealignment) && - "Cannot use redzone with stack realignment"); - if (!canUseRedZone(MF)) { - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), - NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, - CFAOffset, MFI.hasVarSizedObjects()); - } - - // If we need a base pointer, set it up here. It's whatever the value of the - // stack pointer is at this point. Any variable size objects will be allocated - // after this, so we can still use the base pointer to reference locals. - // - // FIXME: Clarify FrameSetup flags here. - // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is - // needed. - // For funclets the BP belongs to the containing function. - if (!IsFunclet && RegInfo->hasBasePointer(MF)) { - TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP, - false); - if (NeedsWinCFI) { - HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - } - } - - // The very last FrameSetup instruction indicates the end of prologue. Emit a - // SEH opcode indicating the prologue end. - if (NeedsWinCFI && HasWinCFI) { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) - .setMIFlag(MachineInstr::FrameSetup); - } - - // SEH funclets are passed the frame pointer in X1. If the parent - // function uses the base register, then the base register is used - // directly, and is not retrieved from X1. 
- if (IsFunclet && F.hasPersonalityFn()) { - EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); - if (isAsynchronousEHPersonality(Per)) { - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP) - .addReg(AArch64::X1) - .setMIFlag(MachineInstr::FrameSetup); - MBB.addLiveIn(AArch64::X1); - } - } - - if (EmitCFI && !EmitAsyncCFI) { - if (HasFP) { - emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject); - } else { - StackOffset TotalSize = - SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); - CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); - CFIBuilder.insertCFIInst( - createDefCFA(*RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP, - TotalSize, /*LastAdjustmentWasScalable=*/false)); - } - emitCalleeSavedGPRLocations(MBB, MBBI); - emitCalleeSavedSVELocations(MBB, MBBI); - } + AArch64PrologueEmitter PrologueEmitter(MF, MBB, *this); + PrologueEmitter.emitPrologue(); } static bool isFuncletReturnInstr(const MachineInstr &MI) { @@ -2548,15 +1791,15 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, AFI->setLocalStackSize(NumBytes - PrologueSaveSize); if (homogeneousPrologEpilog(MF, &MBB)) { assert(!NeedsWinCFI); - auto LastPopI = MBB.getFirstTerminator(); - if (LastPopI != MBB.begin()) { - auto HomogeneousEpilog = std::prev(LastPopI); + auto FirstHomogenousEpilogI = MBB.getFirstTerminator(); + if (FirstHomogenousEpilogI != MBB.begin()) { + auto HomogeneousEpilog = std::prev(FirstHomogenousEpilogI); if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog) - LastPopI = HomogeneousEpilog; + FirstHomogenousEpilogI = HomogeneousEpilog; } // Adjust local stack - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + emitFrameOffset(MBB, FirstHomogenousEpilogI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(AFI->getLocalStackSize()), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); @@ -2602,17 +1845,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Move past 
the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. - MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); + MachineBasicBlock::iterator FirstGPRRestoreI = MBB.getFirstTerminator(); MachineBasicBlock::iterator Begin = MBB.begin(); - while (LastPopI != Begin) { - --LastPopI; - if (!LastPopI->getFlag(MachineInstr::FrameDestroy) || - (!FPAfterSVECalleeSaves && IsSVECalleeSave(LastPopI))) { - ++LastPopI; + while (FirstGPRRestoreI != Begin) { + --FirstGPRRestoreI; + if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) || + (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) { + ++FirstGPRRestoreI; break; } else if (CombineSPBump) - fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(), - NeedsWinCFI, &HasWinCFI); + fixupCalleeSaveRestoreStackOffset( + *FirstGPRRestoreI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); } if (NeedsWinCFI) { @@ -2622,9 +1865,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // arguments. Insert the SEH_EpilogStart and remove it later if it // we didn't emit any SEH opcodes to avoid generating WinCFI for // functions that don't need it. - BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart)) + BuildMI(MBB, FirstGPRRestoreI, DL, TII->get(AArch64::SEH_EpilogStart)) .setMIFlag(MachineInstr::FrameDestroy); - EpilogStartI = LastPopI; + EpilogStartI = FirstGPRRestoreI; --EpilogStartI; } @@ -2665,7 +1908,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // When we are about to restore the CSRs, the CFA register is SP again. 
if (EmitCFI && hasFP(MF)) - CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy) + CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy) .buildDefCFA(AArch64::SP, NumBytes); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, @@ -2681,18 +1924,19 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Process the SVE callee-saves to determine what space needs to be // deallocated. StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; - MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; + MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI, + RestoreEnd = FirstGPRRestoreI; if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { if (FPAfterSVECalleeSaves) RestoreEnd = MBB.getFirstTerminator(); RestoreBegin = std::prev(RestoreEnd); while (RestoreBegin != MBB.begin() && - IsSVECalleeSave(std::prev(RestoreBegin))) + isSVECalleeSave(std::prev(RestoreBegin))) --RestoreBegin; - assert(IsSVECalleeSave(RestoreBegin) && - IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); + assert(isSVECalleeSave(RestoreBegin) && + isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); StackOffset CalleeSavedSizeAsOffset = StackOffset::getScalable(CalleeSavedSize); @@ -2706,7 +1950,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // deallocates non-callee-save SVE allocations. Otherwise, deallocate // them explicitly. 
if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, DeallocateBefore, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); } @@ -2796,7 +2040,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, StackRestoreBytes += AfterCSRPopSize; emitFrameOffset( - MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(StackRestoreBytes), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI, StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize)); @@ -2816,17 +2060,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // be able to save any instructions. if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { emitFrameOffset( - MBB, LastPopI, DL, AArch64::SP, AArch64::FP, + MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::FP, StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); } else if (NumBytes) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); // When we are about to restore the CSRs, the CFA register is SP again. 
if (EmitCFI && hasFP(MF)) - CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy) + CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy) .buildDefCFA(AArch64::SP, PrologueSaveSize); // This must be placed after the callee-save restore code because that code @@ -2926,8 +2170,8 @@ AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF, return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI)); } -static StackOffset getFPOffset(const MachineFunction &MF, - int64_t ObjectOffset) { +StackOffset AArch64FrameLowering::getFPOffset(const MachineFunction &MF, + int64_t ObjectOffset) const { const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const Function &F = MF.getFunction(); @@ -2940,8 +2184,8 @@ static StackOffset getFPOffset(const MachineFunction &MF, return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust); } -static StackOffset getStackOffset(const MachineFunction &MF, - int64_t ObjectOffset) { +StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF, + int64_t ObjectOffset) const { const auto &MFI = MF.getFrameInfo(); return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize()); } @@ -3139,7 +2383,8 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { return getKillRegState(!IsLiveIn); } -static bool produceCompactUnwindFrame(MachineFunction &MF) { +static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL, + MachineFunction &MF) { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); AttributeList Attrs = MF.getFunction().getAttributes(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -3147,7 +2392,7 @@ static bool produceCompactUnwindFrame(MachineFunction &MF) { !(Subtarget.getTargetLowering()->supportSwiftError() && Attrs.hasAttrSomewhere(Attribute::SwiftError)) && MF.getFunction().getCallingConv() != CallingConv::SwiftTail && - !requiresSaveVG(MF) 
&& !AFI->isSVECC(); + !AFL.requiresSaveVG(MF) && !AFI->isSVECC(); } static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, @@ -3244,16 +2489,18 @@ bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget, (!IsLocallyStreaming && Subtarget.isStreaming())); } -static void computeCalleeSaveRegisterPairs( - MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI, - const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs, - bool NeedsFrameRecord) { +void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, + MachineFunction &MF, + ArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI, + SmallVectorImpl<RegPairInfo> &RegPairs, + bool NeedsFrameRecord) { if (CSI.empty()) return; bool IsWindows = isTargetWindows(MF); - bool NeedsWinCFI = needsWinCFI(MF); + bool NeedsWinCFI = AFL.needsWinCFI(MF); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned StackHazardSize = getStackHazardSize(MF); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -3262,9 +2509,10 @@ static void computeCalleeSaveRegisterPairs( (void)CC; // MachO's compact unwind format relies on all registers being stored in // pairs. - assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost || - CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS || - CC == CallingConv::Win64 || (Count & 1) == 0) && + assert((!produceCompactUnwindFrame(AFL, MF) || + CC == CallingConv::PreserveMost || CC == CallingConv::PreserveAll || + CC == CallingConv::CXX_FAST_TLS || CC == CallingConv::Win64 || + (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); int ByteOffset = AFI->getCalleeSavedStackSize(); int StackFillDir = -1; @@ -3380,9 +2628,9 @@ static void computeCalleeSaveRegisterPairs( // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. 
- assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost || - CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS || - CC == CallingConv::Win64 || + assert((!produceCompactUnwindFrame(AFL, MF) || + CC == CallingConv::PreserveMost || CC == CallingConv::PreserveAll || + CC == CallingConv::CXX_FAST_TLS || CC == CallingConv::Win64 || (RPI.isPaired() && ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || RPI.Reg1 + 1 == RPI.Reg2))) && @@ -3495,7 +2743,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( DebugLoc DL; SmallVector<RegPairInfo, 8> RegPairs; - computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); + computeCalleeSaveRegisterPairs(*this, MF, CSI, TRI, RegPairs, hasFP(MF)); MachineRegisterInfo &MRI = MF.getRegInfo(); // Refresh the reserved regs in case there are any potential changes since the @@ -3707,7 +2955,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); - computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); + computeCalleeSaveRegisterPairs(*this, MF, CSI, TRI, RegPairs, hasFP(MF)); if (homogeneousPrologEpilog(MF, &MBB)) { auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) .setMIFlag(MachineInstr::FrameDestroy); @@ -4141,7 +3389,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (producePairRegisters(MF)) { if (UnspilledCSGPRPaired == AArch64::NoRegister) { // Failed to make a pair for compact unwind format, revert spilling. 
- if (produceCompactUnwindFrame(MF)) { + if (produceCompactUnwindFrame(*this, MF)) { SavedRegs.reset(UnspilledCSGPR); ExtraCSSpill = AArch64::NoRegister; } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 555a93359c27..a9d65441a4e3 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -19,6 +19,10 @@ namespace llvm { +class TargetLowering; +class AArch64FunctionInfo; +class AArch64PrologueEmitter; + class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() @@ -130,12 +134,19 @@ public: return StackId != TargetStackID::ScalableVector; } + friend class AArch64PrologueEmitter; void orderFrameObjects(const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const override; bool isFPReserved(const MachineFunction &MF) const; + bool needsWinCFI(const MachineFunction &MF) const; + + bool requiresSaveVG(const MachineFunction &MF) const; + + StackOffset getSVEStackSize(const MachineFunction &MF) const; + protected: bool hasFPImpl(const MachineFunction &MF) const override; @@ -159,10 +170,6 @@ private: int &MaxCSFrameIndex) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, uint64_t StackBumpBytes) const; - void emitCalleeSavedGPRLocations(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; - void emitCalleeSavedSVELocations(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; void emitCalleeSavedGPRRestores(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const; void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, @@ -196,6 +203,61 @@ private: void emitRemarks(const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const override; + + bool windowsRequiresStackProbe(const MachineFunction &MF, + uint64_t StackSizeInBytes) const; + + bool shouldSignReturnAddressEverywhere(const MachineFunction &MF) const; + + StackOffset 
getFPOffset(const MachineFunction &MF, + int64_t ObjectOffset) const; + + StackOffset getStackOffset(const MachineFunction &MF, + int64_t ObjectOffset) const; + + // Find a scratch register that we can use at the start of the prologue to + // re-align the stack pointer. We avoid using callee-save registers since + // they may appear to be free when this is called from canUseAsPrologue + // (during shrink wrapping), but then no longer be free when this is called + // from emitPrologue. + // + // FIXME: This is a bit conservative, since in the above case we could use one + // of the callee-save registers as a scratch temp to re-align the stack + // pointer, but we would then have to make sure that we were in fact saving at + // least one callee-save register in the prologue, which is additional + // complexity that doesn't seem worth the benefit. + Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, + bool HasCall = false) const; + + // Convert callee-save register save/restore instruction to do stack pointer + // decrement/increment to allocate/deallocate the callee-save stack area by + // converting store/load to use pre/post increment version. + MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc, + bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, + MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup, + int CFAOffset = 0) const; + + // Fixup callee-save register save/restore instructions to take into account + // combined SP bump by adding the local stack size to the stack offsets. 
+ void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, + uint64_t LocalStackSize, + bool NeedsWinCFI, + bool *HasWinCFI) const; + + bool isSVECalleeSave(MachineBasicBlock::iterator I) const; + + /// Returns the size of the fixed object area (allocated next to sp on entry) + /// On Win64 this may include a var args area and an UnwindHelp object for EH. + unsigned getFixedObjectSize(const MachineFunction &MF, + const AArch64FunctionInfo *AFI, bool IsWin64, + bool IsFunclet) const; + + bool isVGInstruction(MachineBasicBlock::iterator MBBI, + const TargetLowering &TLI) const; + + bool requiresGetVGCall(const MachineFunction &MF) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index bc786f415b55..6fdc981fc21a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -246,9 +246,9 @@ public: return false; } - template<MVT::SimpleValueType VT> + template <MVT::SimpleValueType VT, bool Negate> bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) { - return SelectSVEAddSubImm(N, VT, Imm, Shift); + return SelectSVEAddSubImm(N, VT, Imm, Shift, Negate); } template <MVT::SimpleValueType VT, bool Negate> @@ -489,7 +489,8 @@ private: bool SelectCMP_SWAP(SDNode *N); - bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); + bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift, + bool Negate); bool SelectSVEAddSubSSatImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift, bool Negate); bool SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); @@ -4227,35 +4228,36 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { } bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, - SDValue &Shift) { + SDValue &Shift, bool Negate) { if (!isa<ConstantSDNode>(N)) return false; SDLoc DL(N); - uint64_t Val = cast<ConstantSDNode>(N) - 
->getAPIntValue() - .trunc(VT.getFixedSizeInBits()) - .getZExtValue(); + APInt Val = + cast<ConstantSDNode>(N)->getAPIntValue().trunc(VT.getFixedSizeInBits()); + + if (Negate) + Val = -Val; switch (VT.SimpleTy) { case MVT::i8: // All immediates are supported. Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val.getZExtValue(), DL, MVT::i32); return true; case MVT::i16: case MVT::i32: case MVT::i64: // Support 8bit unsigned immediates. - if (Val <= 255) { + if ((Val & ~0xff) == 0) { Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val.getZExtValue(), DL, MVT::i32); return true; } // Support 16bit unsigned immediates that are a multiple of 256. - if (Val <= 65280 && Val % 256 == 0) { + if ((Val & ~0xff00) == 0) { Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(Val >> 8, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val.lshr(8).getZExtValue(), DL, MVT::i32); return true; } break; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d70a46b0e893..5ffaf2c49b4c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1179,6 +1179,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; @@ -1918,6 +1919,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + // Handle non-aliasing elements mask + if (Subtarget->hasSVE2() || + (Subtarget->hasSME() && Subtarget->isStreaming())) { + // FIXME: Support wider fixed-length types when msve-vector-bits is used. 
+ for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) { + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom); + } + for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) { + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom); + } + } + // Handle operations that are only available in non-streaming SVE mode. if (Subtarget->isSVEAvailable()) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64, @@ -2585,6 +2600,30 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( Known = Known.intersectWith(Known2); break; } + case AArch64ISD::CSNEG: + case AArch64ISD::CSINC: + case AArch64ISD::CSINV: { + KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); + KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); + + // The result is either: + // CSINC: KnownOp0 or KnownOp1 + 1 + // CSINV: KnownOp0 or ~KnownOp1 + // CSNEG: KnownOp0 or KnownOp1 * -1 + if (Op.getOpcode() == AArch64ISD::CSINC) + KnownOp1 = KnownBits::add( + KnownOp1, + KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1))); + else if (Op.getOpcode() == AArch64ISD::CSINV) + std::swap(KnownOp1.Zero, KnownOp1.One); + else if (Op.getOpcode() == AArch64ISD::CSNEG) + KnownOp1 = + KnownBits::mul(KnownOp1, KnownBits::makeConstant(APInt::getAllOnes( + Op.getScalarValueSizeInBits()))); + + Known = KnownOp0.intersectWith(KnownOp1); + break; + } case AArch64ISD::BICi: { // Compute the bit cleared value. 
APInt Mask = @@ -2626,6 +2665,32 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( << Op->getConstantOperandVal(1))); break; } + case AArch64ISD::MOVImsl: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1)); + Known = KnownBits::makeConstant(APInt( + Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt))); + break; + } + case AArch64ISD::MOVIedit: { + Known = KnownBits::makeConstant(APInt( + Known.getBitWidth(), + AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0)))); + break; + } + case AArch64ISD::MVNIshift: { + Known = KnownBits::makeConstant( + APInt(Known.getBitWidth(), + ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)), + /*isSigned*/ false, /*implicitTrunc*/ true)); + break; + } + case AArch64ISD::MVNImsl: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1)); + Known = KnownBits::makeConstant( + APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt), + /*isSigned*/ false, /*implicitTrunc*/ true)); + break; + } case AArch64ISD::LOADgot: case AArch64ISD::ADDlow: { if (!Subtarget->isTargetILP32()) @@ -2984,21 +3049,20 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); if (TPIDR2.Uses > 0) { + // Note: This case just needs to do `SVL << 48`. It is not implemented as we + // generally don't support big-endian SVE/SME. + if (!Subtarget->isLittleEndian()) + reportFatalInternalError( + "TPIDR2 block initialization is not supported on big-endian targets"); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - // Store the buffer pointer to the TPIDR2 stack object. - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + // Store buffer pointer and num_za_save_slices. + // Bytes 10-15 are implicitly zeroed. 
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi)) .addReg(MI.getOperand(0).getReg()) + .addReg(MI.getOperand(1).getReg()) .addFrameIndex(TPIDR2.FrameIndex) .addImm(0); - // Set the reserved bytes (10-15) to zero - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) - .addReg(AArch64::WZR) - .addFrameIndex(TPIDR2.FrameIndex) - .addImm(5); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) - .addReg(AArch64::WZR) - .addFrameIndex(TPIDR2.FrameIndex) - .addImm(3); } else MFI.RemoveStackObject(TPIDR2.FrameIndex); @@ -3111,21 +3175,24 @@ MachineBasicBlock * AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const DebugLoc &DL = MI.getDebugLoc(); Register ResultReg = MI.getOperand(0).getReg(); - if (FuncInfo->isPStateSMRegUsed()) { + if (MF->getRegInfo().use_empty(ResultReg)) { + // Nothing to do. Pseudo erased below. 
+ } else if (Subtarget->hasSME()) { + BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg) + .addImm(AArch64SysReg::SVCR) + .addReg(AArch64::VG, RegState::Implicit); + } else { RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) + BuildMI(*BB, MI, DL, TII->get(AArch64::BL)) .addExternalSymbol(getLibcallName(LC)) .addReg(AArch64::X0, RegState::ImplicitDefine) .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg) + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg) .addReg(AArch64::X0); - } else { - assert(MI.getMF()->getRegInfo().use_empty(ResultReg) && - "Expected no users of the entry pstate.sm!"); } MI.eraseFromParent(); return BB; @@ -4912,6 +4979,18 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, if (DstWidth < SatWidth) return SDValue(); + if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) { + if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { + SDValue CVTf32 = + DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal); + SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast, + DAG.getValueType(SatVT)); + } + SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal); + return DAG.getBitcast(DstVT, CVTf32); + } + SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT)); SDValue Sat; @@ -5242,6 +5321,56 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, static MVT getSVEContainerType(EVT ContentTy); +SDValue +AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + uint64_t EltSize = Op.getConstantOperandVal(2); + EVT VT = Op.getValueType(); + switch (EltSize) { + case 1: + if (VT != MVT::v16i8 && VT != MVT::nxv16i1) + return SDValue(); + break; 
+ case 2: + if (VT != MVT::v8i8 && VT != MVT::nxv8i1) + return SDValue(); + break; + case 4: + if (VT != MVT::v4i16 && VT != MVT::nxv4i1) + return SDValue(); + break; + case 8: + if (VT != MVT::v2i32 && VT != MVT::nxv2i1) + return SDValue(); + break; + default: + // Other element sizes are incompatible with whilewr/rw, so expand instead + return SDValue(); + } + + SDValue PtrA = Op.getOperand(0); + SDValue PtrB = Op.getOperand(1); + + if (VT.isScalableVT()) + return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); + + // We can use the SVE whilewr/whilerw instruction to lower this + // intrinsic by creating the appropriate sequence of scalable vector + // operations and then extracting a fixed-width subvector from the scalable + // vector. Scalable vector variants are already legal. + EVT ContainerVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements(), true); + EVT WhileVT = ContainerVT.changeElementType(MVT::i1); + + SDValue Mask = + DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2)); + SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, + DAG.getVectorIdxConstant(0, DL)); +} + SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT OpVT = Op.getValueType(); @@ -6000,6 +6129,38 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); } + case Intrinsic::aarch64_sve_whilewr_b: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_h: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_s: + 
return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(4, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_d: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(8, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_b: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_h: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_s: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(4, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_d: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(8, DL, MVT::i64)); case Intrinsic::aarch64_neon_abs: { EVT Ty = Op.getValueType(); if (Ty == MVT::i64) { @@ -7359,6 +7520,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, default: llvm_unreachable("unimplemented operand"); return SDValue(); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + return LowerLOOP_DEPENDENCE_MASK(Op, DAG); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: @@ -7873,6 +8037,39 @@ static bool isPassedInFPR(EVT VT) { (VT.isFloatingPoint() && !VT.isScalableVector()); } +SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, + SelectionDAG &DAG) const { + assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); + SDValue Glue = Chain.getValue(1); + + MachineFunction &MF = DAG.getMachineFunction(); + SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs(); + + 
// The following conditions are true on entry to an exception handler: + // - PSTATE.SM is 0. + // - PSTATE.ZA is 0. + // - TPIDR2_EL0 is null. + // See: + // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions + // + // Therefore, if the function that contains this exception handler is a + // streaming[-compatible] function, we must re-enable streaming mode. + // + // These mode changes are usually optimized away in catch blocks as they + // occur before the __cxa_begin_catch (which is a non-streaming function), + // but are necessary in some cases (such as for cleanups). + + if (SMEFnAttrs.hasStreamingInterfaceOrBody()) + return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, + /*Glue*/ Glue, AArch64SME::Always); + + if (SMEFnAttrs.hasStreamingCompatibleInterface()) + return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, + AArch64SME::IfCallerIsStreaming); + + return Chain; +} + SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, @@ -8292,7 +8489,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) { + if (getTM().useNewSMEABILowering()) { + if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) { + SDValue Size; + if (Attrs.hasZAState()) { + SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + } else if (Attrs.hasAgnosticZAInterface()) { + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE; + SDValue Callee = DAG.getExternalSymbol( + getLibcallName(LC), getPointerTy(DAG.getDataLayout())); + auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext()); + TargetLowering::CallLoweringInfo CLI(DAG); + 
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( + getLibcallCallingConv(LC), RetTy, Callee, {}); + std::tie(Size, Chain) = LowerCallTo(CLI); + } + if (Size) { + SDValue Buffer = DAG.getNode( + ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), + {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); + Chain = Buffer.getValue(1); + + Register BufferPtr = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); + Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL, + DAG.getVTList(MVT::Other), Chain); + FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + } + } else { // Old SME ABI lowering (deprecated): // Create a 16 Byte TPIDR2 object. The dynamic buffer // will be expanded and stored in the static object later using a @@ -8313,9 +8542,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); MFI.CreateVariableSizedObject(Align(16), nullptr); } + SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); Chain = DAG.getNode( AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), - {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0), + /*Num save slices*/ NumZaSaveSlices}); } else if (Attrs.hasAgnosticZAInterface()) { // Call __arm_sme_state_size(). 
SDValue BufferSize = @@ -8338,7 +8570,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( Register BufferPtr = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); FuncInfo->setSMESaveBufferAddr(BufferPtr); - Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); + Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer); } } @@ -8905,7 +9137,6 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, SmallVector<SDValue> Ops = {Chain, MSROp}; unsigned Opcode; if (Condition != AArch64SME::Always) { - FuncInfo->setPStateSMRegUsed(true); Register PStateReg = FuncInfo->getPStateSMReg(); assert(PStateReg.isValid() && "PStateSM Register is invalid"); SDValue PStateSM = @@ -9078,17 +9309,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Determine whether we need any streaming mode changes. SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); + + std::optional<unsigned> ZAMarkerNode; bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); - bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface(); - auto ZAMarkerNode = [&]() -> std::optional<unsigned> { - // TODO: Handle agnostic ZA functions. - if (!UseNewSMEABILowering || IsAgnosticZAFunction) - return std::nullopt; - if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State()) - return std::nullopt; - return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE - : AArch64ISD::INOUT_ZA_USE; - }(); + if (UseNewSMEABILowering) { + if (CallAttrs.requiresLazySave() || + CallAttrs.requiresPreservingAllZAState()) + ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE; + else if (CallAttrs.caller().hasZAState() || + CallAttrs.caller().hasZT0State()) + ZAMarkerNode = AArch64ISD::INOUT_ZA_USE; + } if (IsTailCall) { // Check if it's really possible to do a tail call. 
@@ -9163,21 +9394,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, }; bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave(); - bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState(); + bool RequiresSaveAllZA = + !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState(); if (RequiresLazySave) { - const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - MachinePointerInfo MPI = - MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue TPIDR2ObjAddr = DAG.getFrameIndex( TPIDR2.FrameIndex, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - SDValue NumZaSaveSlicesAddr = - DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, - DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); - SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, - MPI, MVT::i16); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), @@ -17599,14 +17822,16 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) { bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, Value *LaneMask, ShuffleVectorInst *SVI, - unsigned Factor) const { + unsigned Factor, + const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); auto *SI = dyn_cast<StoreInst>(Store); if (!SI) return false; - assert(!LaneMask && "Unexpected mask on store"); + assert(!LaneMask && GapMask.popcount() == Factor && + "Unexpected mask on store"); auto *VecTy = cast<FixedVectorType>(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); @@ -20868,13 +21093,6 @@ static bool isNegatedInteger(SDValue Op) { return Op.getOpcode() == ISD::SUB && 
isNullConstant(Op.getOperand(0)); } -static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Zero = DAG.getConstant(0, DL, VT); - return DAG.getNode(ISD::SUB, DL, VT, Zero, Op); -} - // Try to fold // // (neg (csel X, Y)) -> (csel (neg X), (neg Y)) @@ -20893,16 +21111,17 @@ static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = CSel.getOperand(0); SDValue N1 = CSel.getOperand(1); - // If both of them is not negations, it's not worth the folding as it + // If neither of them are negations, it's not worth the folding as it // introduces two additional negations while reducing one negation. if (!isNegatedInteger(N0) && !isNegatedInteger(N1)) return SDValue(); - SDValue N0N = getNegatedInteger(N0, DAG); - SDValue N1N = getNegatedInteger(N1, DAG); - SDLoc DL(N); EVT VT = CSel.getValueType(); + + SDValue N0N = DAG.getNegative(N0, DL, VT); + SDValue N1N = DAG.getNegative(N1, DL, VT); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2), CSel.getOperand(3)); } @@ -22087,10 +22306,14 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op); } + unsigned PTest = AArch64ISD::PTEST; + if (Cond == AArch64CC::ANY_ACTIVE) + PTest = AArch64ISD::PTEST_ANY; + else if (Cond == AArch64CC::FIRST_ACTIVE) + PTest = AArch64ISD::PTEST_FIRST; + // Set condition code (CC) flags. - SDValue Test = DAG.getNode( - Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST, - DL, MVT::i32, Pg, Op); + SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op); // Convert CC to integer based on requested condition. // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. 
@@ -22158,6 +22381,17 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, Zero); } +static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode, + SelectionDAG &DAG) { + if (N->getValueType(0) != MVT::i16) + return SDValue(); + + SDLoc DL(N); + SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1)); + SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast); +} + // If a merged operation has no inactive lanes we can relax it to a predicated // or unpredicated operation, which potentially allows better isel (perhaps // using immediate forms) or relaxing register reuse requirements. @@ -22411,6 +22645,26 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_uabd: return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fcvtzs: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtzu: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtas: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtau: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtms: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtmu: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtns: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtnu: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtps: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtpu: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG); case 
Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: return tryCombineCRC32(0xff, N, DAG); @@ -22419,7 +22673,7 @@ static SDValue performIntrinsicCombine(SDNode *N, return tryCombineCRC32(0xffff, N, DAG); case Intrinsic::aarch64_sve_saddv: // There is no i64 version of SADDV because the sign is irrelevant. - if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64) + if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64) return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); else return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG); @@ -24106,6 +24360,7 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { // Ensure that all elements' bits are either 0s or 1s. ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT); + bool IsLE = DAG.getDataLayout().isLittleEndian(); SmallVector<SDValue, 16> MaskConstants; if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() && VecVT == MVT::v16i8) { @@ -24113,7 +24368,10 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { // per entry. We split it into two halves, apply the mask, zip the halves to // create 8x 16-bit values, and the perform the vector reduce. for (unsigned Half = 0; Half < 2; ++Half) { - for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) { + for (unsigned I = 0; I < 8; ++I) { + // On big-endian targets, the lane order in sub-byte vector elements + // gets reversed, so we need to flip the bit index. + unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I)); MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32)); } } @@ -24131,8 +24389,9 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { } // All other vector sizes. - unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1); - for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) { + unsigned NumEl = VecVT.getVectorNumElements(); + for (unsigned I = 0; I < NumEl; ++I) { + unsigned MaskBit = IsLE ? 
(1u << I) : (1u << (NumEl - 1 - I)); MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64)); } @@ -24444,6 +24703,105 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } +static bool +isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) { + if (N->getOpcode() != ISD::CONCAT_VECTORS) + return false; + + unsigned NumParts = N->getNumOperands(); + + // We should be concatenating each sequential result from a + // VECTOR_INTERLEAVE. + SDNode *InterleaveOp = N->getOperand(0).getNode(); + if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE || + InterleaveOp->getNumOperands() != NumParts) + return false; + + for (unsigned I = 0; I < NumParts; I++) + if (N->getOperand(I) != SDValue(InterleaveOp, I)) + return false; + + Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end()); + return true; +} + +static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL, + SDValue WideMask, + unsigned RequiredNumParts) { + if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) { + SmallVector<SDValue, 4> MaskInterleaveOps; + if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(), + MaskInterleaveOps)) + return SDValue(); + + if (MaskInterleaveOps.size() != RequiredNumParts) + return SDValue(); + + // Make sure the inputs to the vector interleave are identical. 
+ if (!llvm::all_equal(MaskInterleaveOps)) + return SDValue(); + + return MaskInterleaveOps[0]; + } + + if (WideMask->getOpcode() != ISD::SPLAT_VECTOR) + return SDValue(); + + ElementCount EC = WideMask.getValueType().getVectorElementCount(); + assert(EC.isKnownMultipleOf(RequiredNumParts) && + "Expected element count divisible by number of parts"); + EC = EC.divideCoefficientBy(RequiredNumParts); + return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC), + WideMask->getOperand(0)); +} + +static SDValue performInterleavedMaskedStoreCombine( + SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N); + SDValue WideValue = MST->getValue(); + + // Bail out if the stored value has an unexpected number of uses, since we'll + // have to perform manual interleaving and may as well just use normal masked + // stores. Also, discard masked stores that are truncating or indexed. + if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) || + !MST->isSimple() || !MST->getOffset().isUndef()) + return SDValue(); + + SmallVector<SDValue, 4> ValueInterleaveOps; + if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(), + ValueInterleaveOps)) + return SDValue(); + + unsigned NumParts = ValueInterleaveOps.size(); + if (NumParts != 2 && NumParts != 4) + return SDValue(); + + // At the moment we're unlikely to see a fixed-width vector interleave as + // we usually generate shuffles instead. + EVT SubVecTy = ValueInterleaveOps[0].getValueType(); + if (!SubVecTy.isScalableVT() || + SubVecTy.getSizeInBits().getKnownMinValue() != 128 || + !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy)) + return SDValue(); + + SDLoc DL(N); + SDValue NarrowMask = + getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts); + if (!NarrowMask) + return SDValue(); + + const Intrinsic::ID IID = + NumParts == 2 ? 
Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4; + SmallVector<SDValue, 8> NewStOps; + NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)}); + NewStOps.append(ValueInterleaveOps); + NewStOps.append({NarrowMask, MST->getBasePtr()}); + return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps); +} + static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -24453,6 +24811,9 @@ static SDValue performMSTORECombine(SDNode *N, SDValue Mask = MST->getMask(); SDLoc DL(N); + if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG)) + return Res; + // If this is a UZP1 followed by a masked store, fold this into a masked // truncating store. We can do this even if this is already a masked // truncstore. @@ -26523,6 +26884,26 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); } + // Sign extend of CSET -> CSETM. + if (Opc == AArch64ISD::CSEL && + cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) { + EVT VT = N->getValueType(0); + SDValue TVal = Src.getOperand(0); + SDValue FVal = Src.getOperand(1); + + // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV + if (isNullConstant(TVal) && isOneConstant(FVal)) + return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, + DAG.getAllOnesConstant(DL, VT), Src.getOperand(2), + Src.getOperand(3)); + + // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV + if (isOneConstant(TVal) && isNullConstant(FVal)) + return DAG.getNode(AArch64ISD::CSEL, DL, VT, + DAG.getAllOnesConstant(DL, VT), FVal, + Src.getOperand(2), Src.getOperand(3)); + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -27020,6 +27401,83 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return NVCAST; } +static SDValue performVectorDeinterleaveCombine( + SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + if 
(!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned NumParts = N->getNumOperands(); + if (NumParts != 2 && NumParts != 4) + return SDValue(); + + EVT SubVecTy = N->getValueType(0); + + // At the moment we're unlikely to see a fixed-width vector deinterleave as + // we usually generate shuffles instead. + unsigned MinNumElements = SubVecTy.getVectorMinNumElements(); + if (!SubVecTy.isScalableVector() || + SubVecTy.getSizeInBits().getKnownMinValue() != 128 || + !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy)) + return SDValue(); + + // Make sure each input operand is the correct extract_subvector of the same + // wider vector. + SDValue Op0 = N->getOperand(0); + for (unsigned I = 0; I < NumParts; I++) { + SDValue OpI = N->getOperand(I); + if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR || + OpI->getOperand(0) != Op0->getOperand(0)) + return SDValue(); + if (OpI->getConstantOperandVal(1) != (I * MinNumElements)) + return SDValue(); + } + + // Normal loads are currently already handled by the InterleavedAccessPass so + // we don't expect to see them here. Bail out if the masked load has an + // unexpected number of uses, since we want to avoid a situation where we have + // both deinterleaving loads and normal loads in the same block. Also, discard + // masked loads that are extending, indexed, have an unexpected offset or have + // an unsupported passthru value until we find a valid use case. + auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0)); + if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) || + !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) || + !MaskedLoad->getOffset().isUndef() || + (!MaskedLoad->getPassThru()->isUndef() && + !isZerosVector(MaskedLoad->getPassThru().getNode()))) + return SDValue(); + + // Now prove that the mask is an interleave of identical masks. 
+ SDLoc DL(N); + SDValue NarrowMask = + getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts); + if (!NarrowMask) + return SDValue(); + + const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret + : Intrinsic::aarch64_sve_ld4_sret; + SDValue NewLdOps[] = {MaskedLoad->getChain(), + DAG.getConstant(IID, DL, MVT::i32), NarrowMask, + MaskedLoad->getBasePtr()}; + SDValue Res; + if (NumParts == 2) + Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + {SubVecTy, SubVecTy, MVT::Other}, NewLdOps); + else + Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other}, + NewLdOps); + + // We can now generate a structured load! + SmallVector<SDValue, 4> ResOps(NumParts); + for (unsigned Idx = 0; Idx < NumParts; Idx++) + ResOps[Idx] = SDValue(Res.getNode(), Idx); + + // Replace uses of the original chain result with the new chain result. + DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1), + SDValue(Res.getNode(), NumParts)); + return DCI.CombineTo(N, ResOps, false); +} + /// If the operand is a bitwise AND with a constant RHS, and the shift has a /// constant RHS and is the only use, we can pull it out of the shift, i.e. /// @@ -27088,6 +27546,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; + case ISD::VECTOR_DEINTERLEAVE: + return performVectorDeinterleaveCombine(N, DCI, DAG); case ISD::VECREDUCE_AND: case ISD::VECREDUCE_OR: case ISD::VECREDUCE_XOR: @@ -30640,10 +31100,41 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } +bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode( + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { + + // TODO: Add more target nodes. 
+ switch (Op.getOpcode()) { + case AArch64ISD::MOVI: + case AArch64ISD::MOVIedit: + case AArch64ISD::MOVImsl: + case AArch64ISD::MOVIshift: + case AArch64ISD::MVNImsl: + case AArch64ISD::MVNIshift: + case AArch64ISD::VASHR: + case AArch64ISD::VLSHR: + case AArch64ISD::VSHL: + return false; + } + return TargetLowering::canCreateUndefOrPoisonForTargetNode( + Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth); +} + bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const { return Op.getOpcode() == AArch64ISD::DUP || Op.getOpcode() == AArch64ISD::MOVI || Op.getOpcode() == AArch64ISD::MOVIshift || + Op.getOpcode() == AArch64ISD::MOVImsl || + Op.getOpcode() == AArch64ISD::MOVIedit || + Op.getOpcode() == AArch64ISD::MVNIshift || + Op.getOpcode() == AArch64ISD::MVNImsl || + // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0), + // ISel will select fmov(mov i64 0x8000000000000000), resulting in a + // fmov from fpr to gpr, which is more expensive than fneg(movi(0)) + (Op.getOpcode() == ISD::FNEG && + Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit && + Op.getOperand(0).getConstantOperandVal(0) == 0) || (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && Op.getOperand(0).getOpcode() == AArch64ISD::DUP) || TargetLowering::isTargetCanonicalConstantNode(Op); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6c6ae782f779..f5d14905cac6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -233,8 +233,8 @@ public: ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, - ShuffleVectorInst *SVI, - unsigned Factor) const override; + ShuffleVectorInst *SVI, unsigned Factor, + const APInt &GapMask) const override; bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override; @@ -575,6 +575,9 @@ 
private: bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override; + SDValue lowerEHPadEntry(SDValue Chain, SDLoc const &DL, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -735,6 +738,7 @@ private: SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; @@ -868,6 +872,12 @@ private: TargetLoweringOpt &TLO, unsigned Depth) const override; + bool canCreateUndefOrPoisonForTargetNode(SDValue Op, + const APInt &DemandedElts, + const SelectionDAG &DAG, + bool PoisonOnly, bool ConsiderFlags, + unsigned Depth) const override; + bool isTargetCanonicalConstantNode(SDValue Op) const override; // With the exception of data-predicate transitions, no instructions are diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 178dab689739..8958ad129269 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1327,6 +1327,8 @@ def move_vec_shift : Operand<i32> { let PrintMethod = "printShifter"; let EncoderMethod = "getMoveVecShifterOpValue"; let ParserMatchClass = MoveVecShifterOperand; + let OperandType = "OPERAND_SHIFT_MSL"; + let OperandNamespace = "AArch64"; } let DiagnosticType = "AddSubSecondSource" in { @@ -3032,8 +3034,12 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype, // Aliases for register+register add/subtract. 
class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype, - RegisterClass src1Regtype, RegisterClass src2Regtype, - int shiftExt> + RegisterClass src1Regtype, dag src2> + : InstAlias<asm#"\t$dst, $src1, $src2", + (inst dstRegtype:$dst, src1Regtype:$src1, src2)>; +class AddSubRegAlias64<string asm, Instruction inst, RegisterClass dstRegtype, + RegisterClass src1Regtype, RegisterClass src2Regtype, + int shiftExt> : InstAlias<asm#"\t$dst, $src1, $src2", (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2, shiftExt)>; @@ -3101,22 +3107,22 @@ multiclass AddSub<bit isSub, string mnemonic, string alias, // Register/register aliases with no shift when SP is not used. def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; + GPR32, GPR32, (arith_shifted_reg32 GPR32:$src2, 0)>; def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; + GPR64, GPR64, (arith_shifted_reg64 GPR64:$src2, 0)>; // Register/register aliases with no shift when either the destination or // first source register is SP. 
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"), - GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 + GPR32sponly, GPR32sp, + (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0 def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"), - GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias<mnemonic, - !cast<Instruction>(NAME#"Xrx64"), - GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 - def : AddSubRegAlias<mnemonic, - !cast<Instruction>(NAME#"Xrx64"), - GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 + GPR32sp, GPR32sponly, + (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0 + def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"), + GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 + def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"), + GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 } multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, @@ -3180,15 +3186,19 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") - WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + WZR, GPR32sp:$src1, + (arith_extended_reg32_i32 GPR32:$src2, arith_extend:$sh)), 4>; def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") - XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + XZR, GPR64sp:$src1, + (arith_extended_reg32_i64 GPR32:$src2, arith_extend:$sh)), 4>; def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") - WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; + WZR, GPR32:$src1, + (arith_shifted_reg32 GPR32:$src2, arith_shift32:$sh)), 4>; def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") - 
XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; + XZR, GPR64:$src1, + (arith_shifted_reg64 GPR64:$src2, arith_shift64:$sh)), 4>; // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") @@ -3198,27 +3208,28 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, // Compare shorthands def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs") - WZR, GPR32:$src1, GPR32:$src2, 0), 5>; + WZR, GPR32:$src1, (arith_shifted_reg32 GPR32:$src2, 0)), 5>; def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs") - XZR, GPR64:$src1, GPR64:$src2, 0), 5>; + XZR, GPR64:$src1, (arith_shifted_reg64 GPR64:$src2, 0)), 5>; def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx") - WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; + WZR, GPR32sponly:$src1, + (arith_extended_reg32_i32 GPR32:$src2, 16)), 5>; def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; // Register/register aliases with no shift when SP is not used. def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; + GPR32, GPR32, (arith_shifted_reg32 GPR32:$src2, 0)>; def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; + GPR64, GPR64, (arith_shifted_reg64 GPR64:$src2, 0)>; // Register/register aliases with no shift when the first source register // is SP. 
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"), - GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias<mnemonic, - !cast<Instruction>(NAME#"Xrx64"), - GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 + GPR32, GPR32sponly, + (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0 + def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"), + GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 } class AddSubG<bit isSub, string asm_inst, SDPatternOperator OpNode> @@ -3403,9 +3414,10 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype, } // Aliases for register+register logical instructions. -class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype> +class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype, + dag op2> : InstAlias<asm#"\t$dst, $src1, $src2", - (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>; + (inst regtype:$dst, regtype:$src1, op2)>; multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, string Alias> { @@ -3477,10 +3489,10 @@ multiclass LogicalReg<bits<2> opc, bit N, string mnemonic, let Inst{31} = 1; } - def : LogicalRegAlias<mnemonic, - !cast<Instruction>(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias<mnemonic, - !cast<Instruction>(NAME#"Xrs"), GPR64>; + def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"), + GPR32, (logical_shifted_reg32 GPR32:$src2, 0)>; + def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"), + GPR64, (logical_shifted_reg64 GPR64:$src2, 0)>; } // Split from LogicalReg to allow setting NZCV Defs @@ -3500,10 +3512,10 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic, } } // Defs = [NZCV] - def : LogicalRegAlias<mnemonic, - !cast<Instruction>(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias<mnemonic, - !cast<Instruction>(NAME#"Xrs"), GPR64>; + def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"), + GPR32, (logical_shifted_reg32 GPR32:$src2, 0)>; + def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"), + 
GPR64, (logical_shifted_reg64 GPR64:$src2, 0)>; } //--- @@ -3991,9 +4003,10 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins, let Inst{4-0} = Rt; } -class ROInstAlias<string asm, DAGOperand regtype, Instruction INST> +class ROInstAlias<string asm, DAGOperand regtype, Instruction INST, + ro_extend ext> : InstAlias<asm # "\t$Rt, [$Rn, $Rm]", - (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; + (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, (ext 0, 0))>; multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { @@ -4019,7 +4032,7 @@ multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend8>; } multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, @@ -4044,7 +4057,7 @@ multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend8>; } class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins, @@ -4091,7 +4104,7 @@ multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend16>; } multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, @@ -4116,7 +4129,7 @@ multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend16>; } class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins, @@ -4163,7 +4176,7 @@ 
multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend32>; } multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, @@ -4188,7 +4201,7 @@ multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend32>; } class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins, @@ -4235,7 +4248,7 @@ multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend64>; } multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, @@ -4260,7 +4273,7 @@ multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend64>; } class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins, @@ -4307,7 +4320,7 @@ multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend128>; } multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, @@ -4328,7 +4341,7 @@ multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype, let Inst{13} = 0b1; } - def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend128>; } let mayLoad = 0, mayStore = 
0, hasSideEffects = 1 in @@ -4377,9 +4390,7 @@ multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> { let Inst{13} = 0b1; } - def : InstAlias<"prfm $Rt, [$Rn, $Rm]", - (!cast<Instruction>(NAME # "roX") prfop:$Rt, - GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; + def : ROInstAlias<"prfm", prfop, !cast<Instruction>(NAME # "roX"), ro_Xextend64>; } //--- diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index db028b4b7677..e56fe90259d5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -91,8 +91,8 @@ static cl::opt<unsigned> GatherOptSearchLimit( "machine-combiner gather pattern optimization")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, - AArch64::CATCHRET), + : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN, + AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified @@ -1299,6 +1299,7 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, break; case AArch64::PTEST_PP: case AArch64::PTEST_PP_ANY: + case AArch64::PTEST_PP_FIRST: SrcReg = MI.getOperand(0).getReg(); SrcReg2 = MI.getOperand(1).getReg(); if (MI.getOperand(2).getSubReg()) @@ -1691,7 +1692,8 @@ bool AArch64InstrInfo::optimizeCompareInstr( } if (CmpInstr.getOpcode() == AArch64::PTEST_PP || - CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY) + CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY || + CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST) return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); if (SrcReg2 != 0) @@ -5075,7 +5077,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } - } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { + } else if 
(SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) { BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); @@ -5202,7 +5204,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); - } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { + } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) { BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); @@ -5318,15 +5320,49 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR64RegClass.contains(DestReg) && AArch64::FPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + if (Subtarget.hasZeroCycleRegMoveFPR128() && + !Subtarget.hasZeroCycleRegMoveFPR64() && + !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::dsub, + &AArch64::FPR128RegClass); + MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::dsub, + &AArch64::FPR128RegClass); + // This instruction is reading and writing Q registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegQ, but a proper + // value from SrcReg. 
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ) + .addReg(SrcRegQ, RegState::Undef) + .addReg(SrcRegQ, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } return; } if (AArch64::FPR32RegClass.contains(DestReg) && AArch64::FPR32RegClass.contains(SrcReg)) { - if (Subtarget.hasZeroCycleRegMoveFPR64() && - !Subtarget.hasZeroCycleRegMoveFPR32()) { + if (Subtarget.hasZeroCycleRegMoveFPR128() && + !Subtarget.hasZeroCycleRegMoveFPR64() && + !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR128RegClass); + MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub, + &AArch64::FPR128RegClass); + // This instruction is reading and writing Q registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegQ, but a proper + // value from SrcReg. 
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ) + .addReg(SrcRegQ, RegState::Undef) + .addReg(SrcRegQ, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else if (Subtarget.hasZeroCycleRegMoveFPR64() && + !Subtarget.hasZeroCycleRegMoveFPR32()) { const TargetRegisterInfo *TRI = &getRegisterInfo(); MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, &AArch64::FPR64RegClass); @@ -5348,8 +5384,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR16RegClass.contains(DestReg) && AArch64::FPR16RegClass.contains(SrcReg)) { - if (Subtarget.hasZeroCycleRegMoveFPR64() && - !Subtarget.hasZeroCycleRegMoveFPR32()) { + if (Subtarget.hasZeroCycleRegMoveFPR128() && + !Subtarget.hasZeroCycleRegMoveFPR64() && + !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR128RegClass); + MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR128RegClass); + // This instruction is reading and writing Q registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegQ, but a proper + // value from SrcReg. 
+ BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ) + .addReg(SrcRegQ, RegState::Undef) + .addReg(SrcRegQ, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else if (Subtarget.hasZeroCycleRegMoveFPR64() && + !Subtarget.hasZeroCycleRegMoveFPR32()) { const TargetRegisterInfo *TRI = &getRegisterInfo(); MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR64RegClass); @@ -5375,8 +5427,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR8RegClass.contains(DestReg) && AArch64::FPR8RegClass.contains(SrcReg)) { - if (Subtarget.hasZeroCycleRegMoveFPR64() && - !Subtarget.hasZeroCycleRegMoveFPR32()) { + if (Subtarget.hasZeroCycleRegMoveFPR128() && + !Subtarget.hasZeroCycleRegMoveFPR64() && + !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR128RegClass); + MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR128RegClass); + // This instruction is reading and writing Q registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegQ, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ) + .addReg(SrcRegQ, RegState::Undef) + .addReg(SrcRegQ, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else if (Subtarget.hasZeroCycleRegMoveFPR64() && + !Subtarget.hasZeroCycleRegMoveFPR32()) { const TargetRegisterInfo *TRI = &getRegisterInfo(); MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR64RegClass); @@ -5403,8 +5471,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copies between GPR64 and FPR64. 
if (AArch64::FPR64RegClass.contains(DestReg) && AArch64::GPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + if (AArch64::XZR == SrcReg) { + BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } return; } if (AArch64::GPR64RegClass.contains(DestReg) && @@ -5416,8 +5488,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copies between GPR32 and FPR32. if (AArch64::FPR32RegClass.contains(DestReg) && AArch64::GPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + if (AArch64::WZR == SrcReg) { + BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } return; } if (AArch64::GPR32RegClass.contains(DestReg) && @@ -6652,7 +6728,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, if (MO.isReg() && MO.getReg().isVirtual()) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). - if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) + if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc) return false; // Must only used by the user we combine with. 
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 89f88776d832..f0020a9a3c91 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -989,6 +989,17 @@ def AArch64fcvtxnv: PatFrags<(ops node:$Rn), [(int_aarch64_neon_fcvtxn node:$Rn), (AArch64fcvtxn_n node:$Rn)]>; +def AArch64fcvtzs_half : SDNode<"AArch64ISD::FCVTZS_HALF", SDTFPExtendOp>; +def AArch64fcvtzu_half : SDNode<"AArch64ISD::FCVTZU_HALF", SDTFPExtendOp>; +def AArch64fcvtas_half : SDNode<"AArch64ISD::FCVTAS_HALF", SDTFPExtendOp>; +def AArch64fcvtau_half : SDNode<"AArch64ISD::FCVTAU_HALF", SDTFPExtendOp>; +def AArch64fcvtms_half : SDNode<"AArch64ISD::FCVTMS_HALF", SDTFPExtendOp>; +def AArch64fcvtmu_half : SDNode<"AArch64ISD::FCVTMU_HALF", SDTFPExtendOp>; +def AArch64fcvtns_half : SDNode<"AArch64ISD::FCVTNS_HALF", SDTFPExtendOp>; +def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>; +def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>; +def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>; + //def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>; // Vector immediate ops @@ -2155,7 +2166,7 @@ let Predicates = [HasPAuth] in { i64imm:$Disc, GPR64:$AddrDisc), [], "$AuthVal = $Val">, Sched<[WriteI, ReadI]> { let isCodeGenOnly = 1; - let hasSideEffects = 0; + let hasSideEffects = 1; let mayStore = 0; let mayLoad = 0; let Size = 32; @@ -2660,13 +2671,17 @@ defm ADD : AddSub<0, "add", "sub", add>; defm SUB : AddSub<1, "sub", "add">; def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; + (ADDWri GPR32sponly:$dst, GPR32sp:$src, + (addsub_shifted_imm32 0, 0))>; def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; + (ADDWri GPR32sp:$dst, GPR32sponly:$src, + (addsub_shifted_imm32 0, 0))>; def : InstAlias<"mov $dst, $src", - 
(ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; + (ADDXri GPR64sponly:$dst, GPR64sp:$src, + (addsub_shifted_imm64 0, 0))>; def : InstAlias<"mov $dst, $src", - (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; + (ADDXri GPR64sp:$dst, GPR64sponly:$src, + (addsub_shifted_imm64 0, 0))>; defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">; defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">; @@ -2726,19 +2741,31 @@ def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; } -def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; -def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"neg $dst, $src", + (SUBWrs GPR32:$dst, WZR, + (arith_shifted_reg32 GPR32:$src, 0)), 3>; +def : InstAlias<"neg $dst, $src", + (SUBXrs GPR64:$dst, XZR, + (arith_shifted_reg64 GPR64:$src, 0)), 3>; def : InstAlias<"neg $dst, $src$shift", - (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; + (SUBWrs GPR32:$dst, WZR, + (arith_shifted_reg32 GPR32:$src, arith_shift32:$shift)), 2>; def : InstAlias<"neg $dst, $src$shift", - (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; - -def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; -def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; + (SUBXrs GPR64:$dst, XZR, + (arith_shifted_reg64 GPR64:$src, arith_shift64:$shift)), 2>; + +def : InstAlias<"negs $dst, $src", + (SUBSWrs GPR32:$dst, WZR, + (arith_shifted_reg32 GPR32:$src, 0)), 3>; +def : InstAlias<"negs $dst, $src", + (SUBSXrs GPR64:$dst, XZR, + (arith_shifted_reg64 GPR64:$src, 0)), 3>; def : InstAlias<"negs $dst, $src$shift", - (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; + (SUBSWrs GPR32:$dst, WZR, + (arith_shifted_reg32 GPR32:$src, arith_shift32:$shift)), 2>; def : InstAlias<"negs $dst, $src$shift", - (SUBSXrs GPR64:$dst, XZR, 
GPR64:$src, arith_shift64:$shift), 2>; + (SUBSXrs GPR64:$dst, XZR, + (arith_shifted_reg64 GPR64:$src, arith_shift64:$shift)), 2>; // Unsigned/Signed divide @@ -3165,16 +3192,26 @@ defm ORN : LogicalReg<0b01, 1, "orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>; defm ORR : LogicalReg<0b01, 0, "orr", or>; -def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; -def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; - -def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; -def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; +def : InstAlias<"mov $dst, $src", + (ORRWrs GPR32:$dst, WZR, + (logical_shifted_reg32 GPR32:$src, 0)), 2>; +def : InstAlias<"mov $dst, $src", + (ORRXrs GPR64:$dst, XZR, + (logical_shifted_reg64 GPR64:$src, 0)), 2>; + +def : InstAlias<"mvn $Wd, $Wm", + (ORNWrs GPR32:$Wd, WZR, + (logical_shifted_reg32 GPR32:$Wm, 0)), 3>; +def : InstAlias<"mvn $Xd, $Xm", + (ORNXrs GPR64:$Xd, XZR, + (logical_shifted_reg64 GPR64:$Xm, 0)), 3>; def : InstAlias<"mvn $Wd, $Wm$sh", - (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; + (ORNWrs GPR32:$Wd, WZR, + (logical_shifted_reg32 GPR32:$Wm, logical_shift32:$sh)), 2>; def : InstAlias<"mvn $Xd, $Xm$sh", - (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; + (ORNXrs GPR64:$Xd, XZR, + (logical_shifted_reg64 GPR64:$Xm, logical_shift64:$sh)), 2>; def : InstAlias<"tst $src1, $src2", (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; @@ -3182,14 +3219,18 @@ def : InstAlias<"tst $src1, $src2", (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; def : InstAlias<"tst $src1, $src2", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; + (ANDSWrs WZR, GPR32:$src1, + (logical_shifted_reg32 GPR32:$src2, 0)), 3>; def : InstAlias<"tst $src1, $src2", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; + (ANDSXrs XZR, GPR64:$src1, + (logical_shifted_reg64 GPR64:$src2, 0)), 3>; def : InstAlias<"tst $src1, $src2$sh", - (ANDSWrs 
WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; + (ANDSWrs WZR, GPR32:$src1, + (logical_shifted_reg32 GPR32:$src2, logical_shift32:$sh)), 2>; def : InstAlias<"tst $src1, $src2$sh", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; + (ANDSXrs XZR, GPR64:$src1, + (logical_shifted_reg64 GPR64:$src2, logical_shift64:$sh)), 2>; def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; @@ -4710,6 +4751,26 @@ let Predicates = [IsLE] in { (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; } +// truncstorei32 of f64 bitcasted to i64 +def : Pat<(truncstorei32 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), + (STRSui (EXTRACT_SUBREG FPR64:$Rt, ssub), GPR64sp:$Rn, uimm12s4:$offset)>; + +// truncstorei16 of f64 bitcasted to i64 +def : Pat<(truncstorei16 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHui (f16 (EXTRACT_SUBREG FPR64:$Rt, hsub)), GPR64sp:$Rn, uimm12s2:$offset)>; + + // truncstorei16 of f32 bitcasted to i32 +def : Pat<(truncstorei16 (i32 (bitconvert (f32 FPR32:$Rt))), (am_indexed16 GPR64sp:$Rn, uimm12s2:$off)), + (STRHui (f16 (EXTRACT_SUBREG FPR32:$Rt, hsub)), GPR64sp:$Rn, uimm12s2:$off)>; + + // truncstorei8 of f64 bitcasted to i64 +def : Pat<(truncstorei8 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed8 GPR64sp:$Rn, uimm12s1:$off)), + (STRBui (aarch64mfp8 (EXTRACT_SUBREG FPR64:$Rt, bsub)), GPR64sp:$Rn, uimm12s1:$off)>; + + // truncstorei8 of f32 bitcasted to i32 +def : Pat<(truncstorei8 (i32 (bitconvert (f32 FPR32:$Rt))), (am_indexed8 GPR64sp:$Rn, uimm12s1:$off)), + (STRBui (aarch64mfp8 (EXTRACT_SUBREG FPR32:$Rt, bsub)), GPR64sp:$Rn, uimm12s1:$off)>; + // truncstore i64 def : Pat<(truncstorei32 GPR64:$Rt, (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), @@ -6536,9 +6597,33 @@ defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; +// f16 -> s16 conversions 
+let Predicates = [HasFullFP16] in { + def : Pat<(i16(fp_to_sint_sat_gi f16:$Rn)), (FCVTZSv1f16 f16:$Rn)>; + def : Pat<(i16(fp_to_uint_sat_gi f16:$Rn)), (FCVTZUv1f16 f16:$Rn)>; +} + def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))), (CMLTv1i64rz V64:$Rn)>; +// f16 -> i16 conversions leave the bit pattern in a f32 +class F16ToI16ScalarPat<SDNode cvt_isd, BaseSIMDTwoScalar instr> + : Pat<(f32 (cvt_isd (f16 FPR16:$Rn))), + (f32 (SUBREG_TO_REG (i64 0), (instr FPR16:$Rn), hsub))>; + +let Predicates = [HasFullFP16] in { +def : F16ToI16ScalarPat<AArch64fcvtzs_half, FCVTZSv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtzu_half, FCVTZUv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtas_half, FCVTASv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtau_half, FCVTAUv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtms_half, FCVTMSv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtmu_half, FCVTMUv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtns_half, FCVTNSv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtnu_half, FCVTNUv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtps_half, FCVTPSv1f16>; +def : F16ToI16ScalarPat<AArch64fcvtpu_half, FCVTPUv1f16>; +} + // Round FP64 to BF16. let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))), @@ -6641,20 +6726,24 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), // Some float -> int -> float conversion patterns for which we want to keep the // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. 
+let HasOneUse = 1 in { +def any_fp_to_sint_oneuse: PatFrag<(ops node:$src0), (any_fp_to_sint $src0)>; +def any_fp_to_uint_oneuse: PatFrag<(ops node:$src0), (any_fp_to_uint $src0)>; +} let Predicates = [HasNEONandIsSME2p2StreamingSafe] in { -def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))), +def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint_oneuse f64:$Rn)))), (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; -def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))), +def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint_oneuse f32:$Rn)))), (SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>; -def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))), +def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint_oneuse f64:$Rn)))), (UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>; -def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))), +def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint_oneuse f32:$Rn)))), (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in { -def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))), +def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint_oneuse f16:$Rn)))), (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; -def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), +def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint_oneuse f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } @@ -8234,6 +8323,29 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; } +// SABA patterns for add(x, abs(y)) -> saba(x, y, 0) +def : Pat<(v8i8 (add V64:$Vn, (abs V64:$Vm))), + (SABAv8i8 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v4i16 (add V64:$Vn, (abs V64:$Vm))), + (SABAv4i16 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v2i32 (add V64:$Vn, (abs V64:$Vm))), + (SABAv2i32 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; 
+def : Pat<(v16i8 (add V128:$Vn, (abs V128:$Vm))), + (SABAv16i8 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>; +def : Pat<(v8i16 (add V128:$Vn, (abs V128:$Vm))), + (SABAv8i16 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>; +def : Pat<(v4i32 (add V128:$Vn, (abs V128:$Vm))), + (SABAv4i32 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>; + +// SABAL patterns for add(x, zext(abs(y))) -> sabal(x, y, 0) +def : Pat<(v8i16 (add V128:$Vn, (zext (abs (v8i8 V64:$Vm))))), + (SABALv8i8_v8i16 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v4i32 (add V128:$Vn, (zext (abs (v4i16 V64:$Vm))))), + (SABALv4i16_v4i32 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; +def : Pat<(v2i64 (add V128:$Vn, (zext (abs (v2i32 V64:$Vm))))), + (SABALv2i32_v2i64 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>; + + //---------------------------------------------------------------------------- // AdvSIMD indexed element //---------------------------------------------------------------------------- diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 1fde87e65a34..993cff112ba8 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -228,9 +228,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // on function entry to record the initial pstate of a function. Register PStateSMReg = MCRegister::NoRegister; - // true if PStateSMReg is used. - bool PStateSMRegUsed = false; - // Has the PNReg used to build PTRUE instruction. // The PTRUE is used for the LD/ST of ZReg pairs in save and restore. unsigned PredicateRegForFillSpill = 0; @@ -238,6 +235,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // Holds the SME function attributes (streaming mode, ZA/ZT0 state). SMEAttrs SMEFnAttrs; + // Holds the TPIDR2 block if allocated early (for Windows/stack probes + // support). 
+ Register EarlyAllocSMESaveBuffer = AArch64::NoRegister; + // Note: The following properties are only used for the old SME ABI lowering: /// The frame-index for the TPIDR2 object used for lazy saves. TPIDR2Object TPIDR2; @@ -256,6 +257,14 @@ public: const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) const override; + void setEarlyAllocSMESaveBuffer(Register Ptr) { + EarlyAllocSMESaveBuffer = Ptr; + } + + Register getEarlyAllocSMESaveBuffer() const { + return EarlyAllocSMESaveBuffer; + } + // Old SME ABI lowering state getters/setters: Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }; void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }; @@ -273,9 +282,6 @@ public: Register getPStateSMReg() const { return PStateSMReg; }; void setPStateSMReg(Register Reg) { PStateSMReg = Reg; }; - unsigned isPStateSMRegUsed() const { return PStateSMRegUsed; }; - void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; }; - bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; diff --git a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp index ff7a0d1faedf..f4a7f774d477 100644 --- a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -237,8 +237,8 @@ static bool isAddressLdStPair(const MachineInstr *FirstMI, } /// Compare and conditional select. -static bool isCCSelectPair(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { +static bool isCmpCSelPair(const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { // 32 bits if (SecondMI.getOpcode() == AArch64::CSELWr) { // Assume the 1st instr to be a wildcard if it is unspecified. @@ -279,6 +279,40 @@ static bool isCCSelectPair(const MachineInstr *FirstMI, return false; } +/// Compare and cset. 
+static bool isCmpCSetPair(const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + if ((SecondMI.getOpcode() == AArch64::CSINCWr && + SecondMI.getOperand(1).getReg() == AArch64::WZR && + SecondMI.getOperand(2).getReg() == AArch64::WZR) || + (SecondMI.getOpcode() == AArch64::CSINCXr && + SecondMI.getOperand(1).getReg() == AArch64::XZR && + SecondMI.getOperand(2).getReg() == AArch64::XZR)) { + // Assume the 1st instr to be a wildcard if it is unspecified. + if (FirstMI == nullptr) + return true; + + if (FirstMI->definesRegister(AArch64::WZR, /*TRI=*/nullptr) || + FirstMI->definesRegister(AArch64::XZR, /*TRI=*/nullptr)) + switch (FirstMI->getOpcode()) { + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + return !AArch64InstrInfo::hasShiftedReg(*FirstMI); + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: + return !AArch64InstrInfo::hasExtendedReg(*FirstMI); + case AArch64::SUBSWri: + case AArch64::SUBSWrr: + case AArch64::SUBSXri: + case AArch64::SUBSXrr: + return true; + } + } + + return false; +} + // Arithmetic and logic. 
static bool isArithmeticLogicPair(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { @@ -465,7 +499,9 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, return true; if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI)) return true; - if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI)) + if (ST.hasFuseCmpCSel() && isCmpCSelPair(FirstMI, SecondMI)) + return true; + if (ST.hasFuseCmpCSet() && isCmpCSetPair(FirstMI, SecondMI)) return true; if (ST.hasFuseArithmeticLogic() && isArithmeticLogicPair(FirstMI, SecondMI)) return true; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 42eaeca906e6..81f5d075729d 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -134,6 +134,8 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, @@ -146,6 +148,8 @@ def TuneA78AE : SubtargetFeature<"a78ae", "ARMProcFamily", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, @@ -158,6 +162,8 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, @@ -169,6 +175,8 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -181,6 +189,8 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715", FeatureCmpBccFusion, FeatureALULSLFast, FeatureFuseAdrpAdd, + 
FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -191,6 +201,8 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720", FeatureCmpBccFusion, FeatureALULSLFast, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -201,6 +213,8 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720", FeatureCmpBccFusion, FeatureALULSLFast, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -212,6 +226,8 @@ def TuneA725 : SubtargetFeature<"cortex-a725", "ARMProcFamily", FeatureCmpBccFusion, FeatureALULSLFast, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -262,6 +278,8 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", "Cortex-X4 ARM processors", [ FeatureALULSLFast, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureFuseAES, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -273,6 +291,8 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", "CortexX925", "Cortex-X925 ARM processors",[ FeatureALULSLFast, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureFuseAES, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -321,7 +341,11 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing, + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128, FeatureZCZeroingFPWorkaround]>; def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", @@ -334,7 +358,11 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", 
FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", "Apple A11", [ @@ -346,7 +374,11 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", "Apple A12", [ @@ -358,7 +390,11 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", "Apple A13", [ @@ -370,7 +406,11 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", "Apple A14", [ @@ -382,12 +422,16 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureFuseAddress, FeatureFuseAES, FeatureFuseArithmeticLogic, - FeatureFuseCCSelect, + FeatureFuseCmpCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + 
FeatureZCZeroingFPR128]>; def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", "Apple A15", [ @@ -399,12 +443,16 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, - FeatureFuseCCSelect, + FeatureFuseCmpCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", "Apple A16", [ @@ -416,12 +464,16 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, - FeatureFuseCCSelect, + FeatureFuseCmpCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", "Apple A17", [ @@ -433,12 +485,16 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, - FeatureFuseCCSelect, + FeatureFuseCmpCSel, FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", "Apple M4", [ @@ -450,12 +506,15 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", FeatureFuseAdrpAdd, FeatureFuseAES, FeatureFuseArithmeticLogic, - FeatureFuseCCSelect, + FeatureFuseCmpCSel, FeatureFuseCryptoEOR, 
FeatureFuseLiterals, FeatureZCRegMoveGPR64, - FeatureZCZeroing - ]>; + FeatureZCRegMoveFPR128, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, + FeatureNoZCZeroingFPR64, + FeatureZCZeroingFPR128]>; def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M3 processors", @@ -463,7 +522,7 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", FeatureForce32BitJumpTables, FeatureFuseAddress, FeatureFuseAES, - FeatureFuseCCSelect, + FeatureFuseCmpCSel, FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureStorePairSuppress, @@ -481,19 +540,21 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", FeatureFuseAddress, FeatureFuseAES, FeatureFuseArithmeticLogic, - FeatureFuseCCSelect, + FeatureFuseCmpCSel, FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureStorePairSuppress, FeatureALULSLFast, FeaturePostRAScheduler, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", "Qualcomm Kryo processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureALULSLFast, FeatureStorePairSuppress]>; @@ -501,7 +562,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", "Qualcomm Falkor processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureStorePairSuppress, FeatureALULSLFast, FeatureSlowSTRQro]>; @@ -526,6 +588,8 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2 "Neoverse N2 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -537,6 +601,8 @@ def TuneNeoverseN3 : SubtargetFeature<"neoversen3", "ARMProcFamily", "NeoverseN3 FeaturePostRAScheduler, FeatureALULSLFast, 
FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -553,6 +619,8 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1 "Neoverse V1 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, @@ -565,6 +633,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 FeatureFuseAES, FeatureCmpBccFusion, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -578,6 +648,8 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3 FeatureFuseAES, FeatureALULSLFast, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureAvoidLDAPUR, @@ -588,6 +660,8 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover FeatureFuseAES, FeatureALULSLFast, FeatureFuseAdrpAdd, + FeatureFuseCmpCSel, + FeatureFuseCmpCSet, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureAvoidLDAPUR, @@ -597,7 +671,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureStorePairSuppress, FeatureALULSLFast]>; diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp new file mode 100644 index 000000000000..af424987b8dd --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -0,0 +1,794 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AArch64PrologueEpilogue.h" +#include "AArch64FrameLowering.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/CFIInstBuilder.h" +#include "llvm/MC/MCContext.h" + +#define DEBUG_TYPE "frame-info" + +STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); + +namespace llvm { + +AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF, + MachineBasicBlock &MBB, + const AArch64FrameLowering &AFL) + : MF(MF), MBB(MBB), F(MF.getFunction()), MFI(MF.getFrameInfo()), + Subtarget(MF.getSubtarget<AArch64Subtarget>()), AFL(AFL), + RegInfo(*Subtarget.getRegisterInfo()) { + TII = Subtarget.getInstrInfo(); + AFI = MF.getInfo<AArch64FunctionInfo>(); + + EmitCFI = AFI->needsDwarfUnwindInfo(MF); + EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF); + HasFP = AFL.hasFP(MF); + NeedsWinCFI = AFL.needsWinCFI(MF); + IsFunclet = MBB.isEHFuncletEntry(); + HomPrologEpilog = AFL.homogeneousPrologEpilog(MF); + +#ifndef NDEBUG + collectBlockLiveins(); +#endif +} + +#ifndef NDEBUG +/// Collect live registers from the end of \p MI's parent up to (including) \p +/// MI in \p LiveRegs. +static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI, + LivePhysRegs &LiveRegs) { + + MachineBasicBlock &MBB = *MI.getParent(); + LiveRegs.addLiveOuts(MBB); + for (const MachineInstr &MI : + reverse(make_range(MI.getIterator(), MBB.instr_end()))) + LiveRegs.stepBackward(MI); +} + +void AArch64PrologueEmitter::collectBlockLiveins() { + // Collect live register from the end of MBB up to the start of the existing + // frame setup instructions. 
+ PrologueEndI = MBB.begin(); + while (PrologueEndI != MBB.end() && + PrologueEndI->getFlag(MachineInstr::FrameSetup)) + ++PrologueEndI; + + if (PrologueEndI != MBB.end()) { + getLivePhysRegsUpTo(*PrologueEndI, RegInfo, LiveRegs); + // Ignore registers used for stack management for now. + LiveRegs.removeReg(AArch64::SP); + LiveRegs.removeReg(AArch64::X19); + LiveRegs.removeReg(AArch64::FP); + LiveRegs.removeReg(AArch64::LR); + + // X0 will be clobbered by a call to __arm_get_current_vg in the prologue. + // This is necessary to spill VG if required where SVE is unavailable, but + // X0 is preserved around this call. + if (AFL.requiresGetVGCall(MF)) + LiveRegs.removeReg(AArch64::X0); + } +} + +void AArch64PrologueEmitter::verifyPrologueClobbers() const { + if (PrologueEndI == MBB.end()) + return; + // Check if any of the newly inserted instructions clobber any of the live registers. + for (MachineInstr &MI : + make_range(MBB.instr_begin(), PrologueEndI->getIterator())) { + for (auto &Op : MI.operands()) + if (Op.isReg() && Op.isDef()) + assert(!LiveRegs.contains(Op.getReg()) && + "live register clobbered by inserted prologue instructions"); + } +} +#endif + +void AArch64PrologueEmitter::determineLocalsStackSize( + uint64_t StackSize, uint64_t PrologueSaveSize) { + AFI->setLocalStackSize(StackSize - PrologueSaveSize); + CombineSPBump = AFL.shouldCombineCSRLocalStackBump(MF, StackSize); +} + +void AArch64PrologueEmitter::emitPrologue() { + const MachineBasicBlock::iterator PrologueBeginI = MBB.begin(); + const MachineBasicBlock::iterator EndI = MBB.end(); + + // At this point, we're going to decide whether or not the function uses a + // redzone. In most cases, the function doesn't have a redzone so let's + // assume that's false and set it to true in the case that there's a redzone. + AFI->setHasRedZone(false); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. 
+ DebugLoc DL; + + if (AFI->shouldSignReturnAddress(MF)) { + // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions + // are inserted by emitPacRetPlusLeafHardening(). + if (!AFL.shouldSignReturnAddressEverywhere(MF)) { + BuildMI(MBB, PrologueBeginI, DL, TII->get(AArch64::PAUTH_PROLOGUE)) + .setMIFlag(MachineInstr::FrameSetup); + } + // AArch64PointerAuth pass will insert SEH_PACSignLR + HasWinCFI |= NeedsWinCFI; + } + + if (AFI->needsShadowCallStackPrologueEpilogue(MF)) { + emitShadowCallStackPrologue(PrologueBeginI, DL); + HasWinCFI |= NeedsWinCFI; + } + + if (EmitCFI && AFI->isMTETagged()) + BuildMI(MBB, PrologueBeginI, DL, TII->get(AArch64::EMITMTETAGGED)) + .setMIFlag(MachineInstr::FrameSetup); + + // We signal the presence of a Swift extended frame to external tools by + // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple + // ORR is sufficient, it is assumed a Swift kernel would initialize the TBI + // bits so that is still true. + if (HasFP && AFI->hasSwiftAsyncContext()) + emitSwiftAsyncContextFramePointer(PrologueBeginI, DL); + + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction().getCallingConv() == CallingConv::GHC) + return; + + // Set tagged base pointer to the requested stack slot. Ideally it should + // match SP value after prologue. + if (std::optional<int> TBPI = AFI->getTaggedBasePointerIndex()) + AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI)); + else + AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + + // getStackSize() includes all the locals in its size calculation. We don't + // include these locals when computing the stack size of a funclet, as they + // are allocated in the parent's stack frame and accessed via the frame + // pointer from the funclet. We only save the callee saved registers in the + // funclet, which are really the callee saved registers of the parent + // function, including the funclet. 
+ int64_t NumBytes = + IsFunclet ? AFL.getWinEHFuncletFrameSize(MF) : MFI.getStackSize(); + if (!AFI->hasStackFrame() && !AFL.windowsRequiresStackProbe(MF, NumBytes)) + return emitEmptyStackFramePrologue(NumBytes, PrologueBeginI, DL); + + bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg()); + unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); + + // Windows unwind can't represent the required stack adjustments if we have + // both SVE callee-saves and dynamic stack allocations, and the frame + // pointer is before the SVE spills. The allocation of the frame pointer + // must be the last instruction in the prologue so the unwinder can restore + // the stack pointer correctly. (And there isn't any unwind opcode for + // `addvl sp, x29, -17`.) + // + // Because of this, we do spills in the opposite order on Windows: first SVE, + // then GPRs. The main side-effect of this is that it makes accessing + // parameters passed on the stack more expensive. + // + // We could consider rearranging the spills for simpler cases. + bool FPAfterSVECalleeSaves = + Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize(); + + if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex()) + reportFatalUsageError("SME hazard padding is not supported on Windows"); + + auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; + // All of the remaining stack allocations are for locals. + determineLocalsStackSize(NumBytes, PrologueSaveSize); + + MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI; + if (FPAfterSVECalleeSaves) { + // If we're doing SVE saves first, we need to immediately allocate space + // for fixed objects, then space for the SVE callee saves. + // + // Windows unwind requires that the scalable size is a multiple of 16; + // that's handled when the callee-saved size is computed. 
+ auto SaveSize = + StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) + + StackOffset::getFixed(FixedObject); + AFL.allocateStackSpace(MBB, PrologueBeginI, 0, SaveSize, NeedsWinCFI, + &HasWinCFI, + /*EmitCFI=*/false, StackOffset{}, + /*FollowupAllocs=*/true); + NumBytes -= FixedObject; + + // Now allocate space for the GPR callee saves. + MachineBasicBlock::iterator MBBI = PrologueBeginI; + while (MBBI != EndI && AFL.isSVECalleeSave(MBBI)) + ++MBBI; + FirstGPRSaveI = AFL.convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, MBBI, DL, TII, -AFI->getCalleeSavedStackSize(), NeedsWinCFI, + &HasWinCFI, EmitAsyncCFI); + NumBytes -= AFI->getCalleeSavedStackSize(); + } else if (CombineSPBump) { + assert(!AFL.getSVEStackSize(MF) && "Cannot combine SP bump with SVE"); + emitFrameOffset(MBB, PrologueBeginI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI, + EmitAsyncCFI); + NumBytes = 0; + } else if (HomPrologEpilog) { + // Stack has been already adjusted. + NumBytes -= PrologueSaveSize; + } else if (PrologueSaveSize != 0) { + FirstGPRSaveI = AFL.convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, PrologueBeginI, DL, TII, -PrologueSaveSize, NeedsWinCFI, + &HasWinCFI, EmitAsyncCFI); + NumBytes -= PrologueSaveSize; + } + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + // Move past the saves of the callee-saved registers, fixing up the offsets + // and pre-inc if we decided to combine the callee-save and local stack + // pointer bump above. + auto &TLI = *MF.getSubtarget().getTargetLowering(); + + MachineBasicBlock::iterator AfterGPRSavesI = FirstGPRSaveI; + while (AfterGPRSavesI != EndI && + AfterGPRSavesI->getFlag(MachineInstr::FrameSetup) && + !AFL.isSVECalleeSave(AfterGPRSavesI)) { + if (CombineSPBump && + // Only fix-up frame-setup load/store instructions. 
+ (!AFL.requiresSaveVG(MF) || !AFL.isVGInstruction(AfterGPRSavesI, TLI))) + AFL.fixupCalleeSaveRestoreStackOffset( + *AfterGPRSavesI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); + ++AfterGPRSavesI; + } + + // For funclets the FP belongs to the containing function. Only set up FP if + // we actually need to. + if (!IsFunclet && HasFP) + emitFramePointerSetup(AfterGPRSavesI, DL, FixedObject); + + // Now emit the moves for whatever callee saved regs we have (including FP, + // LR if those are saved). Frame instructions for SVE register are emitted + // later, after the instruction which actually save SVE regs. + if (EmitAsyncCFI) + emitCalleeSavedGPRLocations(AfterGPRSavesI); + + // Alignment is required for the parent frame, not the funclet + const bool NeedsRealignment = + NumBytes && !IsFunclet && RegInfo.hasStackRealignment(MF); + const int64_t RealignmentPadding = + (NeedsRealignment && MFI.getMaxAlign() > Align(16)) + ? MFI.getMaxAlign().value() - 16 + : 0; + + if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) + emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); + + StackOffset SVEStackSize = AFL.getSVEStackSize(MF); + StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; + MachineBasicBlock::iterator CalleeSavesEnd = AfterGPRSavesI; + + StackOffset CFAOffset = + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); + + // Process the SVE callee-saves to determine what space needs to be + // allocated. + MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize + << "\n"); + SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); + SVELocalsSize = SVEStackSize - SVECalleeSavesSize; + // Find callee save instructions in frame. + // Note: With FPAfterSVECalleeSaves the callee saves have already been + // allocated. 
+ if (!FPAfterSVECalleeSaves) { + MachineBasicBlock::iterator CalleeSavesBegin = AfterGPRSavesI; + assert(AFL.isSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); + while (AFL.isSVECalleeSave(AfterSVESavesI) && + AfterSVESavesI != MBB.getFirstTerminator()) + ++AfterSVESavesI; + CalleeSavesEnd = AfterSVESavesI; + + StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); + // Allocate space for the callee saves (if any). + AFL.allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, + false, nullptr, EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || LocalsSize); + } + } + CFAOffset += SVECalleeSavesSize; + + if (EmitAsyncCFI) + emitCalleeSavedSVELocations(CalleeSavesEnd); + + // Allocate space for the rest of the frame including SVE locals. Align the + // stack as necessary. + assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) && + "Cannot use redzone with stack realignment"); + if (!AFL.canUseRedZone(MF)) { + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + AFL.allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding, + SVELocalsSize + StackOffset::getFixed(NumBytes), + NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, + CFAOffset, MFI.hasVarSizedObjects()); + } + + // If we need a base pointer, set it up here. It's whatever the value of the + // stack pointer is at this point. Any variable size objects will be allocated + // after this, so we can still use the base pointer to reference locals. + // + // FIXME: Clarify FrameSetup flags here. + // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is + // needed. + // For funclets the BP belongs to the containing function. 
+ if (!IsFunclet && RegInfo.hasBasePointer(MF)) { + TII->copyPhysReg(MBB, AfterSVESavesI, DL, RegInfo.getBaseRegister(), + AArch64::SP, false); + if (NeedsWinCFI) { + HasWinCFI = true; + BuildMI(MBB, AfterSVESavesI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + } + } + + // The very last FrameSetup instruction indicates the end of prologue. Emit a + // SEH opcode indicating the prologue end. + if (NeedsWinCFI && HasWinCFI) { + BuildMI(MBB, AfterSVESavesI, DL, TII->get(AArch64::SEH_PrologEnd)) + .setMIFlag(MachineInstr::FrameSetup); + } + + // SEH funclets are passed the frame pointer in X1. If the parent + // function uses the base register, then the base register is used + // directly, and is not retrieved from X1. + if (IsFunclet && F.hasPersonalityFn()) { + EHPersonality Per = classifyEHPersonality(F.getPersonalityFn()); + if (isAsynchronousEHPersonality(Per)) { + BuildMI(MBB, AfterSVESavesI, DL, TII->get(TargetOpcode::COPY), + AArch64::FP) + .addReg(AArch64::X1) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(AArch64::X1); + } + } + + if (EmitCFI && !EmitAsyncCFI) { + if (HasFP) { + emitDefineCFAWithFP(AfterSVESavesI, FixedObject); + } else { + StackOffset TotalSize = + SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); + CFIInstBuilder CFIBuilder(MBB, AfterSVESavesI, MachineInstr::FrameSetup); + CFIBuilder.insertCFIInst( + createDefCFA(RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP, + TotalSize, /*LastAdjustmentWasScalable=*/false)); + } + emitCalleeSavedGPRLocations(AfterSVESavesI); + emitCalleeSavedSVELocations(AfterSVESavesI); + } +} + +void AArch64PrologueEmitter::emitShadowCallStackPrologue( + MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const { + // Shadow call stack prolog: str x30, [x18], #8 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXpost)) + .addReg(AArch64::X18, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::X18) + .addImm(8) + .setMIFlag(MachineInstr::FrameSetup); 
+ + // This instruction also makes x18 live-in to the entry block. + MBB.addLiveIn(AArch64::X18); + + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + + if (EmitCFI) { + // Emit a CFI instruction that causes 8 to be subtracted from the value of + // x18 when unwinding past this frame. + static const char CFIInst[] = { + dwarf::DW_CFA_val_expression, + 18, // register + 2, // length + static_cast<char>(unsigned(dwarf::DW_OP_breg18)), + static_cast<char>(-8) & 0x7f, // addend (sleb128) + }; + CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup) + .buildEscape(StringRef(CFIInst, sizeof(CFIInst))); + } +} + +void AArch64PrologueEmitter::emitSwiftAsyncContextFramePointer( + MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const { + switch (MF.getTarget().Options.SwiftAsyncFramePointer) { + case SwiftAsyncFramePointerMode::DeploymentBased: + if (Subtarget.swiftAsyncContextIsDynamicallySet()) { + // The special symbol below is absolute and has a *value* that can be + // combined with the frame pointer to signal an extended frame. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16) + .addExternalSymbol("swift_async_extendedFramePointerFlags", + AArch64II::MO_GOT); + if (NeedsWinCFI) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlags(MachineInstr::FrameSetup); + HasWinCFI = true; + } + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP) + .addUse(AArch64::FP) + .addUse(AArch64::X16) + .addImm(Subtarget.isTargetILP32() ? 
32 : 0); + if (NeedsWinCFI) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlags(MachineInstr::FrameSetup); + HasWinCFI = true; + } + break; + } + [[fallthrough]]; + + case SwiftAsyncFramePointerMode::Always: + // ORR x29, x29, #0x1000_0000_0000_0000 + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP) + .addUse(AArch64::FP) + .addImm(0x1100) + .setMIFlag(MachineInstr::FrameSetup); + if (NeedsWinCFI) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlags(MachineInstr::FrameSetup); + HasWinCFI = true; + } + break; + + case SwiftAsyncFramePointerMode::Never: + break; + } +} + +void AArch64PrologueEmitter::emitEmptyStackFramePrologue( + int64_t NumBytes, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const { + assert(!HasFP && "unexpected function without stack frame but with FP"); + assert(!AFL.getSVEStackSize(MF) && + "unexpected function without stack frame but with SVE objects"); + // All of the stack allocation is for locals. + AFI->setLocalStackSize(NumBytes); + if (!NumBytes) { + if (NeedsWinCFI && HasWinCFI) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) + .setMIFlag(MachineInstr::FrameSetup); + } + return; + } + // REDZONE: If the stack size is less than 128 bytes, we don't need + // to actually allocate. + if (AFL.canUseRedZone(MF)) { + AFI->setHasRedZone(true); + ++NumRedZoneFunctions; + } else { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + if (EmitCFI) { + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MF.getContext().createTempSymbol(); + // Encode the stack size of the leaf function. 
+ CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup) + .buildDefCFAOffset(NumBytes, FrameLabel); + } + } + + if (NeedsWinCFI) { + HasWinCFI = true; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) + .setMIFlag(MachineInstr::FrameSetup); + } +} + +void AArch64PrologueEmitter::emitFramePointerSetup( + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + unsigned FixedObject) { + int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset(); + if (CombineSPBump) + FPOffset += AFI->getLocalStackSize(); + + if (AFI->hasSwiftAsyncContext()) { + // Before we update the live FP we have to ensure there's a valid (or + // null) asynchronous context in its slot just before FP in the frame + // record, so store it now. + const auto &Attrs = MF.getFunction().getAttributes(); + bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync); + if (HaveInitialContext) + MBB.addLiveIn(AArch64::X22); + Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext)) + .addUse(Reg) + .addUse(AArch64::SP) + .addImm(FPOffset - 8) + .setMIFlags(MachineInstr::FrameSetup); + if (NeedsWinCFI) { + // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded + // to multiple instructions, should be mutually-exclusive. + assert(Subtarget.getTargetTriple().getArchName() != "arm64e"); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlags(MachineInstr::FrameSetup); + HasWinCFI = true; + } + } + + if (HomPrologEpilog) { + auto Prolog = MBBI; + --Prolog; + assert(Prolog->getOpcode() == AArch64::HOM_Prolog); + Prolog->addOperand(MachineOperand::CreateImm(FPOffset)); + } else { + // Issue sub fp, sp, FPOffset or + // mov fp,sp when FPOffset is zero. + // Note: All stores of callee-saved registers are marked as "FrameSetup". + // This code marks the instruction(s) that set the FP also. 
+ emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, + StackOffset::getFixed(FPOffset), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + if (NeedsWinCFI && HasWinCFI) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) + .setMIFlag(MachineInstr::FrameSetup); + // After setting up the FP, the rest of the prolog doesn't need to be + // included in the SEH unwind info. + NeedsWinCFI = false; + } + } + if (EmitAsyncCFI) + emitDefineCFAWithFP(MBBI, FixedObject); +} + +// Define the current CFA rule to use the provided FP. +void AArch64PrologueEmitter::emitDefineCFAWithFP( + MachineBasicBlock::iterator MBBI, unsigned FixedObject) const { + const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo(); + const int OffsetToFirstCalleeSaveFromFP = + AFI->getCalleeSaveBaseToFrameRecordOffset() - + AFI->getCalleeSavedStackSize(); + Register FramePtr = TRI->getFrameRegister(MF); + CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup) + .buildDefCFA(FramePtr, FixedObject - OffsetToFirstCalleeSaveFromFP); +} + +void AArch64PrologueEmitter::emitWindowsStackProbe( + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t &NumBytes, + int64_t RealignmentPadding) const { + if (AFI->getSVECalleeSavedStackSize()) + report_fatal_error("SVE callee saves not yet supported with stack probing"); + + // Find an available register to spill the value of X15 to, if X15 is being + // used already for nest. 
+ unsigned X15Scratch = AArch64::NoRegister; + const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); + if (llvm::any_of(MBB.liveins(), + [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { + return STI.getRegisterInfo()->isSuperOrSubRegisterEq( + AArch64::X15, LiveIn.PhysReg); + })) { + X15Scratch = AFL.findScratchNonCalleeSaveRegister(&MBB, /*HasCall=*/true); + assert(X15Scratch != AArch64::NoRegister && + (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17)); +#ifndef NDEBUG + LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it +#endif + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch) + .addReg(AArch64::XZR) + .addReg(AArch64::X15, RegState::Undef) + .addReg(AArch64::X15, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + } + + uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4; + if (NeedsWinCFI) { + HasWinCFI = true; + // alloc_l can hold at most 256MB, so assume that NumBytes doesn't + // exceed this amount. We need to move at most 2^24 - 1 into x15. + // This is at most two instructions, MOVZ followed by MOVK. + // TODO: Fix to use multiple stack alloc unwind codes for stacks + // exceeding 256MB in size. 
+ if (NumBytes >= (1 << 28)) + report_fatal_error("Stack size cannot exceed 256MB for stack " + "unwinding purposes"); + + uint32_t LowNumWords = NumWords & 0xFFFF; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15) + .addImm(LowNumWords) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + if ((NumWords & 0xFFFF0000) != 0) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15) + .addReg(AArch64::X15) + .addImm((NumWords & 0xFFFF0000) >> 16) // High half + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16)) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + } + } else { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15) + .addImm(NumWords) + .setMIFlags(MachineInstr::FrameSetup); + } + + const char *ChkStk = Subtarget.getChkStkName(); + switch (MF.getTarget().getCodeModel()) { + case CodeModel::Tiny: + case CodeModel::Small: + case CodeModel::Medium: + case CodeModel::Kernel: + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addExternalSymbol(ChkStk) + .addReg(AArch64::X15, RegState::Implicit) + .addReg(AArch64::X16, + RegState::Implicit | RegState::Define | RegState::Dead) + .addReg(AArch64::X17, + RegState::Implicit | RegState::Define | RegState::Dead) + .addReg(AArch64::NZCV, + RegState::Implicit | RegState::Define | RegState::Dead) + .setMIFlags(MachineInstr::FrameSetup); + if (NeedsWinCFI) { + HasWinCFI = true; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + } + break; + case CodeModel::Large: + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT)) + .addReg(AArch64::X16, RegState::Define) + .addExternalSymbol(ChkStk) + .addExternalSymbol(ChkStk) + .setMIFlags(MachineInstr::FrameSetup); + if (NeedsWinCFI) { + HasWinCFI = true; + BuildMI(MBB, 
MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF))) + .addReg(AArch64::X16, RegState::Kill) + .addReg(AArch64::X15, RegState::Implicit | RegState::Define) + .addReg(AArch64::X16, + RegState::Implicit | RegState::Define | RegState::Dead) + .addReg(AArch64::X17, + RegState::Implicit | RegState::Define | RegState::Dead) + .addReg(AArch64::NZCV, + RegState::Implicit | RegState::Define | RegState::Dead) + .setMIFlags(MachineInstr::FrameSetup); + if (NeedsWinCFI) { + HasWinCFI = true; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + } + break; + } + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP) + .addReg(AArch64::SP, RegState::Kill) + .addReg(AArch64::X15, RegState::Kill) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4)) + .setMIFlags(MachineInstr::FrameSetup); + if (NeedsWinCFI) { + HasWinCFI = true; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + NumBytes = 0; + + if (RealignmentPadding > 0) { + if (RealignmentPadding >= 4096) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm)) + .addReg(AArch64::X16, RegState::Define) + .addImm(RealignmentPadding) + .setMIFlags(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15) + .addReg(AArch64::SP) + .addReg(AArch64::X16, RegState::Kill) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15) + .addReg(AArch64::SP) + .addImm(RealignmentPadding) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + } + + uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(AArch64::X15, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)); + 
AFI->setStackRealigned(true); + + // No need for SEH instructions here; if we're realigning the stack, + // we've set a frame pointer and already finished the SEH prologue. + assert(!NeedsWinCFI); + } + if (X15Scratch != AArch64::NoRegister) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15) + .addReg(AArch64::XZR) + .addReg(X15Scratch, RegState::Undef) + .addReg(X15Scratch, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + } +} + +void AArch64PrologueEmitter::emitCalleeSavedGPRLocations( + MachineBasicBlock::iterator MBBI) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + if (CSI.empty()) + return; + + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); + for (const auto &Info : CSI) { + unsigned FrameIdx = Info.getFrameIdx(); + if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) + continue; + + assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); + int64_t Offset = MFI.getObjectOffset(FrameIdx) - AFL.getOffsetOfLocalArea(); + CFIBuilder.buildOffset(Info.getReg(), Offset); + } +} + +void AArch64PrologueEmitter::emitCalleeSavedSVELocations( + MachineBasicBlock::iterator MBBI) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Add callee saved registers to move list. 
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); + + std::optional<int64_t> IncomingVGOffsetFromDefCFA; + if (AFL.requiresSaveVG(MF)) { + auto IncomingVG = *find_if( + reverse(CSI), [](auto &Info) { return Info.getReg() == AArch64::VG; }); + IncomingVGOffsetFromDefCFA = MFI.getObjectOffset(IncomingVG.getFrameIdx()) - + AFL.getOffsetOfLocalArea(); + } + + for (const auto &Info : CSI) { + if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector) + continue; + + // Not all unwinders may know about SVE registers, so assume the lowest + // common denominator. + assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); + MCRegister Reg = Info.getReg(); + if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg)) + continue; + + StackOffset Offset = + StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - + StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI)); + + CFIBuilder.insertCFIInst( + createCFAOffset(TRI, Reg, Offset, IncomingVGOffsetFromDefCFA)); + } +} + +} // namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h new file mode 100644 index 000000000000..94029ede60c7 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -0,0 +1,111 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the AArch64PrologueEmitter class, +/// which is is used to emit the prologue on AArch64. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PROLOGUEEPILOGUE_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64PROLOGUEEPILOGUE_H + +#include "AArch64RegisterInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +class AArch64Subtarget; +class AArch64FunctionInfo; +class AArch64FrameLowering; + +/// A helper class for emitting the prologue. Substantial new functionality +/// should be factored into a new method. Where possible "emit*" methods should +/// be const, and any flags that change how the prologue is emitted should be +/// set in the constructor. +class AArch64PrologueEmitter { +public: + AArch64PrologueEmitter(MachineFunction &MF, MachineBasicBlock &MBB, + const AArch64FrameLowering &AFL); + + /// Emit the prologue. 
+ void emitPrologue(); + + ~AArch64PrologueEmitter() { + MF.setHasWinCFI(HasWinCFI); +#ifndef NDEBUG + verifyPrologueClobbers(); +#endif + } + +private: + void emitShadowCallStackPrologue(MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const; + + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const; + + void emitEmptyStackFramePrologue(int64_t NumBytes, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) const; + + void emitFramePointerSetup(MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, unsigned FixedObject); + + void emitDefineCFAWithFP(MachineBasicBlock::iterator MBBI, + unsigned FixedObject) const; + + void emitWindowsStackProbe(MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, int64_t &NumBytes, + int64_t RealignmentPadding) const; + + void emitCalleeSavedGPRLocations(MachineBasicBlock::iterator MBBI) const; + void emitCalleeSavedSVELocations(MachineBasicBlock::iterator MBBI) const; + + void determineLocalsStackSize(uint64_t StackSize, uint64_t PrologueSaveSize); + + MachineFunction &MF; + MachineBasicBlock &MBB; + + const Function &F; + const MachineFrameInfo &MFI; + const AArch64Subtarget &Subtarget; + const AArch64FrameLowering &AFL; + const AArch64RegisterInfo &RegInfo; + +#ifndef NDEBUG + mutable LivePhysRegs LiveRegs{RegInfo}; + MachineBasicBlock::iterator PrologueEndI; + + void collectBlockLiveins(); + void verifyPrologueClobbers() const; +#endif + + // Prologue flags. These generally should not change outside of the + // constructor. Two exceptions are "CombineSPBump" which is set in + // determineLocalsStackSize, and "NeedsWinCFI" which is set in + // emitFramePointerSetup. + bool EmitCFI = false; + bool EmitAsyncCFI = false; + bool HasFP = false; + bool IsFunclet = false; + bool CombineSPBump = false; + bool HomPrologEpilog = false; + bool NeedsWinCFI = false; + + // Note: "HasWinCFI" is mutable as it can change in any "emit" function. 
+ mutable bool HasWinCFI = false; + + const TargetInstrInfo *TII = nullptr; + AArch64FunctionInfo *AFI = nullptr; +}; + +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 1a7609bfee8a..431ed6ec34e7 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -983,7 +983,7 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size, // Note: This hardware mode is enabled in AArch64Subtarget::getHwModeSet() // (without the use of the table-gen'd predicates). -def SMEWithZPRPredicateSpills : HwMode<"", [Predicate<"false">]>; +def SMEWithZPRPredicateSpills : HwMode<[Predicate<"false">]>; def PPRSpillFillRI : RegInfoByHwMode< [DefaultMode, SMEWithZPRPredicateSpills], diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 125225df1546..601dc34d74b9 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -61,10 +61,10 @@ let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)), (AllocateZABuffer $size)>; -def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1, - [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>; +def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPMayStore]>; let usesCustomInserter = 1 in { - def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {} + def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer, GPR64:$save_slices), [(AArch64InitTPIDR2Obj GPR64:$buffer, GPR64:$save_slices)]>, Sched<[WriteI]> {} } // Nodes to allocate a save buffer for SME. 
@@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in { def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; } +def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>; + def CommitZASavePseudo : Pseudo<(outs), (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>, @@ -108,6 +110,11 @@ def AArch64_requires_za_save [SDNPHasChain, SDNPInGlue]>; def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>; +def AArch64_sme_state_alloc + : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>, + [SDNPHasChain]>; +def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>; + //===----------------------------------------------------------------------===// // Instruction naming conventions. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index eeb47b4d9975..7604ffdc9f64 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -412,6 +412,7 @@ def SDT_AArch64PTest : SDTypeProfile<1, 2, [ ]>; def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>; def AArch64ptest_any : SDNode<"AArch64ISD::PTEST_ANY", SDT_AArch64PTest>; +def AArch64ptest_first : SDNode<"AArch64ISD::PTEST_FIRST", SDT_AArch64PTest>; def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0, 1>]>; @@ -650,7 +651,7 @@ let Predicates = [HasSVE_or_SME, UseExperimentalZeroingPseudos] in { let Predicates = [HasSVE_or_SME] in { defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; - defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; + defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, add>; defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr", AArch64subr>; defm SQADD_ZI : sve_int_arith_imm0_ssat<0b100, "sqadd", saddsat, ssubsat>; defm UQADD_ZI : sve_int_arith_imm0<0b101, 
"uqadd", uaddsat>; @@ -1071,7 +1072,7 @@ let Predicates = [HasSVE_or_SME] in { defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>; defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>; - defm PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest, AArch64ptest_any>; + defm PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest, AArch64ptest_any, AArch64ptest_first>; defm PFALSE : sve_int_pfalse<0b000000, "pfalse">; defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; @@ -4141,8 +4142,8 @@ let Predicates = [HasSVE2_or_SME] in { defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, get_active_lane_mask>; // SVE2 pointer conflict compare - defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; - defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">; + defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", loop_dependence_war_mask>; + defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", loop_dependence_raw_mask>; } // End HasSVE2_or_SME let Predicates = [HasSVEAES, HasNonStreamingSVE_or_SSVE_AES] in { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 0f4f0129e9cd..98e0a1180510 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -91,6 +91,10 @@ static cl::opt<bool> EnableZPRPredicateSpills( cl::desc( "Enables spilling/reloading SVE predicates as data vectors (ZPRs)")); +static cl::opt<unsigned> + VScaleForTuningOpt("sve-vscale-for-tuning", cl::Hidden, + cl::desc("Force a vscale for tuning factor for SVE")); + // Subreg liveness tracking is disabled by default for now until all issues // are ironed out. This option allows the feature to be used in tests. 
static cl::opt<bool> @@ -364,6 +368,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize) MinimumJumpTableEntries = AArch64MinimumJumpTableEntries; + if (VScaleForTuningOpt.getNumOccurrences() > 0) + VScaleForTuning = VScaleForTuningOpt; } AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU, diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 01c0bcc3a6a7..671df35cd379 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -246,8 +246,8 @@ public: /// Return true if the CPU supports any kind of instruction fusion. bool hasFusion() const { return hasArithmeticBccFusion() || hasArithmeticCbzFusion() || - hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCCSelect() || - hasFuseAdrpAdd() || hasFuseLiterals(); + hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCmpCSel() || + hasFuseCmpCSet() || hasFuseAdrpAdd() || hasFuseLiterals(); } unsigned getEpilogueVectorizationMinVF() const { diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 1b0e90b0e0dc..65b752ed40c9 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -362,7 +362,7 @@ def lookupTSBByName : SearchIndex { let Key = ["Name"]; } -def : TSB<"csync", 0>; +def : TSB<"csync", 2>; //===----------------------------------------------------------------------===// // PRFM (prefetch) instruction options. 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index e67bd5869ccd..4650b2d0c815 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -589,7 +589,8 @@ void AArch64TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerLateLoopOptimizationsEPCallback( [=](LoopPassManager &LPM, OptimizationLevel Level) { - LPM.addPass(LoopIdiomVectorizePass()); + if (Level != OptimizationLevel::O0) + LPM.addPass(LoopIdiomVectorizePass()); }); if (getTargetTriple().isOSWindows()) PB.registerPipelineEarlySimplificationEPCallback( diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 490f6391c15a..92321a76dbd8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/Debug.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <algorithm> #include <optional> @@ -4409,6 +4410,32 @@ AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, return 1; } +/// Check whether Opcode1 has less throughput according to the scheduling +/// model than Opcode2. +bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel( + unsigned Opcode1, unsigned Opcode2) const { + const MCSchedModel &Sched = ST->getSchedModel(); + const TargetInstrInfo *TII = ST->getInstrInfo(); + if (!Sched.hasInstrSchedModel()) + return false; + + const MCSchedClassDesc *SCD1 = + Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass()); + const MCSchedClassDesc *SCD2 = + Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass()); + // We cannot handle variant scheduling classes without an MI. 
If we need to + // support them for any of the instructions we query the information of we + // might need to add a way to resolve them without a MI or not use the + // scheduling info. + assert(!SCD1->isVariant() && !SCD2->isVariant() && + "Cannot handle variant scheduling classes without an MI"); + if (!SCD1->isValid() || !SCD2->isValid()) + return false; + + return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) > + MCSchedModel::getReciprocalThroughput(*ST, *SCD2); +} + InstructionCost AArch64TTIImpl::getCmpSelInstrCost( unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, @@ -4506,6 +4533,12 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ)) Factor = 3; // fcmxx+fcmyy+or + if (isa<ScalableVectorType>(ValTy) && + CostKind == TTI::TCK_RecipThroughput && + hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S, + AArch64::FCMEQv4f32)) + Factor *= 2; + return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first); } @@ -4937,6 +4970,23 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, if (!L->getExitBlock()) return; + // Check if the loop contains any reductions that could be parallelized when + // unrolling. If so, enable partial unrolling, if the trip count is know to be + // a multiple of 2. 
+ bool HasParellelizableReductions = + L->getNumBlocks() == 1 && + any_of(L->getHeader()->phis(), + [&SE, L](PHINode &Phi) { + return canParallelizeReductionWhenUnrolling(Phi, L, &SE); + }) && + isLoopSizeWithinBudget(L, TTI, 12, nullptr); + if (HasParellelizableReductions && + SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) { + UP.Partial = true; + UP.MaxCount = 4; + UP.AddAdditionalAccumulators = true; + } + const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) || (SE.getSmallConstantMaxTripCount(L) > 0 && @@ -4952,6 +5002,12 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // Limit to loops with trip counts that are cheap to expand. UP.SCEVExpansionBudget = 1; + if (HasParellelizableReductions) { + UP.Runtime = true; + UP.DefaultUnrollRuntimeCount = 4; + UP.AddAdditionalAccumulators = true; + } + // Try to unroll small loops, of few-blocks with low budget, if they have // load/store dependencies, to expose more parallel memory access streams, // or if they do little work inside a block (i.e. load -> X -> store pattern). 
@@ -5486,13 +5542,14 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost( } InstructionCost -AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, - VectorType *VecTy, +AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, + Type *ResTy, VectorType *VecTy, TTI::TargetCostKind CostKind) const { EVT VecVT = TLI->getValueType(DL, VecTy); EVT ResVT = TLI->getValueType(DL, ResTy); - if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) { + if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() && + RedOpcode == Instruction::Add) { std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy); // The legal cases with dotprod are @@ -5503,7 +5560,8 @@ AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, return LT.first + 2; } - return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind); + return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy, + CostKind); } InstructionCost @@ -5750,11 +5808,14 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp); bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; - // A subvector extract can be implemented with an ext (or trivial extract, if - // from lane 0). This currently only handles low or high extracts to prevent - // SLP vectorizer regressions. + // A subvector extract can be implemented with a NEON/SVE ext (or trivial + // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors. + // This currently only handles low or high extracts to prevent SLP vectorizer + // regressions. + // Note that SVE's ext instruction is destructive, but it can be fused with + // a movprfx to act like a constructive instruction. 
if (IsExtractSubvector && LT.second.isFixedLengthVector()) { - if (LT.second.is128BitVector() && + if (LT.second.getFixedSizeInBits() >= 128 && cast<FixedVectorType>(SubTp)->getNumElements() == LT.second.getVectorNumElements() / 2) { if (Index == 0) @@ -6017,9 +6078,15 @@ static bool containsDecreasingPointers(Loop *TheLoop, return false; } -bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const { +bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const { if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences()) return SVEPreferFixedOverScalableIfEqualCost; + // For cases like post-LTO vectorization, when we eventually know the trip + // count, epilogue with fixed-width vectorization can be deleted if the trip + // count is less than the epilogue iterations. That's why we prefer + // fixed-width vectorization in epilogue in case of equal costs. + if (IsEpilogue) + return true; return ST->useFixedOverScalableIfEqualCost(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 42ae962b3b42..fe2e849258e3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -174,6 +174,11 @@ public: bool prefersVectorizedAddressing() const override; + /// Check whether Opcode1 has less throughput according to the scheduling + /// model than Opcode2. 
+ bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, + unsigned Opcode2) const; + InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, @@ -424,7 +429,7 @@ public: return TailFoldingStyle::DataWithoutLaneMask; } - bool preferFixedOverScalableIfEqualCost() const override; + bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override; unsigned getEpilogueVectorizationMinVF() const override; @@ -460,7 +465,7 @@ public: TTI::TargetCostKind CostKind) const override; InstructionCost getMulAccReductionCost( - bool IsUnsigned, Type *ResTy, VectorType *Ty, + bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override; InstructionCost diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 803943fd57c4..a8185358d6df 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -7,7 +7,8 @@ tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler + -ignore-non-decodable-operands) tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel) tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner @@ -91,6 +92,7 @@ add_llvm_target(AArch64CodeGen SVEIntrinsicOpts.cpp MachineSMEABIPass.cpp AArch64SIMDInstrOpt.cpp + AArch64PrologueEpilogue.cpp DEPENDS intrinsics_gen @@ -107,6 +109,7 @@ add_llvm_target(AArch64CodeGen Core GlobalISel MC + Passes Scalar SelectionDAG Support diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp 
b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 323db2a0728e..aa1c1c882e22 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -35,308 +35,14 @@ using namespace llvm::MCD; // Pull DecodeStatus and its enum values into the global namespace. using DecodeStatus = MCDisassembler::DecodeStatus; -// Forward declare these because the autogenerated code will reference them. -// Definitions are further down. -template <unsigned RegClassID, unsigned FirstReg, unsigned NumRegsInClass> -static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); -template <unsigned Min, unsigned Max> -static DecodeStatus DecodeZPRMul2_MinMax(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeZK(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); -template <unsigned Min, unsigned Max> -static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeZPR4Mul4RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -template <unsigned NumBitsForTile> -static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus 
DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePCRelLabel9(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, 
- const MCDisassembler *Decoder); -static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeSystemPStateImm0_15Instruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeSystemPStateImm0_1Instruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus 
DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSyspXzrInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const MCDisassembler *Decoder); template <int Bits> static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address, const MCDisassembler *Decoder); -template <int ElementWidth> -static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const MCDisassembler 
*Decoder); - -#include "AArch64GenDisassemblerTables.inc" -#include "AArch64GenInstrInfo.inc" #define Success MCDisassembler::Success #define Fail MCDisassembler::Fail #define SoftFail MCDisassembler::SoftFail -static MCDisassembler *createAArch64Disassembler(const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - - return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo()); -} - -DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, - ArrayRef<uint8_t> Bytes, - uint64_t Address, - raw_ostream &CS) const { - CommentStream = &CS; - - Size = 0; - // We want to read exactly 4 bytes of data. - if (Bytes.size() < 4) - return Fail; - Size = 4; - - // Encoded as a small-endian 32-bit word in the stream. - uint32_t Insn = - (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); - - const uint8_t *Tables[] = {DecoderTable32, DecoderTableFallback32}; - - for (const auto *Table : Tables) { - DecodeStatus Result = - decodeInstruction(Table, MI, Insn, Address, this, STI); - - const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); - - // For Scalable Matrix Extension (SME) instructions that have an implicit - // operand for the accumulator (ZA) or implicit immediate zero which isn't - // encoded, manually insert operand. 
- for (unsigned i = 0; i < Desc.getNumOperands(); i++) { - if (Desc.operands()[i].OperandType == MCOI::OPERAND_REGISTER) { - switch (Desc.operands()[i].RegClass) { - default: - break; - case AArch64::MPRRegClassID: - MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA)); - break; - case AArch64::MPR8RegClassID: - MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0)); - break; - case AArch64::ZTRRegClassID: - MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZT0)); - break; - } - } else if (Desc.operands()[i].OperandType == - AArch64::OPERAND_IMPLICIT_IMM_0) { - MI.insert(MI.begin() + i, MCOperand::createImm(0)); - } - } - - if (MI.getOpcode() == AArch64::LDR_ZA || - MI.getOpcode() == AArch64::STR_ZA) { - // Spill and fill instructions have a single immediate used for both - // the vector select offset and optional memory offset. Replicate - // the decoded immediate. - const MCOperand &Imm4Op = MI.getOperand(2); - assert(Imm4Op.isImm() && "Unexpected operand type!"); - MI.addOperand(Imm4Op); - } - - if (Result != MCDisassembler::Fail) - return Result; - } - - return MCDisassembler::Fail; -} - -uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes, - uint64_t Address) const { - // AArch64 instructions are always 4 bytes wide, so there's no point - // in skipping any smaller number of bytes if an instruction can't - // be decoded. 
- return 4; -} - -static MCSymbolizer * -createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, - void *DisInfo, MCContext *Ctx, - std::unique_ptr<MCRelocationInfo> &&RelInfo) { - return new AArch64ExternalSymbolizer(*Ctx, std::move(RelInfo), GetOpInfo, - SymbolLookUp, DisInfo); -} - -extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void -LLVMInitializeAArch64Disassembler() { - TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(), - createAArch64Disassembler); - TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(), - createAArch64Disassembler); - TargetRegistry::RegisterMCSymbolizer(getTheAArch64leTarget(), - createAArch64ExternalSymbolizer); - TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(), - createAArch64ExternalSymbolizer); - TargetRegistry::RegisterMCDisassembler(getTheAArch64_32Target(), - createAArch64Disassembler); - TargetRegistry::RegisterMCSymbolizer(getTheAArch64_32Target(), - createAArch64ExternalSymbolizer); - - TargetRegistry::RegisterMCDisassembler(getTheARM64Target(), - createAArch64Disassembler); - TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(), - createAArch64ExternalSymbolizer); - TargetRegistry::RegisterMCDisassembler(getTheARM64_32Target(), - createAArch64Disassembler); - TargetRegistry::RegisterMCSymbolizer(getTheARM64_32Target(), - createAArch64ExternalSymbolizer); -} - template <unsigned RegClassID, unsigned FirstReg, unsigned NumRegsInClass> static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -492,11 +198,7 @@ static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm, static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { - int64_t ImmVal = Imm; - - // Sign-extend 19-bit immediate. 
- if (ImmVal & (1 << (19 - 1))) - ImmVal |= ~((1LL << 19) - 1); + int64_t ImmVal = SignExtend64<19>(Imm); if (!Decoder->tryAddingSymbolicOperand( Inst, ImmVal * 4, Addr, Inst.getOpcode() != AArch64::LDRXl, 0, 0, 4)) @@ -506,11 +208,7 @@ static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, static DecodeStatus DecodePCRelLabel9(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { - int64_t ImmVal = Imm; - - // Sign-extend 9-bit immediate. - if (ImmVal & (1 << (9 - 1))) - ImmVal |= ~((1LL << 9) - 1); + int64_t ImmVal = SignExtend64<9>(Imm); if (!Decoder->tryAddingSymbolicOperand(Inst, (ImmVal * 4), Addr, /*IsBranch=*/true, 0, 0, 4)) @@ -827,12 +525,7 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); - int64_t offset = fieldFromInstruction(insn, 12, 9); - - // offset is a 9-bit signed immediate, so sign extend it to - // fill the unsigned. - if (offset & (1 << (9 - 1))) - offset |= ~((1LL << 9) - 1); + int64_t offset = SignExtend64<9>(fieldFromInstruction(insn, 12, 9)); // First operand is always the writeback to the address register, if needed. switch (Inst.getOpcode()) { @@ -1129,14 +822,9 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - int64_t offset = fieldFromInstruction(insn, 15, 7); + int64_t offset = SignExtend64<7>(fieldFromInstruction(insn, 15, 7)); bool IsLoad = fieldFromInstruction(insn, 22, 1); - // offset is a 7-bit signed immediate, so sign extend it to - // fill the unsigned. 
- if (offset & (1 << (7 - 1))) - offset |= ~((1LL << 7) - 1); - unsigned Opcode = Inst.getOpcode(); bool NeedsDisjointWritebackTransfer = false; @@ -1505,12 +1193,8 @@ static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); - int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; - imm |= fieldFromInstruction(insn, 29, 2); - - // Sign-extend the 21-bit immediate. - if (imm & (1 << (21 - 1))) - imm |= ~((1LL << 21) - 1); + int64_t imm = SignExtend64<21>((fieldFromInstruction(insn, 5, 19) << 2) | + fieldFromInstruction(insn, 29, 2)); DecodeSimpleRegisterClass<AArch64::GPR64RegClassID, 0, 32>(Inst, Rd, Addr, Decoder); @@ -1564,11 +1248,7 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn, uint64_t Addr, const MCDisassembler *Decoder) { - int64_t imm = fieldFromInstruction(insn, 0, 26); - - // Sign-extend the 26-bit immediate. - if (imm & (1 << (26 - 1))) - imm |= ~((1LL << 26) - 1); + int64_t imm = SignExtend64<26>(fieldFromInstruction(insn, 0, 26)); if (!Decoder->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(imm)); @@ -1631,11 +1311,7 @@ static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, uint64_t Rt = fieldFromInstruction(insn, 0, 5); uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; bit |= fieldFromInstruction(insn, 19, 5); - int64_t dst = fieldFromInstruction(insn, 5, 14); - - // Sign-extend 14-bit immediate. 
- if (dst & (1 << (14 - 1))) - dst |= ~((1LL << 14) - 1); + int64_t dst = SignExtend64<14>(fieldFromInstruction(insn, 5, 14)); if (fieldFromInstruction(insn, 31, 1) == 0) DecodeSimpleRegisterClass<AArch64::GPR32RegClassID, 0, 32>(Inst, Rt, Addr, @@ -1856,3 +1532,129 @@ static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn, return Success; } + +static DecodeStatus +DecodeSMESpillFillInstruction(MCInst &Inst, uint32_t Bits, uint64_t Addr, + const MCDisassembler *Decoder) { + unsigned RvBits = fieldFromInstruction(Bits, 13, 2); + unsigned RnBits = fieldFromInstruction(Bits, 5, 5); + unsigned Imm4Bits = fieldFromInstruction(Bits, 0, 4); + + DecodeSimpleRegisterClass<AArch64::MatrixIndexGPR32_12_15RegClassID, 0, 4>( + Inst, RvBits, Addr, Decoder); + Inst.addOperand(MCOperand::createImm(Imm4Bits)); + DecodeSimpleRegisterClass<AArch64::GPR64spRegClassID, 0, 32>(Inst, RnBits, + Addr, Decoder); + // Spill and fill instructions have a single immediate used for both + // the vector select offset and optional memory offset. Replicate + // the decoded immediate. + Inst.addOperand(MCOperand::createImm(Imm4Bits)); + return Success; +} + +#include "AArch64GenDisassemblerTables.inc" +#include "AArch64GenInstrInfo.inc" + +static MCDisassembler *createAArch64Disassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + + return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo()); +} + +DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &CS) const { + CommentStream = &CS; + + Size = 0; + // We want to read exactly 4 bytes of data. + if (Bytes.size() < 4) + return Fail; + Size = 4; + + // Encoded as a small-endian 32-bit word in the stream. 
+ uint32_t Insn = + (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); + + const uint8_t *Tables[] = {DecoderTable32, DecoderTableFallback32}; + + for (const auto *Table : Tables) { + DecodeStatus Result = + decodeInstruction(Table, MI, Insn, Address, this, STI); + + const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); + + // For Scalable Matrix Extension (SME) instructions that have an implicit + // operand for the accumulator (ZA) or implicit immediate zero which isn't + // encoded, manually insert operand. + for (unsigned i = 0; i < Desc.getNumOperands(); i++) { + if (Desc.operands()[i].OperandType == MCOI::OPERAND_REGISTER) { + switch (Desc.operands()[i].RegClass) { + default: + break; + case AArch64::MPRRegClassID: + MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA)); + break; + case AArch64::MPR8RegClassID: + MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0)); + break; + case AArch64::ZTRRegClassID: + MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZT0)); + break; + } + } else if (Desc.operands()[i].OperandType == + AArch64::OPERAND_IMPLICIT_IMM_0) { + MI.insert(MI.begin() + i, MCOperand::createImm(0)); + } + } + + if (Result != MCDisassembler::Fail) + return Result; + } + + return MCDisassembler::Fail; +} + +uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes, + uint64_t Address) const { + // AArch64 instructions are always 4 bytes wide, so there's no point + // in skipping any smaller number of bytes if an instruction can't + // be decoded. 
+ return 4; +} + +static MCSymbolizer * +createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo, MCContext *Ctx, + std::unique_ptr<MCRelocationInfo> &&RelInfo) { + return new AArch64ExternalSymbolizer(*Ctx, std::move(RelInfo), GetOpInfo, + SymbolLookUp, DisInfo); +} + +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAArch64Disassembler() { + TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(), + createAArch64Disassembler); + TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(), + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(getTheAArch64leTarget(), + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(), + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCDisassembler(getTheAArch64_32Target(), + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(getTheAArch64_32Target(), + createAArch64ExternalSymbolizer); + + TargetRegistry::RegisterMCDisassembler(getTheARM64Target(), + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(), + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCDisassembler(getTheARM64_32Target(), + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(getTheARM64_32Target(), + createAArch64ExternalSymbolizer); +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 0bceb322726d..5748556d0728 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -6608,45 +6608,6 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, switch (IntrinID) { default: break; - case Intrinsic::aarch64_crypto_sha1h: { - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg = I.getOperand(2).getReg(); - - // 
FIXME: Should this be an assert? - if (MRI.getType(DstReg).getSizeInBits() != 32 || - MRI.getType(SrcReg).getSizeInBits() != 32) - return false; - - // The operation has to happen on FPRs. Set up some new FPR registers for - // the source and destination if they are on GPRs. - if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) { - SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - MIB.buildCopy({SrcReg}, {I.getOperand(2)}); - - // Make sure the copy ends up getting constrained properly. - RBI.constrainGenericRegister(I.getOperand(2).getReg(), - AArch64::GPR32RegClass, MRI); - } - - if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) - DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); - - // Actually insert the instruction. - auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg}); - constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI); - - // Did we create a new register for the destination? - if (DstReg != I.getOperand(0).getReg()) { - // Yep. Copy the result of the instruction back into the original - // destination. 
- MIB.buildCopy({I.getOperand(0)}, {DstReg}); - RBI.constrainGenericRegister(I.getOperand(0).getReg(), - AArch64::GPR32RegClass, MRI); - } - - I.eraseFromParent(); - return true; - } case Intrinsic::ptrauth_resign: { Register DstReg = I.getOperand(0).getReg(); Register ValReg = I.getOperand(2).getReg(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 210643f6f2f4..ff09b375c310 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -222,7 +222,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v2s64, v2s64) .moreElementsToNextPow2(0) .minScalarSameAs(1, 0) - .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) + .minScalarEltSameAsIf(isVector(0), 1, 0) + .maxScalarEltSameAsIf(isVector(0), 1, 0); getActionDefinitionsBuilder(G_PTR_ADD) .legalFor({{p0, s64}, {v2p0, v2s64}}) @@ -879,8 +881,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v2s32, v2s32}, {v4s32, v4s32}, {v2s64, v2s64}}) - .legalFor(HasFP16, - {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) + .legalFor( + HasFP16, + {{s16, s16}, {s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) // Handle types larger than i64 by scalarizing/lowering. 
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) @@ -1150,7 +1153,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampMaxNumElements(1, s32, 4) .clampMaxNumElements(1, s16, 8) .clampMaxNumElements(1, s8, 16) - .clampMaxNumElements(1, p0, 2); + .clampMaxNumElements(1, p0, 2) + .scalarizeIf(scalarOrEltWiderThan(1, 64), 1); getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) .legalIf( @@ -1165,7 +1169,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) .clampMaxNumElements(0, s64, 2) - .clampMaxNumElements(0, p0, 2); + .clampMaxNumElements(0, p0, 2) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); getActionDefinitionsBuilder(G_BUILD_VECTOR) .legalFor({{v8s8, s8}, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 1b919abd222e..62de86bf87f5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -482,6 +482,10 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI, case Intrinsic::aarch64_neon_sqrdmulh: case Intrinsic::aarch64_neon_sqadd: case Intrinsic::aarch64_neon_sqsub: + case Intrinsic::aarch64_crypto_sha1h: + case Intrinsic::aarch64_crypto_sha1c: + case Intrinsic::aarch64_crypto_sha1p: + case Intrinsic::aarch64_crypto_sha1m: return true; case Intrinsic::aarch64_neon_saddlv: { const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); @@ -848,10 +852,20 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; break; } + case TargetOpcode::G_FPTOSI_SAT: + case TargetOpcode::G_FPTOUI_SAT: { + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + if (DstType.isVector()) + break; + if (DstType == LLT::scalar(16)) { + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + break; + } + 
OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; + break; + } case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: - case TargetOpcode::G_FPTOSI_SAT: - case TargetOpcode::G_FPTOUI_SAT: case TargetOpcode::G_INTRINSIC_LRINT: case TargetOpcode::G_INTRINSIC_LLRINT: if (MRI.getType(MI.getOperand(0).getReg()).isVector()) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 54b58e948daf..2552ee300933 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -365,13 +365,6 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address, return; } - // Instruction TSB is specified as a one operand instruction, but 'csync' is - // not encoded, so for printing it is treated as a special case here: - if (Opcode == AArch64::TSB) { - O << "\ttsb\tcsync"; - return; - } - if (!PrintAliases || !printAliasInstr(MI, Address, STI, O)) printInstruction(MI, Address, STI, O); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 828c5c546240..2b5cf3484ffc 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -53,9 +53,9 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = { {AArch64::S_MACHO_TLVPPAGEOFF, "TLVPPAGEOFF"}, }; -StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) { +StringRef AArch64::getSpecifierName(AArch64::Specifier S) { // clang-format off - switch (static_cast<uint32_t>(Expr.getSpecifier())) { + switch (static_cast<uint32_t>(S)) { case AArch64::S_CALL: return ""; case AArch64::S_LO12: return ":lo12:"; case AArch64::S_ABS_G3: return ":abs_g3:"; @@ -124,7 +124,7 @@ static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res, if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm)) return false; 
Res.setSpecifier(Expr.getSpecifier()); - return true; + return !Res.getSubSym(); } AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) { @@ -183,7 +183,7 @@ void AArch64MCAsmInfoDarwin::printSpecifierExpr( raw_ostream &OS, const MCSpecifierExpr &Expr) const { if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr)) return AE->print(OS, this); - OS << AArch64::getSpecifierName(Expr); + OS << AArch64::getSpecifierName(Expr.getSpecifier()); printExpr(OS, *Expr.getSubExpr()); } @@ -232,7 +232,7 @@ void AArch64MCAsmInfoELF::printSpecifierExpr( raw_ostream &OS, const MCSpecifierExpr &Expr) const { if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr)) return AE->print(OS, this); - OS << AArch64::getSpecifierName(Expr); + OS << AArch64::getSpecifierName(Expr.getSpecifier()); printExpr(OS, *Expr.getSubExpr()); } @@ -262,7 +262,7 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() { void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr( raw_ostream &OS, const MCSpecifierExpr &Expr) const { - OS << AArch64::getSpecifierName(Expr); + OS << AArch64::getSpecifierName(Expr.getSpecifier()); printExpr(OS, *Expr.getSubExpr()); } @@ -292,7 +292,7 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() { void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr( raw_ostream &OS, const MCSpecifierExpr &Expr) const { - OS << AArch64::getSpecifierName(Expr); + OS << AArch64::getSpecifierName(Expr.getSpecifier()); printExpr(OS, *Expr.getSubExpr()); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index c28e925d77e2..0dfa61b1dc60 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -181,7 +181,7 @@ enum { /// Return the string representation of the ELF relocation specifier /// (e.g. ":got:", ":lo12:"). 
-StringRef getSpecifierName(const MCSpecifierExpr &Expr); +StringRef getSpecifierName(Specifier S); inline Specifier getSymbolLoc(Specifier S) { return static_cast<Specifier>(S & AArch64::S_SymLocBits); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 91bdc880998b..7774d07a214b 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -65,15 +65,16 @@ void initLLVMToCVRegMapping(MCRegisterInfo *MRI); bool isHForm(const MCInst &MI, const MCInstrInfo *MCII); bool isQForm(const MCInst &MI, const MCInstrInfo *MCII); bool isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII); -} +} // namespace AArch64_MC namespace AArch64 { enum OperandType { OPERAND_IMPLICIT_IMM_0 = MCOI::OPERAND_FIRST_TARGET, + OPERAND_SHIFT_MSL, }; } // namespace AArch64 -} // End llvm namespace +} // namespace llvm // Defines symbolic names for AArch64 registers. This defines a mapping from // register name to register number. 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index a53b676142a0..5fe999389ce7 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -73,9 +73,10 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType( // Supported break; default: - Ctx.reportError(Fixup.getLoc(), "relocation specifier " + - AArch64::getSpecifierName(*A64E) + - " unsupported on COFF targets"); + Ctx.reportError(Fixup.getLoc(), + "relocation specifier " + + AArch64::getSpecifierName(A64E->getSpecifier()) + + " unsupported on COFF targets"); return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value } } @@ -83,9 +84,10 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType( switch (FixupKind) { default: { if (auto *A64E = dyn_cast<MCSpecifierExpr>(Expr)) { - Ctx.reportError(Fixup.getLoc(), "relocation specifier " + - AArch64::getSpecifierName(*A64E) + - " unsupported on COFF targets"); + Ctx.reportError(Fixup.getLoc(), + "relocation specifier " + + AArch64::getSpecifierName(A64E->getSpecifier()) + + " unsupported on COFF targets"); } else { MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind()); Ctx.reportError(Fixup.getLoc(), Twine("relocation type ") + Info.Name + diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index b58dfdf32e4a..c39a5cc2fcb1 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This pass implements the SME ABI requirements for ZA state. This includes -// implementing the lazy ZA state save schemes around calls. +// implementing the lazy (and agnostic) ZA state save schemes around calls. 
// //===----------------------------------------------------------------------===// // @@ -139,8 +139,8 @@ StringRef getZAStateString(ZAState State) { #undef MAKE_CASE } -static bool isZAorZT0RegOp(const TargetRegisterInfo &TRI, - const MachineOperand &MO) { +static bool isZAorZTRegOp(const TargetRegisterInfo &TRI, + const MachineOperand &MO) { if (!MO.isReg() || !MO.getReg().isPhysical()) return false; return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) { @@ -166,7 +166,7 @@ getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI, return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt}; for (auto &MO : MI.operands()) { - if (isZAorZT0RegOp(TRI, MO)) + if (isZAorZTRegOp(TRI, MO)) return {ZAState::ACTIVE, InsertPt}; } @@ -215,9 +215,44 @@ struct MachineSMEABI : public MachineFunctionPass { void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool ClearTPIDR2); + // Emission routines for agnostic ZA functions. + void emitSetupFullZASave(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs); + // Emit a "full" ZA save or restore. It is "full" in the sense that this + // function will emit a call to __arm_sme_save or __arm_sme_restore, which + // handles saving and restoring both ZA and ZT0. + void emitFullZASaveRestore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs, bool IsSave); + void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs); + void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ZAState From, ZAState To, LiveRegs PhysLiveRegs); + // Helpers for switching between lazy/full ZA save/restore routines. 
+ void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs) { + if (AFI->getSMEFnAttrs().hasAgnosticZAInterface()) + return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true); + return emitSetupLazySave(MBB, MBBI); + } + void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs) { + if (AFI->getSMEFnAttrs().hasAgnosticZAInterface()) + return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false); + return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs); + } + void emitAllocateZASaveBuffer(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs) { + if (AFI->getSMEFnAttrs().hasAgnosticZAInterface()) + return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs); + return emitAllocateLazySaveBuffer(MBB, MBBI); + } + /// Save live physical registers to virtual registers. PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL); @@ -228,6 +263,8 @@ struct MachineSMEABI : public MachineFunctionPass { /// Get or create a TPIDR2 block in this function. TPIDR2State getTPIDR2Block(); + Register getAgnosticZABufferPtr(); + private: /// Contains the needed ZA state (and live registers) at an instruction. 
struct InstInfo { @@ -241,6 +278,7 @@ private: struct BlockInfo { ZAState FixedEntryState{ZAState::ANY}; SmallVector<InstInfo> Insts; + LiveRegs PhysLiveRegsAtEntry = LiveRegs::None; LiveRegs PhysLiveRegsAtExit = LiveRegs::None; }; @@ -249,24 +287,29 @@ private: SmallVector<BlockInfo> Blocks; SmallVector<ZAState> BundleStates; std::optional<TPIDR2State> TPIDR2Block; + std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt; + Register AgnosticZABufferPtr = AArch64::NoRegister; + LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None; } State; MachineFunction *MF = nullptr; EdgeBundles *Bundles = nullptr; const AArch64Subtarget *Subtarget = nullptr; const AArch64RegisterInfo *TRI = nullptr; + const AArch64FunctionInfo *AFI = nullptr; const TargetInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; }; void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { - assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) && + assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() || + SMEFnAttrs.hasZAState()) && "Expected function to have ZA/ZT0 state!"); State.Blocks.resize(MF->getNumBlockIDs()); for (MachineBasicBlock &MBB : *MF) { BlockInfo &Block = State.Blocks[MBB.getNumber()]; - if (&MBB == &MF->front()) { + if (MBB.isEntryBlock()) { // Entry block: Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface() ? ZAState::CALLER_DORMANT @@ -294,10 +337,20 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { Block.PhysLiveRegsAtExit = GetPhysLiveRegs(); auto FirstTerminatorInsertPt = MBB.getFirstTerminator(); + auto FirstNonPhiInsertPt = MBB.getFirstNonPHI(); for (MachineInstr &MI : reverse(MBB)) { MachineBasicBlock::iterator MBBI(MI); LiveUnits.stepBackward(MI); LiveRegs PhysLiveRegs = GetPhysLiveRegs(); + // The SMEStateAllocPseudo marker is added to a function if the save + // buffer was allocated in SelectionDAG. 
It marks the end of the + // allocation -- which is a safe point for this pass to insert any TPIDR2 + // block setup. + if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) { + State.AfterSMEProloguePt = MBBI; + State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs; + } + // Note: We treat Agnostic ZA as inout_za with an alternate save/restore. auto [NeededState, InsertPt] = getZAStateBeforeInst( *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface()); assert((InsertPt == MBBI || @@ -306,6 +359,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { // TODO: Do something to avoid state changes where NZCV is live. if (MBBI == FirstTerminatorInsertPt) Block.PhysLiveRegsAtExit = PhysLiveRegs; + if (MBBI == FirstNonPhiInsertPt) + Block.PhysLiveRegsAtEntry = PhysLiveRegs; if (NeededState != ZAState::ANY) Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs}); } @@ -529,23 +584,25 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, void MachineSMEABI::emitAllocateLazySaveBuffer( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineFrameInfo &MFI = MF->getFrameInfo(); - DebugLoc DL = getDebugLoc(MBB, MBBI); Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass); Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass); - Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register Buffer = AFI->getEarlyAllocSMESaveBuffer(); // Calculate SVL. BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1); // 1. Allocate the lazy save buffer. - { - // TODO This function grows the stack with a subtraction, which doesn't work - // on Windows. Some refactoring to share the functionality in - // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI - // supports SME + if (Buffer == AArch64::NoRegister) { + // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so + // Buffer != AArch64::NoRegister). 
This is done to reuse the existing + // expansions (which can insert stack checks). This works, but it means we + // will always allocate the lazy save buffer (even if the function contains + // no lazy saves). If we want to handle Windows here, we'll need to + // implement something similar to LowerWindowsDYNAMIC_STACKALLOC. assert(!Subtarget->isTargetWindows() && "Lazy ZA save is not yet supported on Windows"); + Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass); // Get original stack pointer. BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP) .addReg(AArch64::SP); @@ -590,8 +647,7 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB, .addImm(AArch64SysReg::TPIDR2_EL0); // If TPIDR2_EL0 is non-zero, commit the lazy save. // NOTE: Functions that only use ZT0 don't need to zero ZA. - bool ZeroZA = - MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState(); + bool ZeroZA = AFI->getSMEFnAttrs().hasZAState(); auto CommitZASave = BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo)) .addReg(TPIDR2EL0) @@ -606,6 +662,86 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB, .addImm(1); } +Register MachineSMEABI::getAgnosticZABufferPtr() { + if (State.AgnosticZABufferPtr != AArch64::NoRegister) + return State.AgnosticZABufferPtr; + Register BufferPtr = AFI->getEarlyAllocSMESaveBuffer(); + State.AgnosticZABufferPtr = + BufferPtr != AArch64::NoRegister + ? BufferPtr + : MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + return State.AgnosticZABufferPtr; +} + +void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs, bool IsSave) { + auto *TLI = Subtarget->getTargetLowering(); + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register BufferPtr = AArch64::X0; + + PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL); + + // Copy the buffer pointer into X0. 
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr) + .addReg(getAgnosticZABufferPtr()); + + // Call __arm_sme_save/__arm_sme_restore. + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addReg(BufferPtr, RegState::Implicit) + .addExternalSymbol(TLI->getLibcallName( + IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE)) + .addRegMask(TRI->getCallPreservedMask( + *MF, + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)); + + restorePhyRegSave(RegSave, MBB, MBBI, DL); +} + +void MachineSMEABI::emitAllocateFullZASaveBuffer( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + LiveRegs PhysLiveRegs) { + // Buffer already allocated in SelectionDAG. + if (AFI->getEarlyAllocSMESaveBuffer()) + return; + + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register BufferPtr = getAgnosticZABufferPtr(); + Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + + PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL); + + // Calculate the SME state size. + { + auto *TLI = Subtarget->getTargetLowering(); + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE)) + .addReg(AArch64::X0, RegState::ImplicitDefine) + .addRegMask(TRI->getCallPreservedMask( + *MF, CallingConv:: + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize) + .addReg(AArch64::X0); + } + + // Allocate a buffer object of the size given __arm_sme_state_size. + { + MachineFrameInfo &MFI = MF->getFrameInfo(); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP) + .addReg(AArch64::SP) + .addReg(BufferSize) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr) + .addReg(AArch64::SP); + + // We have just allocated a variable sized object, tell this to PEI. 
+ MFI.CreateVariableSizedObject(Align(16), nullptr); + } + + restorePhyRegSave(RegSave, MBB, MBBI, DL); +} + void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, ZAState From, ZAState To, @@ -623,10 +759,7 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB, // TODO: Avoid setting up the save buffer if there's no transition to // LOCAL_SAVED. if (From == ZAState::CALLER_DORMANT) { - assert(MBB.getParent() - ->getInfo<AArch64FunctionInfo>() - ->getSMEFnAttrs() - .hasPrivateZAInterface() && + assert(AFI->getSMEFnAttrs().hasPrivateZAInterface() && "CALLER_DORMANT state requires private ZA interface"); assert(&MBB == &MBB.getParent()->front() && "CALLER_DORMANT state only valid in entry block"); @@ -641,12 +774,14 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB, } if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED) - emitSetupLazySave(MBB, InsertPt); + emitZASave(MBB, InsertPt, PhysLiveRegs); else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE) - emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs); + emitZARestore(MBB, InsertPt, PhysLiveRegs); else if (To == ZAState::OFF) { assert(From != ZAState::CALLER_DORMANT && "CALLER_DORMANT to OFF should have already been handled"); + assert(!AFI->getSMEFnAttrs().hasAgnosticZAInterface() && + "Should not turn ZA off in agnostic ZA function"); emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED); } else { dbgs() << "Error: Transition from " << getZAStateString(From) << " to " @@ -664,9 +799,10 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { if (!MF.getSubtarget<AArch64Subtarget>().hasSME()) return false; - auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + AFI = MF.getInfo<AArch64FunctionInfo>(); SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs(); - if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State()) + if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() && + !SMEFnAttrs.hasAgnosticZAInterface()) return 
false; assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!"); @@ -685,9 +821,19 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { insertStateChanges(); // Allocate save buffer (if needed). - if (State.TPIDR2Block) { - MachineBasicBlock &EntryBlock = MF.front(); - emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI()); + if (State.AgnosticZABufferPtr != AArch64::NoRegister || State.TPIDR2Block) { + if (State.AfterSMEProloguePt) { + // Note: With inline stack probes the AfterSMEProloguePt may not be in the + // entry block (due to the probing loop). + emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(), + *State.AfterSMEProloguePt, + State.PhysLiveRegsAfterSMEPrologue); + } else { + MachineBasicBlock &EntryBlock = MF.front(); + emitAllocateZASaveBuffer( + EntryBlock, EntryBlock.getFirstNonPHI(), + State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry); + } } return true; diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 2008516885c3..79ceb2ababc7 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -50,8 +50,7 @@ private: char SMEABI::ID = 0; static const char *name = "SME ABI Pass"; -INITIALIZE_PASS_BEGIN(SMEABI, DEBUG_TYPE, name, false, false) -INITIALIZE_PASS_END(SMEABI, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS(SMEABI, DEBUG_TYPE, name, false, false) FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index b3005d512022..40ec371fe79d 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -1108,6 +1108,10 @@ class sme_spill_fill_base<bit isStore, dag outs, dag ins, string opcodestr> : I<outs, ins, opcodestr, "\t$ZAt[$Rv, $imm4], [$Rn, $offset, mul vl]", "", []>, Sched<[]> { + // 'offset' operand is encoded in the same bits as 'imm4'. 
There is currently + // no way to tell TableGen about this. + let DecoderMethod = "DecodeSMESpillFillInstruction"; + bits<0> ZAt; bits<2> Rv; bits<5> Rn; bits<4> imm4; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a3a7d0f74e1b..f8c1fe81c678 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -315,10 +315,16 @@ def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16 def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>; def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>; -def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>; -def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>; -def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>; -def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64>", []>; +let Complexity = 1 in { +def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8, false>", []>; +def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16, false>", []>; +def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32, false>", []>; +def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64, false>", []>; + +def SVEAddSubNegImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8, true>", []>; +def SVEAddSubNegImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16, true>", []>; +def SVEAddSubNegImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32, true>", []>; +def SVEAddSubNegImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64, true>", []>; def SVEAddSubSSatNegImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i8, true>", []>; def SVEAddSubSSatNegImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i16, true>", []>; @@ -329,6 +335,7 
@@ def SVEAddSubSSatPosImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MV def SVEAddSubSSatPosImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i16, false>", []>; def SVEAddSubSSatPosImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i32, false>", []>; def SVEAddSubSSatPosImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubSSatImm<MVT::i64, false>", []>; +} // Complexity = 1 def SVECpyDupImm8Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i8>", []>; def SVECpyDupImm16Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i16>", []>; @@ -886,13 +893,17 @@ class sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op> } multiclass sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op, - SDPatternOperator op_any> { + SDPatternOperator op_any, SDPatternOperator op_first> { def NAME : sve_int_ptest<opc, asm, op>; let hasNoSchedulingInfo = 1, isCompare = 1, Defs = [NZCV] in { def _ANY : Pseudo<(outs), (ins PPRAny:$Pg, PPR8:$Pn), [(set NZCV, (op_any (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn)))]>, PseudoInstExpansion<(!cast<Instruction>(NAME) PPRAny:$Pg, PPR8:$Pn)>; + + def _FIRST : Pseudo<(outs), (ins PPRAny:$Pg, PPR8:$Pn), + [(set NZCV, (op_first (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn)))]>, + PseudoInstExpansion<(!cast<Instruction>(NAME) PPRAny:$Pg, PPR8:$Pn)>; } } @@ -5154,11 +5165,14 @@ multiclass sve_int_dup_imm<string asm> { (!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>; def : InstAlias<"fmov $Zd, #0.0", - (!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>; + (!cast<Instruction>(NAME # _H) ZPR16:$Zd, + (cpy_imm8_opt_lsl_i16 0, 0)), 1>; def : InstAlias<"fmov $Zd, #0.0", - (!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>; + (!cast<Instruction>(NAME # _S) ZPR32:$Zd, + (cpy_imm8_opt_lsl_i32 0, 0)), 1>; def : InstAlias<"fmov $Zd, #0.0", - (!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>; + (!cast<Instruction>(NAME # _D) ZPR64:$Zd, + (cpy_imm8_opt_lsl_i64 0, 0)), 1>; } class 
sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype, @@ -5218,7 +5232,8 @@ class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm, let hasSideEffects = 0; } -multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> { +multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op, + SDPatternOperator inv_op = null_frag> { def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>; def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>; @@ -5228,6 +5243,12 @@ multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> { def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>; def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>; def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>; + + // Extra patterns for add(x, splat(-ve)) -> sub(x, +ve). There is no i8 + // pattern as all i8 constants can be handled by an add. 
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, inv_op, ZPR16, i32, SVEAddSubNegImm16Pat, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, inv_op, ZPR32, i32, SVEAddSubNegImm32Pat, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, inv_op, ZPR64, i64, SVEAddSubNegImm64Pat, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_arith_imm0_ssat<bits<3> opc, string asm, SDPatternOperator op, @@ -5549,11 +5570,14 @@ multiclass sve_int_dup_imm_pred_merge<string asm, SDPatternOperator op> { nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", - (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>; + (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, + (cpy_imm8_opt_lsl_i16 0, 0)), 0>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", - (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>; + (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, + (cpy_imm8_opt_lsl_i32 0, 0)), 0>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", - (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>; + (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, + (cpy_imm8_opt_lsl_i64 0, 0)), 0>; def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv8f16 ZPR:$Zd)), (!cast<Instruction>(NAME # _H) $Zd, $Pg, 0, 0)>; @@ -5946,16 +5970,20 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm, let isWhile = 1; } -multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> { +multiclass sve2_int_while_rr<bits<1> rw, string asm, SDPatternOperator op> { def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>; def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>; def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>; def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>; - def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv4i1, 
!cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>; + def : Pat<(nxv16i1 (op i64:$Op1, i64:$Op2, (i64 1))), + (!cast<Instruction>(NAME # _B) $Op1, $Op2)>; + def : Pat<(nxv8i1 (op i64:$Op1, i64:$Op2, (i64 2))), + (!cast<Instruction>(NAME # _H) $Op1, $Op2)>; + def : Pat<(nxv4i1 (op i64:$Op1, i64:$Op2, (i64 4))), + (!cast<Instruction>(NAME # _S) $Op1, $Op2)>; + def : Pat<(nxv2i1 (op i64:$Op1, i64:$Op2, (i64 8))), + (!cast<Instruction>(NAME # _D) $Op1, $Op2)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 0059a862ba9b..0f2c33585884 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass(); ModulePass * createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr); ModulePass *createAMDGPULowerBufferFatPointersPass(); +ModulePass *createAMDGPULowerIntrinsicsLegacyPass(); FunctionPass *createSIModeRegisterPass(); FunctionPass *createGCNPreRAOptimizationsLegacyPass(); FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); @@ -153,6 +154,16 @@ private: const TargetMachine &TM; }; +void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &); + +struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> { + AMDGPULowerIntrinsicsPass(const AMDGPUTargetMachine &TM) : TM(TM) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + +private: + const AMDGPUTargetMachine &TM; +}; + void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &); extern char &AMDGPUPrepareAGPRAllocLegacyID; @@ -490,6 +501,9 @@ extern char &SIModeRegisterID; void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; +void 
initializeAMDGPULowerVGPREncodingLegacyPass(PassRegistry &); +extern char &AMDGPULowerVGPREncodingLegacyID; + void initializeSIInsertHardClausesLegacyPass(PassRegistry &); extern char &SIInsertHardClausesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8e4b6365dc06..ffbda14dcd84 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -68,13 +68,15 @@ def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", "FlatGlobalInsts", "true", - "Have global_* flat memory instructions" + "Have global_* flat memory instructions", + [FeatureFlatAddressSpace] >; def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "FlatScratchInsts", "true", - "Have scratch_* flat memory instructions" + "Have scratch_* flat memory instructions", + [FeatureFlatAddressSpace] >; def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", @@ -92,7 +94,8 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", "FlatGVSMode", "true", - "Have GVS addressing mode with flat_* instructions" + "Have GVS addressing mode with flat_* instructions", + [FeatureFlatAddressSpace] >; def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", @@ -286,12 +289,6 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", "VMEM CU scope prefetches do not fail on illegal address" >; -def FeatureCUStores : SubtargetFeature<"cu-stores", - "HasCUStores", - "true", - "Whether SCOPE_CU stores can be used on GFX12.5" ->; - def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -419,6 +416,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "Additional instructions for GFX9+" >; +def FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2", + "RequiresAlignVGPR", + "true", + "VGPR 
and AGPR tuple operands require even alignment" +>; + def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts", "GFX90AInsts", "true", @@ -928,13 +931,15 @@ def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-glo def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32", "HasAtomicFMinFMaxF32FlatInsts", "true", - "Has flat memory instructions for atomicrmw fmin/fmax for float" + "Has flat memory instructions for atomicrmw fmin/fmax for float", + [FeatureFlatAddressSpace] >; def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64", "HasAtomicFMinFMaxF64FlatInsts", "true", - "Has flat memory instructions for atomicrmw fmin/fmax for double" + "Has flat memory instructions for atomicrmw fmin/fmax for double", + [FeatureFlatAddressSpace] >; def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", @@ -986,7 +991,8 @@ def FeatureFlatAtomicFaddF32Inst : SubtargetFeature<"flat-atomic-fadd-f32-inst", "HasFlatAtomicFaddF32Inst", "true", - "Has flat_atomic_add_f32 instruction" + "Has flat_atomic_add_f32 instruction", + [FeatureFlatAddressSpace] >; def FeatureFlatBufferGlobalAtomicFaddF64Inst @@ -1204,6 +1210,12 @@ def Feature64BitLiterals : SubtargetFeature<"64-bit-literals", "Can use 64-bit literals with single DWORD instructions" >; +def Feature1024AddressableVGPRs : SubtargetFeature<"1024-addressable-vgprs", + "Has1024AddressableVGPRs", + "true", + "Has 1024 addressable VGPRs" +>; + def FeatureWaitXcnt : SubtargetFeature<"wait-xcnt", "HasWaitXcnt", "true", @@ -1721,6 +1733,7 @@ def FeatureISAVersion9_0_9 : FeatureSet< def FeatureISAVersion9_0_A : FeatureSet< !listconcat(FeatureISAVersion9_0_MI_Common.Features, [FeatureGFX90AInsts, + FeatureRequiresAlignedVGPRs, FeatureFmacF64Inst, FeatureDPALU_DPP, FeaturePackedFP32Ops, @@ -1743,6 +1756,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< [FeatureGFX9, FeatureGFX90AInsts, FeatureGFX940Insts, + FeatureRequiresAlignedVGPRs, 
FeatureFmaMixInsts, FeatureLDSBankCount32, FeatureDLInsts, @@ -1894,6 +1908,7 @@ def FeatureISAVersion10_3_Generic: FeatureSet< def FeatureISAVersion11_Common : FeatureSet< [FeatureGFX11, + FeatureBackOffBarrier, FeatureLDSBankCount32, FeatureDLInsts, FeatureDot5Insts, @@ -1977,6 +1992,7 @@ def FeatureISAVersion11_5_3 : FeatureSet< def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, + FeatureBackOffBarrier, FeatureAddressableLocalMemorySize65536, FeatureLDSBankCount32, FeatureDLInsts, @@ -2019,9 +2035,10 @@ def FeatureISAVersion12 : FeatureSet< def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, - FeatureCUStores, + FeatureRequiresAlignedVGPRs, FeatureAddressableLocalMemorySize327680, FeatureCuMode, + Feature1024AddressableVGPRs, Feature64BitLiterals, FeatureLDSBankCount32, FeatureDLInsts, @@ -2830,6 +2847,9 @@ def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">, def Has64BitLiterals : Predicate<"Subtarget->has64BitLiterals()">, AssemblerPredicate<(all_of Feature64BitLiterals)>; +def Has1024AddressableVGPRs : Predicate<"Subtarget->has1024AddressableVGPRs()">, + AssemblerPredicate<(all_of Feature1024AddressableVGPRs)>; + def HasWaitXcnt : Predicate<"Subtarget->hasWaitXcnt()">, AssemblerPredicate<(all_of FeatureWaitXcnt)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 36c0d1cbcea2..29f8f9bc8b54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -557,7 +557,6 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -587,13 +586,10 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= 
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (ST.isWave32()) { + if (MF.getSubtarget<GCNSubtarget>().isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } - if (isGFX1250(ST) && ST.hasCUStores()) { - KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; - } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // un-evaluatable at this point so it cannot be conditionally checked here. @@ -638,7 +634,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, (void)PGRM_Rsrc3; (void)EvaluatableRsrc3; assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 || - STM.hasGFX90AInsts() || !EvaluatableRsrc3 || + STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 || static_cast<uint64_t>(PGRM_Rsrc3) == 0); KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3; @@ -845,7 +841,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { [[maybe_unused]] int64_t PGMRSrc3; assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 || - STM.hasGFX90AInsts() || + STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) && static_cast<uint64_t>(PGMRSrc3) == 0)); if (STM.hasGFX90AInsts()) { @@ -1143,9 +1139,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx); return SubGPR; }; - - ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU, - IsaInfo::getSGPREncodingGranule(&STM)); + // GFX10+ will always allocate 128 SGPRs and this field must be 0 + if (STM.getGeneration() >= AMDGPUSubtarget::GFX10) { + ProgInfo.SGPRBlocks = CreateExpr(0ul); + } else { + ProgInfo.SGPRBlocks = GetNumGPRBlocks( + ProgInfo.NumSGPRsForWavesPerEU, IsaInfo::getSGPREncodingGranule(&STM)); + } ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU, IsaInfo::getVGPREncodingGranule(&STM)); @@ 
-1440,9 +1440,10 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, MD->setComputeRegisters(".dynamic_vgpr_en", true); } - MD->setHwStage(CC, ".lds_size", - (unsigned)(CurrentProgramInfo.LdsSize * - getLdsDwGranularity(ST) * sizeof(uint32_t))); + MD->updateHwStageMaximum( + CC, ".lds_size", + (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) * + sizeof(uint32_t))); } // This is the equivalent of EmitProgramInfoSI above, but for when the OS type diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 59cc1df292f4..f646457f9d76 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1296,74 +1296,6 @@ struct AAAMDGPUNoAGPR const char AAAMDGPUNoAGPR::ID = 0; -/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute -/// based on the finalized 'amdgpu-flat-work-group-size' attribute. -/// Both attributes start with narrow ranges that expand during iteration. -/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range, -/// preventing optimal updates later. Therefore, waves-per-eu can't be updated -/// with intermediate values during the attributor run. We defer the -/// finalization of waves-per-eu until after the flat-workgroup-size is -/// finalized. -/// TODO: Remove this and move similar logic back into the attributor run once -/// we have a better representation for waves-per-eu. 
-static bool updateWavesPerEU(Module &M, TargetMachine &TM) { - bool Changed = false; - - LLVMContext &Ctx = M.getContext(); - - for (Function &F : M) { - if (F.isDeclaration()) - continue; - - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - - std::optional<std::pair<unsigned, std::optional<unsigned>>> - FlatWgrpSizeAttr = - AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size"); - - unsigned MinWavesPerEU = ST.getMinWavesPerEU(); - unsigned MaxWavesPerEU = ST.getMaxWavesPerEU(); - - unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize(); - unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize(); - if (FlatWgrpSizeAttr.has_value()) { - MinFlatWgrpSize = FlatWgrpSizeAttr->first; - MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second); - } - - // Start with the "best" range. - unsigned Min = MinWavesPerEU; - unsigned Max = MinWavesPerEU; - - // Compute the range from flat workgroup size. `getWavesPerEU` will also - // account for the 'amdgpu-waves-er-eu' attribute. - auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] = - ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize}); - - // For the lower bound, we have to "tighten" it. - Min = std::max(Min, MinFromFlatWgrpSize); - // For the upper bound, we have to "extend" it. - Max = std::max(Max, MaxFromFlatWgrpSize); - - // Clamp the range to the max range. - Min = std::max(Min, MinWavesPerEU); - Max = std::min(Max, MaxWavesPerEU); - - // Update the attribute if it is not the max. 
- if (Min != MinWavesPerEU || Max != MaxWavesPerEU) { - SmallString<10> Buffer; - raw_svector_ostream OS(Buffer); - OS << Min << ',' << Max; - Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu"); - Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str()); - F.addFnAttr(NewAttr); - Changed |= OldAttr == NewAttr; - } - } - - return Changed; -} - static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, AMDGPUAttributorOptions Options, ThinOrFullLTOPhase LTOPhase) { @@ -1438,11 +1370,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, } } - bool Changed = A.run() == ChangeStatus::CHANGED; - - Changed |= updateWavesPerEU(M, TM); - - return Changed; + return A.run() == ChangeStatus::CHANGED; } } // namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index d1a5b4e85da4..21255f691e4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -1004,8 +1004,14 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64; } - return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX : - AMDGPU::SI_TCRETURN; + if (CallerF.getFunction().getCallingConv() == + CallingConv::AMDGPU_Gfx_WholeWave) + return AMDGPU::SI_TCRETURN_GFX_WholeWave; + + if (CC == CallingConv::AMDGPU_Gfx || CC == CallingConv::AMDGPU_Gfx_WholeWave) + return AMDGPU::SI_TCRETURN_GFX; + + return AMDGPU::SI_TCRETURN; } // Add operands to call instruction to track the callee. 
@@ -1284,6 +1290,13 @@ bool AMDGPUCallLowering::lowerTailCall( unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), /*IsTailCall*/ true, ST.isWave32(), CalleeCC, IsDynamicVGPRChainCall); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + + if (FuncInfo->isWholeWaveFunction()) + addOriginalExecToReturn(MF, MIB); + + // Keep track of the index of the next operand to be added to the call + unsigned CalleeIdx = MIB->getNumOperands(); + if (!addCallTargetOperands(MIB, MIRBuilder, Info, IsDynamicVGPRChainCall)) return false; @@ -1401,7 +1414,7 @@ bool AMDGPUCallLowering::lowerTailCall( // If we have -tailcallopt, we need to adjust the stack. We'll do the call // sequence start and end here. if (!IsSibCall) { - MIB->getOperand(1).setImm(FPDiff); + MIB->getOperand(CalleeIdx + 1).setImm(FPDiff); CallSeqStart.addImm(NumBytes).addImm(0); // End the call sequence *before* emitting the call. Normally, we would // tidy the frame up after the call. However, here, we've laid out the @@ -1413,16 +1426,24 @@ bool AMDGPUCallLowering::lowerTailCall( // Now we can add the actual call instruction to the correct basic block. MIRBuilder.insertInstr(MIB); + // If this is a whole wave tail call, we need to constrain the register for + // the original EXEC. + if (MIB->getOpcode() == AMDGPU::SI_TCRETURN_GFX_WholeWave) { + MIB->getOperand(0).setReg( + constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), + *MIB, MIB->getDesc(), MIB->getOperand(0), 0)); + } + // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. // FIXME: We should define regbankselectable call instructions to handle // divergent call targets. 
- if (MIB->getOperand(0).isReg()) { - MIB->getOperand(0).setReg( - constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), - *MIB, MIB->getDesc(), MIB->getOperand(0), 0)); + if (MIB->getOperand(CalleeIdx).isReg()) { + MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), *MIB, MIB->getDesc(), + MIB->getOperand(CalleeIdx), CalleeIdx)); } MF.getFrameInfo().setHasTailCall(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 394a143dd308..0c112d1787c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -128,12 +128,18 @@ def gi_global_saddr : def gi_global_saddr_cpol : GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">, GIComplexPatternEquiv<GlobalSAddrCPol>; +def gi_global_saddr_cpol_m0 : + GIComplexOperandMatcher<s64, "selectGlobalSAddrCPolM0">, + GIComplexPatternEquiv<GlobalSAddrCPolM0>; def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; def gi_global_saddr_no_ioffset : GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">, GIComplexPatternEquiv<GlobalSAddrNoIOffset>; +def gi_global_saddr_no_ioffset_m0 : + GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffsetM0">, + GIComplexPatternEquiv<GlobalSAddrNoIOffsetM0>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b80e43b27129..3785d0f7f268 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2089,6 +2089,23 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if 
(!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) + return false; + + // We are assuming CPol is second from last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, @@ -2120,6 +2137,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is second from last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? 
AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 16388e750026..4fa0d3f72e1c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -171,11 +171,16 @@ private: bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &CPol) const; + bool SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index c048371b11d7..5c9b616e9bc2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -367,6 +367,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand); setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand); + + setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand); + 
setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand); + setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand); + + setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand); + setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand); + setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand); + setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand); setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); @@ -411,7 +423,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64}, Expand); - setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); + setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand); if (Subtarget->has16BitInsts()) { setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); @@ -1427,8 +1439,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); - case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); - case ISD::FREM: return LowerFREM(Op, DAG); + case ISD::SDIVREM: + return LowerSDIVREM(Op, DAG); case ISD::FCEIL: return LowerFCEIL(Op, DAG); case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); @@ -2423,21 +2435,6 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, return DAG.getMergeValues(Res, DL); } -// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x) -SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - EVT VT = Op.getValueType(); - auto Flags = Op->getFlags(); - SDValue X = Op.getOperand(0); - SDValue Y = Op.getOperand(1); - - SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags); - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags); - SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags); - 
// TODO: For f32 use FMAD instead if !hasFastFMA32? - return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags); -} - SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -2650,10 +2647,7 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) { bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags) { - if (Flags.hasApproximateFuncs()) - return true; - auto &Options = DAG.getTarget().Options; - return Options.ApproxFuncFPMath; + return Flags.hasApproximateFuncs(); } bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, @@ -2775,8 +2769,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, assert(IsLog10 || Op.getOpcode() == ISD::FLOG); const auto &Options = getTargetMachine().Options; - if (VT == MVT::f16 || Flags.hasApproximateFuncs() || - Options.ApproxFuncFPMath) { + if (VT == MVT::f16 || Flags.hasApproximateFuncs()) { if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { // Log and multiply in f32 is good enough for f16. 
@@ -5674,6 +5667,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CALL) NODE_NAME_CASE(TC_RETURN) NODE_NAME_CASE(TC_RETURN_GFX) + NODE_NAME_CASE(TC_RETURN_GFX_WholeWave) NODE_NAME_CASE(TC_RETURN_CHAIN) NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR) NODE_NAME_CASE(TRAP) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 78394ac9cd2d..bdaf48652d10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -418,6 +418,7 @@ enum NodeType : unsigned { CALL, TC_RETURN, TC_RETURN_GFX, + TC_RETURN_GFX_WholeWave, TC_RETURN_CHAIN, TC_RETURN_CHAIN_DVGPR, TRAP, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index e305f08925cc..b8fa6f3fc686 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -94,6 +94,10 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; +def AMDGPUtc_return_gfx_ww: SDNode<"AMDGPUISD::TC_RETURN_GFX_WholeWave", AMDGPUTCReturnTP, +[SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN", SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5d31eed8fe7d..12915c734442 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1989,39 +1989,6 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const { return selectImpl(MI, *CoverageInfo); } -bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { - Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); - if (TM.getOptLevel() > CodeGenOptLevel::None) { - unsigned 
WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; - if (WGSize <= STI.getWavefrontSize()) { - // If the workgroup fits in a wave, remove s_barrier_signal and lower - // s_barrier/s_barrier_wait to wave_barrier. - if (IntrinsicID == Intrinsic::amdgcn_s_barrier || - IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) { - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); - } - MI.eraseFromParent(); - return true; - } - } - - if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { - // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) - .addImm(AMDGPU::Barrier::WORKGROUP); - BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT)) - .addImm(AMDGPU::Barrier::WORKGROUP); - MI.eraseFromParent(); - return true; - } - - return selectImpl(MI, *CoverageInfo); -} - static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail) { if (TexFailCtrl) @@ -2338,10 +2305,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectDSAppendConsume(I, false); case Intrinsic::amdgcn_init_whole_wave: return selectInitWholeWave(I); - case Intrinsic::amdgcn_s_barrier: - case Intrinsic::amdgcn_s_barrier_signal: - case Intrinsic::amdgcn_s_barrier_wait: - return selectSBarrier(I); case Intrinsic::amdgcn_raw_buffer_load_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: @@ -5746,6 +5709,16 @@ AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is second from last operand of the intrinsic. 
+ auto PassedCPol = + I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); } @@ -5762,6 +5735,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset( } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0( + MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is second from last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol, false); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); Register PtrBase; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 092439693f39..c760fe7ef99d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -124,7 +124,6 @@ private: bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; bool selectInitWholeWave(MachineInstr &MI) const; - bool selectSBarrier(MachineInstr &MI) const; bool selectDSBvhStackIntrinsic(MachineInstr &MI) const; bool selectImageIntrinsic(MachineInstr &MI, @@ -257,9 +256,13 @@ private: InstructionSelector::ComplexRendererFns selectGlobalSAddrCPol(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectGlobalSAddrCPolM0(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectGlobalSAddrNoIOffset(MachineOperand &Root) const; + 
InstructionSelector::ComplexRendererFns + selectGlobalSAddrNoIOffsetM0(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index efcd87e46620..bd443b5b6f1e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -509,6 +509,10 @@ def atomic_load_nonext_64_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_64 let IsAtomic = 1; } +def atomic_load_nonext_128_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_128 node:$ptr)> { + let IsAtomic = 1; +} + def atomic_load_zext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_8 node:$ptr)> { let IsAtomic = 1; } @@ -573,6 +577,8 @@ def atomic_store_32_#as : PatFrag<(ops node:$val, node:$ptr), (atomic_store_32 node:$val, node:$ptr)>; def atomic_store_64_#as : PatFrag<(ops node:$val, node:$ptr), (atomic_store_64 node:$val, node:$ptr)>; +def atomic_store_128_#as : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_128 node:$val, node:$ptr)>; } // End let IsAtomic = 1, AddressSpaces = ... } // End foreach as diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 600a13096f55..f18536cd4ab9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2082,13 +2082,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); - // TODO: Only Try to form v2s16 with legal packed instructions. 
- getActionDefinitionsBuilder(G_FSHR) - .legalFor({{S32, S32}}) - .lowerFor({{V2S16, V2S16}}) - .clampMaxNumElementsStrict(0, S16, 2) - .scalarize(0) - .lower(); + auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR); + FSHRActionDefs.legalFor({{S32, S32}}) + .clampMaxNumElementsStrict(0, S16, 2); + if (ST.hasVOP3PInsts()) + FSHRActionDefs.lowerFor({{V2S16, V2S16}}); + FSHRActionDefs.scalarize(0).lower(); if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder(G_FSHL) @@ -3414,10 +3413,7 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, } static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { - if (Flags & MachineInstr::FmAfn) - return true; - const auto &Options = MF.getTarget().Options; - return Options.ApproxFuncFPMath; + return Flags & MachineInstr::FmAfn; } static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, @@ -3522,8 +3518,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, const AMDGPUTargetMachine &TM = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); - if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || - TM.Options.ApproxFuncFPMath) { + if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) { if (Ty == F16 && !ST.has16BitInsts()) { Register LogVal = MRI.createGenericVirtualRegister(F32); auto PromoteSrc = B.buildFPExt(F32, X); @@ -7823,6 +7818,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.eraseFromParent(); return true; } + case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: + case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: + assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!"); + B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin()); + MI.eraseFromParent(); + return true; + case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: + case 
Intrinsic::amdgcn_cooperative_atomic_store_8x16B: + assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!"); + B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin()); + MI.eraseFromParent(); + return true; default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp new file mode 100644 index 000000000000..a30d9cb0412a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -0,0 +1,161 @@ +//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower intrinsics that would otherwise require separate handling in both +// SelectionDAG and GlobalISel. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/InitializePasses.h" + +#define DEBUG_TYPE "amdgpu-lower-intrinsics" + +using namespace llvm; + +namespace { + +class AMDGPULowerIntrinsicsImpl { +public: + Module &M; + const AMDGPUTargetMachine &TM; + + AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM) + : M(M), TM(TM) {} + + bool run(); + +private: + bool visitBarrier(IntrinsicInst &I); +}; + +class AMDGPULowerIntrinsicsLegacy : public ModulePass { +public: + static char ID; + + AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + } +}; + +template <class T> static void forEachCall(Function &Intrin, T Callback) { + for (User *U : make_early_inc_range(Intrin.users())) { + if (auto *CI = dyn_cast<IntrinsicInst>(U)) + Callback(CI); + } +} + +} // anonymous namespace + +bool AMDGPULowerIntrinsicsImpl::run() { + bool Changed = false; + + for (Function &F : M) { + switch (F.getIntrinsicID()) { + default: + continue; + case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_s_barrier_signal: + case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_wait: + forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); }); + break; + } + } + + return Changed; +} + +// Optimize barriers and lower s_barrier to a sequence of split barrier +// intrinsics. 
+bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) { + assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait); + + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction()); + bool IsSingleWaveWG = false; + + if (TM.getOptLevel() > CodeGenOptLevel::None) { + unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second; + IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize(); + } + + IRBuilder<> B(&I); + + if (IsSingleWaveWG) { + // Down-grade waits, remove split signals. + if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) { + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {}); + } else if (I.getIntrinsicID() == + Intrinsic::amdgcn_s_barrier_signal_isfirst) { + // If we're the only wave of the workgroup, we're always first. + I.replaceAllUsesWith(B.getInt1(true)); + } + I.eraseFromParent(); + return true; + } + + if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier && + ST.hasSplitBarriers()) { + // Lower to split barriers. 
+ Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP); + Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP); + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal, + {BarrierID_32}); + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait, + {BarrierID_16}); + I.eraseFromParent(); + return true; + } + + return false; +} + +PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M, + ModuleAnalysisManager &MAM) { + AMDGPULowerIntrinsicsImpl Impl(M, TM); + if (!Impl.run()) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} + +bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) { + auto &TPC = getAnalysis<TargetPassConfig>(); + const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>(); + + AMDGPULowerIntrinsicsImpl Impl(M, TM); + return Impl.run(); +} + +#define PASS_DESC "AMDGPU lower intrinsics" +INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false, + false) + +char AMDGPULowerIntrinsicsLegacy::ID = 0; + +ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() { + return new AMDGPULowerIntrinsicsLegacy; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp new file mode 100644 index 000000000000..1e6589eb42c1 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -0,0 +1,373 @@ +//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Lower VGPRs above first 256 on gfx1250. 
+/// +/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch +/// VGPR addressing mode. The mode change is effective until the next change. +/// This instruction provides high bits of a VGPR address for four of the +/// operands: vdst, src0, src1, and src2, or other 4 operands depending on the +/// instruction encoding. If bits are set they are added as MSB to the +/// corresponding operand VGPR number. +/// +/// There is no need to replace actual register operands because encoding of the +/// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does +/// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high +/// VGPRs will survive until actual encoding and will result in the same actual +/// bit encoding. +/// +/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset +/// to a VGPR address of the subsequent instructions. The InstPrinter will take +/// care of printing a low VGPR instead of a high one. In principle it +/// would be viable to print actual high VGPR numbers, but that would disagree +/// with a disasm printing and create a situation where asm text is not +/// deterministic. +/// +/// This pass creates a convention where non-fall through basic blocks shall +/// start with all 4 MSBs zero. Otherwise a disassembly would not be readable. +/// An optimization here is possible but deemed not desirable because of the +/// readability concerns. +/// +/// Consequently the ABI is set to expect all 4 MSBs to be zero on entry. +/// The pass must run very late in the pipeline to make sure no changes to VGPR +/// operands will be made after it. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPULowerVGPREncoding.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/PackedVector.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding" + +namespace { + +class AMDGPULowerVGPREncoding { + static constexpr unsigned OpNum = 4; + static constexpr unsigned BitsPerField = 2; + static constexpr unsigned NumFields = 4; + static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; + using ModeType = PackedVector<unsigned, BitsPerField, + std::bitset<BitsPerField * NumFields>>; + + class ModeTy : public ModeType { + public: + // bitset constructor will set all bits to zero + ModeTy() : ModeType(0) {} + + operator int64_t() const { return raw_bits().to_ulong(); } + + static ModeTy fullMask() { + ModeTy M; + M.raw_bits().flip(); + return M; + } + }; + +public: + bool run(MachineFunction &MF); + +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + + /// Most recent s_set_* instruction. + MachineInstr *MostRecentModeSet; + + /// Whether the current mode is known. + bool CurrentModeKnown; + + /// Current mode bits. + ModeTy CurrentMode; + + /// Current mask of mode bits that instructions since MostRecentModeSet care + /// about. + ModeTy CurrentMask; + + /// Number of current hard clause instructions. + unsigned ClauseLen; + + /// Number of hard clause instructions remaining. + unsigned ClauseRemaining; + + /// Clause group breaks. + unsigned ClauseBreaks; + + /// Last hard clause instruction. + MachineInstr *Clause; + + /// Insert mode change before \p I. \returns true if mode was changed. + bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + + /// Reset mode to default. + void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + + /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. 
+ std::optional<unsigned> getMSBs(const MachineOperand &MO) const; + + /// Handle single \p MI. \return true if changed. + bool runOnMachineInstr(MachineInstr &MI); + + /// Compute the mode and mode mask for a single \p MI given \p Ops operands + /// bit mapping. Optionally takes second array \p Ops2 for VOPD. + /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2 + /// is checked. + void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI, + const AMDGPU::OpName Ops[OpNum], + const AMDGPU::OpName *Ops2 = nullptr); + + /// Check if an instruction \p I is within a clause and returns a suitable + /// iterator to insert mode change. It may also modify the S_CLAUSE + /// instruction to extend it or drop the clause if it cannot be adjusted. + MachineInstr *handleClause(MachineInstr *I); +}; + +bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, + MachineInstr *I) { + assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); + + if (CurrentModeKnown) { + auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + + if ((Delta & Mask.raw_bits()).none()) { + CurrentMask |= Mask; + return false; + } + + if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { + CurrentMode |= NewMode; + CurrentMask |= Mask; + + MostRecentModeSet->getOperand(0).setImm(CurrentMode); + return true; + } + } + + I = handleClause(I); + MostRecentModeSet = + BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode); + + CurrentMode = NewMode; + CurrentMask = Mask; + CurrentModeKnown = true; + return true; +} + +std::optional<unsigned> +AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const { + if (!MO.isReg()) + return std::nullopt; + + MCRegister Reg = MO.getReg(); + const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); + if (!RC || !TRI->isVGPRClass(RC)) + return std::nullopt; + + unsigned Idx = TRI->getHWRegIndex(Reg); + return Idx >> 8; +} + +void AMDGPULowerVGPREncoding::computeMode(ModeTy 
&NewMode, ModeTy &Mask, + MachineInstr &MI, + const AMDGPU::OpName Ops[OpNum], + const AMDGPU::OpName *Ops2) { + NewMode = {}; + Mask = {}; + + for (unsigned I = 0; I < OpNum; ++I) { + MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]); + + std::optional<unsigned> MSBits; + if (Op) + MSBits = getMSBs(*Op); + +#if !defined(NDEBUG) + if (MSBits.has_value() && Ops2) { + auto Op2 = TII->getNamedOperand(MI, Ops2[I]); + if (Op2) { + std::optional<unsigned> MSBits2; + MSBits2 = getMSBs(*Op2); + if (MSBits2.has_value() && MSBits != MSBits2) + llvm_unreachable("Invalid VOPD pair was created"); + } + } +#endif + + if (!MSBits.has_value() && Ops2) { + Op = TII->getNamedOperand(MI, Ops2[I]); + if (Op) + MSBits = getMSBs(*Op); + } + + if (!MSBits.has_value()) + continue; + + // Skip tied uses of src2 of VOP2, these will be handled along with defs and + // only vdst bit affects these operands. We cannot skip tied uses of VOP3, + // these uses are real even if must match the vdst. + if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() && + (SIInstrInfo::isVOP2(MI) || + (SIInstrInfo::isVOP3(MI) && + TII->hasVALU32BitEncoding(MI.getOpcode())))) + continue; + + NewMode[I] = MSBits.value(); + Mask[I] = FieldMask; + } +} + +bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { + auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc()); + if (Ops.first) { + ModeTy NewMode, Mask; + computeMode(NewMode, Mask, MI, Ops.first, Ops.second); + return setMode(NewMode, Mask, &MI); + } + assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); + + return false; +} + +MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { + if (!ClauseRemaining) + return I; + + // A clause cannot start with a special instruction, place it right before + // the clause. 
+ if (ClauseRemaining == ClauseLen) { + I = Clause->getPrevNode(); + assert(I->isBundle()); + return I; + } + + // If a clause defines breaks each group cannot start with a mode change. + // just drop the clause. + if (ClauseBreaks) { + Clause->eraseFromBundle(); + ClauseRemaining = 0; + return I; + } + + // Otherwise adjust a number of instructions in the clause if it fits. + // If it does not clause will just become shorter. Since the length + // recorded in the clause is one less, increment the length after the + // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63. + if (ClauseLen < 63) + Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8)); + + ++ClauseLen; + + return I; +} + +bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.has1024AddressableVGPRs()) + return false; + + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + + bool Changed = false; + ClauseLen = ClauseRemaining = 0; + CurrentMode.reset(); + CurrentMask.reset(); + CurrentModeKnown = true; + for (auto &MBB : MF) { + MostRecentModeSet = nullptr; + + for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { + if (MI.isMetaInstruction()) + continue; + + if (MI.isTerminator() || MI.isCall()) { + if (MI.getOpcode() == AMDGPU::S_ENDPGM || + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + CurrentMode.reset(); + CurrentModeKnown = true; + } else + resetMode(&MI); + continue; + } + + if (MI.isInlineAsm()) { + if (TII->hasVGPRUses(MI)) + resetMode(&MI); + continue; + } + + if (MI.getOpcode() == AMDGPU::S_CLAUSE) { + assert(!ClauseRemaining && "Nested clauses are not supported"); + ClauseLen = MI.getOperand(0).getImm(); + ClauseBreaks = (ClauseLen >> 8) & 15; + ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1; + Clause = &MI; + continue; + } + + Changed |= runOnMachineInstr(MI); + + if (ClauseRemaining) + --ClauseRemaining; + } + + // If we're falling through to a block that has at least one other + // 
predecessor, we no longer know the mode. + MachineBasicBlock *Next = MBB.getNextNode(); + if (Next && Next->pred_size() >= 2 && + llvm::is_contained(Next->predecessors(), &MBB)) { + if (CurrentMode.raw_bits().any()) + CurrentModeKnown = false; + } + } + + return Changed; +} + +class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + return AMDGPULowerVGPREncoding().run(MF); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // namespace + +char AMDGPULowerVGPREncodingLegacy::ID = 0; + +char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID; + +INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE, + "AMDGPU Lower VGPR Encoding", false, false) + +PreservedAnalyses +AMDGPULowerVGPREncodingPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (!AMDGPULowerVGPREncoding().run(MF)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h new file mode 100644 index 000000000000..c8c2051c9fdd --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h @@ -0,0 +1,25 @@ +//===--- AMDGPULowerVGPREncoding.h ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class AMDGPULowerVGPREncodingPass + : public PassInfoMixin<AMDGPULowerVGPREncodingPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index c84a0f6e3138..6acbf52b97de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -373,6 +373,13 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { MF->getInfo<SIMachineFunctionInfo>(), *OutStreamer); + if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { + unsigned V = MI->getOperand(0).getImm(); + OutStreamer->AddComment( + " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index eda479064d7b..d09b7cffe9f2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -23,6 +23,7 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) AgentSSID = CTX.getOrInsertSyncScopeID("agent"); WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup"); WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront"); + ClusterSSID = CTX.getOrInsertSyncScopeID("cluster"); SystemOneAddressSpaceSSID = 
CTX.getOrInsertSyncScopeID("one-as"); AgentOneAddressSpaceSSID = @@ -33,4 +34,5 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) CTX.getOrInsertSyncScopeID("wavefront-one-as"); SingleThreadOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("singlethread-one-as"); + ClusterOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("cluster-one-as"); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index 5c2ecaa65714..bf852bb38376 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -32,6 +32,8 @@ private: SyncScope::ID WorkgroupSSID; /// Wavefront synchronization scope ID (cross address space). SyncScope::ID WavefrontSSID; + /// Cluster synchronization scope ID (cross address space). + SyncScope::ID ClusterSSID; /// System synchronization scope ID (single address space). SyncScope::ID SystemOneAddressSpaceSSID; /// Agent synchronization scope ID (single address space). @@ -42,6 +44,8 @@ private: SyncScope::ID WavefrontOneAddressSpaceSSID; /// Single thread synchronization scope ID (single address space). SyncScope::ID SingleThreadOneAddressSpaceSSID; + /// Cluster synchronization scope ID (single address space). 
+ SyncScope::ID ClusterOneAddressSpaceSSID; /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization @@ -60,12 +64,15 @@ private: else if (SSID == getWorkgroupSSID() || SSID == getWorkgroupOneAddressSpaceSSID()) return 2; + else if (SSID == getClusterSSID() || + SSID == getClusterOneAddressSpaceSSID()) + return 3; else if (SSID == getAgentSSID() || SSID == getAgentOneAddressSpaceSSID()) - return 3; + return 4; else if (SSID == SyncScope::System || SSID == getSystemOneAddressSpaceSSID()) - return 4; + return 5; return std::nullopt; } @@ -73,11 +80,12 @@ private: /// \returns True if \p SSID is restricted to single address space, false /// otherwise bool isOneAddressSpace(SyncScope::ID SSID) const { - return SSID == getSingleThreadOneAddressSpaceSSID() || - SSID == getWavefrontOneAddressSpaceSSID() || - SSID == getWorkgroupOneAddressSpaceSSID() || - SSID == getAgentOneAddressSpaceSSID() || - SSID == getSystemOneAddressSpaceSSID(); + return SSID == getClusterOneAddressSpaceSSID() || + SSID == getSingleThreadOneAddressSpaceSSID() || + SSID == getWavefrontOneAddressSpaceSSID() || + SSID == getWorkgroupOneAddressSpaceSSID() || + SSID == getAgentOneAddressSpaceSSID() || + SSID == getSystemOneAddressSpaceSSID(); } public: @@ -95,6 +103,8 @@ public: SyncScope::ID getWavefrontSSID() const { return WavefrontSSID; } + /// \returns Cluster synchronization scope ID (cross address space). + SyncScope::ID getClusterSSID() const { return ClusterSSID; } /// \returns System synchronization scope ID (single address space). SyncScope::ID getSystemOneAddressSpaceSSID() const { return SystemOneAddressSpaceSSID; @@ -115,6 +125,10 @@ public: SyncScope::ID getSingleThreadOneAddressSpaceSSID() const { return SingleThreadOneAddressSpaceSSID; } + /// \returns Cluster synchronization scope ID (single address space). 
+ SyncScope::ID getClusterOneAddressSpaceSSID() const { + return ClusterOneAddressSpaceSSID; + } /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 6ddfa386e8ac..9449e7093091 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -20,6 +20,7 @@ MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass()) MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass()) MODULE_PASS("amdgpu-lower-buffer-fat-pointers", AMDGPULowerBufferFatPointersPass(*this)) +MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this)) MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass()) MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this)) MODULE_PASS("amdgpu-perf-hint", @@ -105,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(* #endif MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass()) MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) +MACHINE_FUNCTION_PASS("amdgpu-lower-vgpr-encoding", AMDGPULowerVGPREncodingPass()) MACHINE_FUNCTION_PASS("amdgpu-mark-last-scratch-load", AMDGPUMarkLastScratchLoadPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass()) MACHINE_FUNCTION_PASS("amdgpu-reserve-wwm-regs", AMDGPUReserveWWMRegsPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index f226c7f381aa..7dbe1235a98b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs( "amdgpu-promote-alloca-to-vector-max-regs", cl::desc( "Maximum vector size (in 32b 
registers) to use when promoting alloca"), - cl::init(16)); + cl::init(32)); // Use up to 1/4 of available register budget for vectorization. // FIXME: Increase the limit for whole function budgets? Perhaps x2? @@ -287,8 +287,12 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { // Load per function limits, overriding with global options where appropriate. + // R600 register tuples/aliasing are fragile with large vector promotions so + // apply architecture specific limit here. + const int R600MaxVectorRegs = 16; MaxVectorRegs = F.getFnAttributeAsParsedInteger( - "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs); + "amdgpu-promote-alloca-to-vector-max-regs", + IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs); if (PromoteAllocaToVectorMaxRegs.getNumOccurrences()) MaxVectorRegs = PromoteAllocaToVectorMaxRegs; VGPRBudgetRatio = F.getFnAttributeAsParsedInteger( @@ -439,9 +443,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, return nullptr; APInt IndexQuot; - uint64_t Rem; - APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem); - if (Rem != 0) + APInt Rem; + APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize), + IndexQuot, Rem); + if (!Rem.isZero()) return nullptr; if (VarOffsets.size() == 0) return ConstantInt::get(GEP->getContext(), IndexQuot); @@ -450,8 +455,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; - APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); - if (Rem != 0 || OffsetQuot.isZero()) + APInt::sdivrem(VarOffset.second, + APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot, + Rem); + if (!Rem.isZero() || OffsetQuot.isZero()) return nullptr; Value *Offset = VarOffset.first; @@ -461,7 +468,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, if 
(!OffsetQuot.isOne()) { ConstantInt *ConstMul = - ConstantInt::get(OffsetType, OffsetQuot.getZExtValue()); + ConstantInt::get(OffsetType, OffsetQuot.getSExtValue()); Offset = Builder.CreateMul(Offset, ConstMul); if (Instruction *NewInst = dyn_cast<Instruction>(Offset)) NewInsts.push_back(NewInst); @@ -470,8 +477,8 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, return Offset; ConstantInt *ConstIndex = - ConstantInt::get(OffsetType, IndexQuot.getZExtValue()); - Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset); + ConstantInt::get(OffsetType, IndexQuot.getSExtValue()); + Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex); if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd)) NewInsts.push_back(NewInst); return IndexAdd; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 237929699dd9..36b27bef350e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3322,6 +3322,14 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 6); // soffset return; } + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(B, MI, 5); + return; + } case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { applyDefaultMapping(OpdMapper); @@ -3338,6 +3346,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 8); // M0 return; + case Intrinsic::amdgcn_cluster_load_b32: + case Intrinsic::amdgcn_cluster_load_b64: + case Intrinsic::amdgcn_cluster_load_b128: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(B, MI, 4); // M0 + return; + } case 
Intrinsic::amdgcn_s_sleep_var: assert(OpdMapper.getVRegs(1).empty()); constrainOpWithReadfirstlane(B, MI, 1); @@ -5466,6 +5481,27 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_cluster_load_b32: + case Intrinsic::amdgcn_cluster_load_b64: + case Intrinsic::amdgcn_cluster_load_b128: { + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned M0Bank = + getRegBankID(MI.getOperand(4).getReg(), MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32); + break; + } + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned M0Bank = + getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32); + break; + } case Intrinsic::amdgcn_global_store_async_from_lds_b8: case Intrinsic::amdgcn_global_store_async_from_lds_b32: case Intrinsic::amdgcn_global_store_async_from_lds_b64: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 8b1d4ba68a44..21cf9cc6878f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -14,6 +14,10 @@ /// MFMA opcode. /// /// TODO: +/// - Handle rewrites of phis. This must be more careful than normal about the +/// reassignment. We do not want to introduce an AGPR-to-AGPR copy inside of a +/// loop, so it depends on the exact assignment of the copy. 
+/// /// - Update LiveIntervals incrementally instead of recomputing from scratch /// //===----------------------------------------------------------------------===// @@ -22,6 +26,7 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -34,6 +39,9 @@ using namespace llvm; namespace { +STATISTIC(NumMFMAsRewrittenToAGPR, + "Number of MFMA instructions rewritten to use AGPR form"); + class AMDGPURewriteAGPRCopyMFMAImpl { MachineFunction &MF; const GCNSubtarget &ST; @@ -60,6 +68,25 @@ public: return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1; } + /// Find AV_* registers assigned to AGPRs (or virtual registers which were + /// already required to be AGPR). + /// + /// \return the assigned physical register that \p VReg is assigned to if it + /// is an AGPR, otherwise MCRegister(). + MCRegister getAssignedAGPR(Register VReg) const { + MCRegister PhysReg = VRM.getPhys(VReg); + if (!PhysReg) + return MCRegister(); + + // If this is an AV register, we have to check if the actual assignment is + // to an AGPR + const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); + return TRI.isAGPRClass(AssignedRC) ? PhysReg : MCRegister(); + } + + bool tryReassigningMFMAChain(MachineInstr &MFMA, Register MFMAHintReg, + MCPhysReg PhysRegHint) const; + /// Compute the register class constraints based on the uses of \p Reg, /// excluding MFMA uses from which can be rewritten to change the register /// class constraint. 
This should be nearly identical to @@ -74,6 +101,8 @@ public: Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates, SmallSetVector<Register, 4> &RewriteRegs) const; + bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const; + bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const; bool run(MachineFunction &MF) const; }; @@ -154,6 +183,88 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( return true; } +bool AMDGPURewriteAGPRCopyMFMAImpl::tryReassigningMFMAChain( + MachineInstr &MFMA, Register MFMAHintReg, MCPhysReg PhysRegHint) const { + // src2 and dst have the same physical class constraint; try to preserve + // the original src2 subclass if one were to exist. + SmallVector<MachineInstr *, 4> RewriteCandidates = {&MFMA}; + SmallSetVector<Register, 4> RewriteRegs; + + // Make sure we reassign the MFMA we found the copy from first. We want + // to ensure dst ends up in the physreg we were originally copying to. + RewriteRegs.insert(MFMAHintReg); + + // We've found av = COPY (MFMA) (or MFMA (v = COPY av)) and need to verify + // that we can trivially rewrite src2 to use the new AGPR. If we can't + // trivially replace it, we're going to induce as many copies as we would have + // emitted in the first place, as well as need to assign another register, and + // need to figure out where to put them. The live range splitting is smarter + // than anything we're doing here, so trust it did something reasonable. + // + // Note recomputeRegClassExceptRewritable will consider the constraints of + // this MFMA's src2 as well as the src2/dst of any transitive MFMA users. 
+ if (!recomputeRegClassExceptRewritable(MFMAHintReg, RewriteCandidates, + RewriteRegs)) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg " + << printReg(MFMAHintReg, &TRI) << '\n'); + return false; + } + + // If src2 and dst are different registers, we need to also reassign the + // input to an available AGPR if it is compatible with all other uses. + // + // If we can't reassign it, we'd need to introduce a different copy + // which is likely worse than the copy we'd be saving. + // + // It's likely that the MFMA is used in sequence with other MFMAs; if we + // cannot migrate the full use/def chain of MFMAs, we would need to + // introduce intermediate copies somewhere. So we only make the + // transform if all the interfering MFMAs can also be migrated. Collect + // the set of rewritable MFMAs and check if we can assign an AGPR at + // that point. + // + // If any of the MFMAs aren't reassignable, we give up and rollback to + // the original register assignments. + + using RecoloringStack = + SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>; + RecoloringStack TentativeReassignments; + + for (Register RewriteReg : RewriteRegs) { + LiveInterval &LI = LIS.getInterval(RewriteReg); + TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)}); + LRM.unassign(LI); + } + + if (!attemptReassignmentsToAGPR(RewriteRegs, PhysRegHint)) { + // Roll back the register assignments to the original state. + for (auto [LI, OldAssign] : TentativeReassignments) { + if (VRM.hasPhys(LI->reg())) + LRM.unassign(*LI); + LRM.assign(*LI, OldAssign); + } + + return false; + } + + // Fixup the register classes of the virtual registers now that we've + // committed to the reassignments. 
+ for (Register InterferingReg : RewriteRegs) { + const TargetRegisterClass *EquivalentAGPRRegClass = + TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); + MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass); + } + + for (MachineInstr *RewriteCandidate : RewriteCandidates) { + int NewMFMAOp = + AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode()); + RewriteCandidate->setDesc(TII.get(NewMFMAOp)); + ++NumMFMAsRewrittenToAGPR; + } + + return true; +} + /// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a /// preference to use \p PhysReg first. Returns false if the reassignments /// cannot be trivially performed. @@ -206,140 +317,104 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::attemptReassignmentsToAGPR( return true; } -bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { - // This only applies on subtargets that have a configurable AGPR vs. VGPR - // allocation. - if (!ST.hasGFX90AInsts()) - return false; - - // Early exit if no AGPRs were assigned. - if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) { - LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n"); - return false; - } - +/// Identify copies that look like: +/// %vdst:vgpr = V_MFMA_.. %src0:av, %src1:av, %src2:vgpr +/// %agpr = COPY %vgpr +/// +/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR +/// versions of the MFMA. This should cover the common case. +bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesToAGPR( + Register VReg, MCRegister AssignedAGPR) const { bool MadeChange = false; - - for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - Register VReg = Register::index2VirtReg(I); - Register PhysReg = VRM.getPhys(VReg); - if (!PhysReg) + for (MachineInstr &UseMI : MRI.def_instructions(VReg)) { + if (!UseMI.isCopy()) continue; - // Find AV_* registers assigned to AGPRs. 
- const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); - if (!TRI.hasAGPRs(VirtRegRC)) + Register CopySrcReg = UseMI.getOperand(1).getReg(); + if (!CopySrcReg.isVirtual()) continue; - const TargetRegisterClass *AssignedRC = VirtRegRC; - if (TRI.hasVGPRs(VirtRegRC)) { - // If this is an AV register, we have to check if the actual assignment is - // to an AGPR - AssignedRC = TRI.getPhysRegBaseClass(PhysReg); - if (!TRI.isAGPRClass(AssignedRC)) - continue; + // TODO: Handle loop phis copied to AGPR. e.g. + // + // loop: + // %phi:vgpr = COPY %mfma:vgpr + // %mfma:vgpr = V_MFMA_xxx_vgprcd_e64 %a, %b, %phi + // s_cbranch_vccnz loop + // + // endloop: + // %agpr = mfma + // + // We need to be sure that %phi is assigned to the same physical register as + // %mfma, or else we will just be moving copies into the loop. + + for (MachineInstr &CopySrcDefMI : MRI.def_instructions(CopySrcReg)) { + if (isRewriteCandidate(CopySrcDefMI) && + tryReassigningMFMAChain( + CopySrcDefMI, CopySrcDefMI.getOperand(0).getReg(), AssignedAGPR)) + MadeChange = true; } + } - LiveInterval &LI = LIS.getInterval(VReg); - - for (VNInfo *VNI : LI.vnis()) { - if (VNI->isPHIDef() || VNI->isUnused()) - continue; - - MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); - if (!DefMI || !DefMI->isCopy()) - continue; + return MadeChange; +} - Register MFMADstReg = DefMI->getOperand(1).getReg(); - if (!MFMADstReg.isVirtual()) - continue; +/// Identify copies that look like: +/// %src:vgpr = COPY %src:agpr +/// %vdst:vgpr = V_MFMA_... %src0:av, %src1:av, %src:vgpr +/// +/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR +/// versions of the MFMA. This should cover rarer cases, and will generally be +/// redundant with tryFoldCopiesToAGPR. 
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR( + Register VReg, MCRegister AssignedAGPR) const { + bool MadeChange = false; + for (MachineInstr &UseMI : MRI.use_instructions(VReg)) { + if (!UseMI.isCopy()) + continue; - LiveInterval &CopySrcLI = LIS.getInterval(MFMADstReg); - LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot()); - MachineInstr *MFMA = LIS.getInstructionFromIndex(LRQ.valueIn()->def); - if (!MFMA || !isRewriteCandidate(*MFMA)) + Register CopyDstReg = UseMI.getOperand(0).getReg(); + if (!CopyDstReg.isVirtual()) + continue; + for (MachineOperand &CopyUseMO : MRI.reg_nodbg_operands(CopyDstReg)) { + if (!CopyUseMO.readsReg()) continue; - // src2 and dst have the same physical class constraint; try to preserve - // the original src2 subclass if one were to exist. - SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA}; - SmallSetVector<Register, 4> RewriteRegs; - - // Make sure we reassign the MFMA we found the copy from first. We want - // to ensure dst ends up in the physreg we were originally copying to. - RewriteRegs.insert(MFMADstReg); - - // We've found av = COPY (MFMA), and need to verify that we can trivially - // rewrite src2 to use the new AGPR. If we can't trivially replace it, - // we're going to induce as many copies as we would have emitted in the - // first place, as well as need to assign another register, and need to - // figure out where to put them. The live range splitting is smarter than - // anything we're doing here, so trust it did something reasonable. - // - // Note recomputeRegClassExceptRewritable will consider the constraints of - // this MFMA's src2 as well as the src2/dst of any transitive MFMA users. 
- if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates, - RewriteRegs)) { - LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg " - << printReg(MFMADstReg, &TRI) << '\n'); - continue; + MachineInstr &CopyUseMI = *CopyUseMO.getParent(); + if (isRewriteCandidate(CopyUseMI)) { + if (tryReassigningMFMAChain(CopyUseMI, CopyDstReg, + VRM.getPhys(CopyDstReg))) + MadeChange = true; } + } + } - // If src2 and dst are different registers, we need to also reassign the - // input to an available AGPR if it is compatible with all other uses. - // - // If we can't reassign it, we'd need to introduce a different copy - // which is likely worse than the copy we'd be saving. - // - // It's likely that the MFMA is used in sequence with other MFMAs; if we - // cannot migrate the full use/def chain of MFMAs, we would need to - // introduce intermediate copies somewhere. So we only make the - // transform if all the interfering MFMAs can also be migrated. Collect - // the set of rewritable MFMAs and check if we can assign an AGPR at - // that point. - // - // If any of the MFMAs aren't reassignable, we give up and rollback to - // the original register assignments. - - using RecoloringStack = - SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>; - RecoloringStack TentativeReassignments; - - for (Register RewriteReg : RewriteRegs) { - LiveInterval &LI = LIS.getInterval(RewriteReg); - TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)}); - LRM.unassign(LI); - } + return MadeChange; +} - if (!attemptReassignmentsToAGPR(RewriteRegs, PhysReg)) { - // Roll back the register assignments to the original state. - for (auto [LI, OldAssign] : TentativeReassignments) { - if (VRM.hasPhys(LI->reg())) - LRM.unassign(*LI); - LRM.assign(*LI, OldAssign); - } +bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { + // This only applies on subtargets that have a configurable AGPR vs. VGPR + // allocation. 
+ if (!ST.hasGFX90AInsts()) + return false; - continue; - } + // Early exit if no AGPRs were assigned. + if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) { + LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n"); + return false; + } - // Fixup the register classes of the virtual registers now that we've - // committed to the reassignments. - for (Register InterferingReg : RewriteRegs) { - const TargetRegisterClass *EquivalentAGPRRegClass = - TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); - MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass); - } + bool MadeChange = false; - for (MachineInstr *RewriteCandidate : RewriteCandidates) { - int NewMFMAOp = - AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode()); - RewriteCandidate->setDesc(TII.get(NewMFMAOp)); - } + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + Register VReg = Register::index2VirtReg(I); + MCRegister AssignedAGPR = getAssignedAGPR(VReg); + if (!AssignedAGPR) + continue; - // We likely left an identity copy behind after assignment; let - // VirtRegRewriter deal with it later. + if (tryFoldCopiesToAGPR(VReg, AssignedAGPR)) + MadeChange = true; + if (tryFoldCopiesFromAGPR(VReg, AssignedAGPR)) MadeChange = true; - } } return MadeChange; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index d095fc6cf954..73acb1ddbd2a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -210,18 +210,10 @@ AMDGPUSubtarget::getWavesPerEU(const Function &F) const { // Default/requested minimum/maximum flat work group sizes. std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); // Minimum number of bytes allocated in the LDS. 
- unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", - {0, UINT32_MAX}, true) - .first; - return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F); -} - -std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( - const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const { - // Minimum number of bytes allocated in the LDS. - unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", - {0, UINT32_MAX}, true) - .first; + unsigned LDSBytes = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", {0, UINT32_MAX}, + /*OnlyFirstRequired=*/true) + .first; return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F); } @@ -237,11 +229,31 @@ AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes, return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes); } -static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) { +std::optional<unsigned> +AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel, + unsigned Dim) const { auto *Node = Kernel.getMetadata("reqd_work_group_size"); if (Node && Node->getNumOperands() == 3) return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue(); - return std::numeric_limits<unsigned>::max(); + return std::nullopt; +} + +bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim( + const Function &F, bool RequiresUniformYZ) const { + auto *Node = F.getMetadata("reqd_work_group_size"); + if (!Node || Node->getNumOperands() != 3) + return false; + unsigned XLen = + mdconst::extract<ConstantInt>(Node->getOperand(0))->getZExtValue(); + unsigned YLen = + mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue(); + unsigned ZLen = + mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue(); + + bool Is1D = YLen <= 1 && ZLen <= 1; + bool IsXLargeEnough = + isPowerOf2_32(XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize()); + return Is1D || IsXLargeEnough; } bool AMDGPUSubtarget::isMesaKernel(const Function 
&F) const { @@ -250,9 +262,9 @@ bool AMDGPUSubtarget::isMesaKernel(const Function &F) const { unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const { - unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension); - if (ReqdSize != std::numeric_limits<unsigned>::max()) - return ReqdSize - 1; + std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dimension); + if (ReqdSize) + return *ReqdSize - 1; return getFlatWorkGroupSizes(Kernel).second - 1; } @@ -303,9 +315,9 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } if (Dim <= 3) { - unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim); - if (ReqdSize != std::numeric_limits<unsigned>::max()) - MinSize = MaxSize = ReqdSize; + std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim); + if (ReqdSize) + MinSize = MaxSize = *ReqdSize; } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 6878744496cf..57b757c990e1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -100,6 +100,26 @@ public: /// be converted to integer, or violate subtarget's specifications. std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; + /// \returns The required size of workgroups that will be used to execute \p F + /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size` + /// metadata. Otherwise, returns std::nullopt. + std::optional<unsigned> getReqdWorkGroupSize(const Function &F, + unsigned Dim) const; + + /// \returns true if \p F will execute in a manner that leaves the X + /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X / + /// wavefrontsize is uniform. This is true if either the Y and Z block + /// dimensions are known to always be 1 or if the X dimension will always be a + /// power of 2. 
If \p RequireUniformYZ is true, it also ensures that the Y and + /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with + /// wavesize64 would ordinarily pass this test, it won't with + /// \pRequiresUniformYZ). + /// + /// This information is currently only gathered from the !reqd_work_group_size + /// metadata on \p F, but this may be improved in the future. + bool hasWavefrontsEvenlySplittingXDim(const Function &F, + bool REquiresUniformYZ = false) const; + /// \returns Subtarget's default pair of minimum/maximum number of waves per /// execution unit for function \p F, or minimum/maximum number of waves per /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e969f9ec8889..9afe7590fe4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -22,6 +22,7 @@ #include "AMDGPUExportKernelRuntimeHandles.h" #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" +#include "AMDGPULowerVGPREncoding.h" #include "AMDGPUMacroFusion.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUPreloadKernArgProlog.h" @@ -577,12 +578,14 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR); initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPULowerBufferFatPointersPass(*PR); + initializeAMDGPULowerIntrinsicsLegacyPass(*PR); initializeAMDGPUReserveWWMRegsLegacyPass(*PR); initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPURewriteUndefForPHILegacyPass(*PR); initializeSIAnnotateControlFlowLegacyPass(*PR); initializeAMDGPUInsertDelayAluLegacyPass(*PR); + initializeAMDGPULowerVGPREncodingLegacyPass(*PR); initializeSIInsertHardClausesLegacyPass(*PR); initializeSIInsertWaitcntsLegacyPass(*PR); 
initializeSIModeRegisterLegacyPass(*PR); @@ -1418,6 +1421,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // nodes out of the graph, which leads to function-level passes not // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); + addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the // function-level passes into a CGSCCPassManager. addPass(new DummyCGSCCPass()); @@ -1797,6 +1801,8 @@ void GCNPassConfig::addPreEmitPass() { addPass(&AMDGPUWaitSGPRHazardsLegacyID); + addPass(&AMDGPULowerVGPREncodingLegacyID); + if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); @@ -2155,9 +2161,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // nodes out of the graph, which leads to function-level passes not // being run on them, which causes crashes in the resource usage analysis). addPass(AMDGPULowerBufferFatPointersPass(TM)); - addPass.requireCGSCCOrder(); + addPass(AMDGPULowerIntrinsicsPass(TM)); + Base::addCodeGenPrepare(addPass); if (isPassEnabled(EnableLoadStoreVectorizer)) @@ -2383,6 +2390,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { // cases. 
addPass(PostRAHazardRecognizerPass()); addPass(AMDGPUWaitSGPRHazardsPass()); + addPass(AMDGPULowerVGPREncodingPass()); if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) { addPass(AMDGPUInsertDelayAluPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 846a0b6280f1..3e2b2c351056 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" @@ -1003,6 +1004,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { DstAS == AMDGPUAS::FLAT_ADDRESS && ST->hasGloballyAddressableScratch(); } + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: { + const Function *F = Intrinsic->getFunction(); + bool HasUniformYZ = + ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true); + std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize( + *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2); + return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1); + } default: return AMDGPU::isIntrinsicSourceOfDivergence(IID); } @@ -1049,28 +1059,31 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { // packed into a same wave which gives 1 and 0 after the division by 64 // respectively. // - // FIXME: limit it to 1D kernels only, although that shall be possible - // to perform this optimization is the size of the X dimension is a power - // of 2, we just do not currently have infrastructure to query it. + // The X dimension doesn't reset within a wave if either both the Y + // and Z dimensions are of length 1, or if the X dimension's required + // size is a power of 2. 
Note, however, if the X dimension's maximum + // size is a power of 2 < the wavefront size, division by the wavefront + // size is guaranteed to yield 0, so this is also a no-reset case. + bool XDimDoesntResetWithinWaves = false; + if (auto *I = dyn_cast<Instruction>(V)) { + const Function *F = I->getFunction(); + XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F); + } using namespace llvm::PatternMatch; uint64_t C; if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), m_ConstantInt(C))) || match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), m_ConstantInt(C)))) { - const Function *F = cast<Instruction>(V)->getFunction(); - return C >= ST->getWavefrontSizeLog2() && - ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0; + return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves; } Value *Mask; if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), m_Value(Mask)))) { - const Function *F = cast<Instruction>(V)->getFunction(); - const DataLayout &DL = F->getDataLayout(); return computeKnownBits(Mask, DL).countMinTrailingZeros() >= ST->getWavefrontSizeLog2() && - ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0; + XDimDoesntResetWithinWaves; } const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2e21ba4c30b5..e420f2ad676f 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -564,6 +564,14 @@ public: return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } + bool isVCSrc_b32_Lo256() const { + return isRegOrInlineNoMods(AMDGPU::VS_32_Lo256RegClassID, MVT::i32); + } + + bool isVCSrc_b64_Lo256() const { + return isRegOrInlineNoMods(AMDGPU::VS_64_Lo256RegClassID, MVT::i64); + } + bool isVCSrc_b64() const { return 
isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64); } @@ -1007,7 +1015,7 @@ public: bool isEndpgm() const; auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const { - return [=](){ return P(*this); }; + return [this, P]() { return P(*this); }; } StringRef getToken() const { @@ -1886,6 +1894,7 @@ private: bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands, const unsigned CPol); bool validateTFE(const MCInst &Inst, const OperandVector &Operands); + bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands); std::optional<StringRef> validateLdsDirect(const MCInst &Inst); bool validateWMMA(const MCInst &Inst, const OperandVector &Operands); unsigned getConstantBusLimit(unsigned Opcode) const; @@ -2985,7 +2994,12 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum, const MCRegisterInfo *TRI = getContext().getRegisterInfo(); const MCRegisterClass RC = TRI->getRegClass(RCID); - if (RegIdx >= RC.getNumRegs()) { + if (RegIdx >= RC.getNumRegs() || (RegKind == IS_VGPR && RegIdx > 255)) { + Error(Loc, "register index is out of range"); + return AMDGPU::NoRegister; + } + + if (RegKind == IS_VGPR && !isGFX1250() && RegIdx + RegWidth / 32 > 256) { Error(Loc, "register index is out of range"); return MCRegister(); } @@ -4768,12 +4782,14 @@ bool AMDGPUAsmParser::validateOffset(const MCInst &Inst, return validateSMEMOffset(Inst, Operands); const auto &Op = Inst.getOperand(OpNum); + // GFX12+ buffer ops: InstOffset is signed 24, but must not be a negative. 
if (isGFX12Plus() && (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))) { const unsigned OffsetSize = 24; - if (!isIntN(OffsetSize, Op.getImm())) { + if (!isUIntN(OffsetSize - 1, Op.getImm())) { Error(getFlatOffsetLoc(Operands), - Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset"); + Twine("expected a ") + Twine(OffsetSize - 1) + + "-bit unsigned offset for buffer ops"); return false; } } else { @@ -4856,7 +4872,9 @@ bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst, return true; Error(getSMEMOffsetLoc(Operands), - isGFX12Plus() ? "expected a 24-bit signed offset" + isGFX12Plus() && IsBuffer + ? "expected a 23-bit unsigned offset for buffer ops" + : isGFX12Plus() ? "expected a 24-bit signed offset" : (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset" : "expected a 21-bit signed offset"); @@ -5216,7 +5234,7 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const { bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { auto FB = getFeatureBits(); - if (!FB[AMDGPU::FeatureGFX90AInsts] && !FB[AMDGPU::FeatureGFX1250Insts]) + if (!FB[AMDGPU::FeatureRequiresAlignedVGPRs]) return true; unsigned Opc = Inst.getOpcode(); @@ -5542,6 +5560,22 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst, return true; } +bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst, + const OperandVector &Operands) { + if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12) + return true; + + int Simm16Pos = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16); + if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) { + SMLoc Loc = Operands[1]->getStartLoc(); + Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, const OperandVector &Operands) { unsigned Opc = Inst.getOpcode(); @@ -5706,6 +5740,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateTFE(Inst, Operands)) { 
return false; } + if (!validateSetVgprMSB(Inst, Operands)) { + return false; + } if (!validateWMMA(Inst, Operands)) { return false; } @@ -5799,6 +5836,7 @@ bool AMDGPUAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t &ErrorInfo, bool MatchingInlineAsm) { MCInst Inst; + Inst.setLoc(IDLoc); unsigned Result = Match_Success; for (auto Variant : getMatchedVariants()) { uint64_t EI; @@ -5822,7 +5860,6 @@ bool AMDGPUAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (!validateInstruction(Inst, IDLoc, Operands)) { return true; } - Inst.setLoc(IDLoc); Out.emitInstruction(Inst, getSTI()); return false; } @@ -6144,12 +6181,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; - } else if (ID == ".amdhsa_uses_cu_stores") { - if (!isGFX1250()) - return Error(IDRange.Start, "directive requires gfx12.5", IDRange); - - PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 1956a15c57d6..f229298ba516 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -132,7 +132,6 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> : let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let SchedRW = ps.SchedRW; let mayLoad = ps.mayLoad; @@ -159,11 +158,10 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> : bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } -class getMTBUFInsDA<list<RegisterClass> vdataList, +class getMTBUFInsDA<list<RegisterOperand> vdataList, list<RegisterClass> vaddrList=[], bit 
hasRestrictedSOffset> { - RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterOperand vdata_op = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); - RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); @@ -179,7 +177,7 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, !con((ins vdata_op:$vdata), Inputs)); } -class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit hasRestrictedSOffset> { +class getMTBUFIns<int addrKind, list<RegisterOperand> vdataList=[], bit hasRestrictedSOffset> { dag ret = !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList, [], hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret, @@ -218,25 +216,23 @@ class MTBUF_SetupAddr<int addrKind> { class MTBUF_Load_Pseudo <string opName, int addrKind, - RegisterClass vdataClass, + RegisterOperand vdataClass, int elems, bit hasRestrictedSOffset = 0, - list<dag> pattern=[], - // Workaround bug bz30254 - int addrKindCopy = addrKind> + list<dag> pattern=[]> : MTBUF_Pseudo<opName, - (outs getLdStRegisterOperand<vdataClass>.ret:$vdata), - getMTBUFIns<addrKindCopy, [], hasRestrictedSOffset>.ret, - getMTBUFAsmOps<addrKindCopy>.ret, + (outs vdataClass:$vdata), + getMTBUFIns<addrKind, [], hasRestrictedSOffset>.ret, + getMTBUFAsmOps<addrKind>.ret, pattern>, - MTBUF_SetupAddr<addrKindCopy> { - let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + MTBUF_SetupAddr<addrKind> { + let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret; let mayLoad = 1; let mayStore = 0; let elements = elems; } -multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterClass vdataClass, +multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterOperand vdataClass, int elems, bit hasRestrictedSOffset> { def _OFFSET : 
MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasRestrictedSOffset>, @@ -257,7 +253,7 @@ multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterClass vdataClass, } } -multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, +multiclass MTBUF_Pseudo_Loads<string opName, RegisterOperand vdataClass, int elems> { defm NAME : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 0>; defm _VBUFFER : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 1>; @@ -265,26 +261,23 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, class MTBUF_Store_Pseudo <string opName, int addrKind, - RegisterClass vdataClass, + RegisterOperand vdataClass, int elems, bit hasRestrictedSOffset = 0, - list<dag> pattern=[], - // Workaround bug bz30254 - int addrKindCopy = addrKind, - RegisterClass vdataClassCopy = vdataClass> + list<dag> pattern=[]> : MTBUF_Pseudo<opName, (outs), - getMTBUFIns<addrKindCopy, [vdataClassCopy], hasRestrictedSOffset>.ret, - getMTBUFAsmOps<addrKindCopy>.ret, + getMTBUFIns<addrKind, [vdataClass], hasRestrictedSOffset>.ret, + getMTBUFAsmOps<addrKind>.ret, pattern>, - MTBUF_SetupAddr<addrKindCopy> { - let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + MTBUF_SetupAddr<addrKind> { + let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret; let mayLoad = 0; let mayStore = 1; let elements = elems; } -multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterClass vdataClass, +multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterOperand vdataClass, int elems, bit hasRestrictedSOffset> { def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasRestrictedSOffset>, @@ -305,7 +298,7 @@ multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterClass vdataClass, } } -multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, +multiclass MTBUF_Pseudo_Stores<string opName, RegisterOperand vdataClass, int elems> { defm NAME : 
MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 0>; defm _VBUFFER : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 1>; @@ -346,7 +339,6 @@ class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> : let AsmMatchConverter = ps.AsmMatchConverter; let OtherPredicates = ps.OtherPredicates; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let SchedRW = ps.SchedRW; @@ -401,21 +393,29 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : let sccb_value = 0; } -class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> { - RegisterOperand tfeVDataOp = - !cond(!eq(RC.Size, 32) : AVLdSt_64, - !eq(RC.Size, 64) : AVLdSt_96, - !eq(RC.Size, 96) : AVLdSt_128, - !eq(RC.Size, 128) : AVLdSt_160); +class getBUFVDataRegisterOperand<int Size, bit isTFE> { + defvar tfeVDataOp = + !cond(!eq(Size, 16) : AVLdSt_64, + !eq(Size, 32) : AVLdSt_64, + !eq(Size, 64) : AVLdSt_96, + !eq(Size, 96) : AVLdSt_128, + !eq(Size, 128) : AVLdSt_160); + + defvar VDataOp = + !cond(!eq(Size, 16) : AVLdSt_32, + !eq(Size, 32) : AVLdSt_32, + !eq(Size, 64) : AVLdSt_64, + !eq(Size, 96) : AVLdSt_96, + !eq(Size, 128) : AVLdSt_128); - RegisterOperand ret = !if(isTFE, tfeVDataOp, getLdStRegisterOperand<RC>.ret); + RegisterOperand ret = !if(isTFE, tfeVDataOp, VDataOp); } -class getMUBUFInsDA<list<RegisterClass> vdataList, +class getMUBUFInsDA<list<RegisterOperand> vdataList, list<RegisterClass> vaddrList, bit isTFE, bit hasRestrictedSOffset> { - RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); - RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret; + RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdataClass.RegClass.Size, isTFE>.ret; dag SOffset = 
!if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); @@ -443,7 +443,7 @@ class getMUBUFElements<ValueType vt> { ); } -class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE, bit hasRestrictedSOffset> { +class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset> { dag ret = !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, @@ -491,19 +491,16 @@ class MUBUF_Load_Pseudo <string opName, bit isTFE = 0, bit hasRestrictedSOffset = 0, list<dag> pattern=[], - // Workaround bug bz30254 - int addrKindCopy = addrKind, - RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret.RegClass, - RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc, isTFE>.ret> + RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdata_vt.Size, isTFE>.ret> : MUBUF_Pseudo<opName, !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)), - !con(getMUBUFIns<addrKindCopy, [], isTFE, hasRestrictedSOffset>.ret, + !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset>.ret, !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), - getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE>.ret, + getMUBUFAsmOps<addrKind, !or(isLds, isLdsOpc), isLds, isTFE>.ret, pattern>, - MUBUF_SetupAddr<addrKindCopy> { + MUBUF_SetupAddr<addrKind> { let PseudoInstr = opName # !if(isLds, "_lds", "") # !if(isTFE, "_tfe", "") # - "_" # getAddrName<addrKindCopy>.ret; + "_" # getAddrName<addrKind>.ret; let AsmMatchConverter = "cvtMubuf"; let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); @@ -593,17 +590,15 @@ class MUBUF_Store_Pseudo <string opName, ValueType store_vt, bit isTFE = 0, bit hasRestrictedSOffset = 0, - list<dag> pattern=[], - // 
Workaround bug bz30254 - int addrKindCopy = addrKind> + list<dag> pattern=[]> : MUBUF_Pseudo<opName, (outs), - getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret.RegClass], isTFE, hasRestrictedSOffset>.ret, - getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret, + getMUBUFIns<addrKind, [getVregSrcForVT<store_vt>.ret], isTFE, hasRestrictedSOffset>.ret, + getMUBUFAsmOps<addrKind, 0, 0, isTFE>.ret, pattern>, - MUBUF_SetupAddr<addrKindCopy> { + MUBUF_SetupAddr<addrKind> { let PseudoInstr = opName # "_" # !if(isTFE, "_tfe", "") # - getAddrName<addrKindCopy>.ret; + getAddrName<addrKind>.ret; let mayLoad = 0; let mayStore = 1; let elements = getMUBUFElements<store_vt>.ret; @@ -676,10 +671,9 @@ class MUBUF_Pseudo_Store_Lds<string opName> let AsmMatchConverter = "cvtMubuf"; } -class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasRestrictedSOffset, +class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset, list<RegisterClass> vaddrList=[]> { RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); - RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret; dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata)); dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr))); @@ -692,22 +686,20 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasRestric } class getMUBUFAtomicIns<int addrKind, - RegisterClass vdataClass, + RegisterOperand vdataClass, bit vdata_in, - bit hasRestrictedSOffset, - // Workaround bug bz30254 - RegisterClass vdataClassCopy=vdataClass> { + bit hasRestrictedSOffset> { dag ret = !if(!eq(addrKind, BUFAddrKind.Offset), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, + getMUBUFAtomicInsDA<vdataClass, 
vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, !if(!eq(addrKind, BUFAddrKind.IdxEn), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, !if(!eq(addrKind, BUFAddrKind.BothEn), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, !if(!eq(addrKind, BUFAddrKind.Addr64), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret, (ins)))))); } @@ -716,11 +708,9 @@ class MUBUF_Atomic_Pseudo<string opName, dag outs, dag ins, string asmOps, - list<dag> pattern=[], - // Workaround bug bz30254 - int addrKindCopy = addrKind> + list<dag> pattern=[]> : MUBUF_Pseudo<opName, outs, ins, asmOps, pattern>, - MUBUF_SetupAddr<addrKindCopy> { + MUBUF_SetupAddr<addrKind> { let mayStore = 1; let mayLoad = 1; let hasSideEffects = 1; @@ -732,18 +722,15 @@ class MUBUF_Atomic_Pseudo<string opName, } class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind, - RegisterClass vdataClass, + RegisterOperand vdataClass, bit hasRestrictedSOffset = 0, - list<dag> pattern=[], - // Workaround bug bz30254 - int addrKindCopy = addrKind, - RegisterClass vdataClassCopy = vdataClass> - : MUBUF_Atomic_Pseudo<opName, addrKindCopy, + list<dag> pattern=[]> + : MUBUF_Atomic_Pseudo<opName, addrKind, (outs), - getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0, hasRestrictedSOffset>.ret, - getMUBUFAsmOps<addrKindCopy>.ret, + getMUBUFAtomicIns<addrKind, vdataClass, 0, hasRestrictedSOffset>.ret, + getMUBUFAsmOps<addrKind>.ret, pattern> { - let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret; let glc_value = 0; let dlc_value = 0; let sccb_value = 0; @@ -751,29 +738,24 @@ class MUBUF_AtomicNoRet_Pseudo<string 
opName, int addrKind, } class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, - RegisterClass vdataClass, + RegisterOperand vdata_op, bit hasRestrictedSOffset = 0, - list<dag> pattern=[], - // Workaround bug bz30254 - int addrKindCopy = addrKind, - RegisterClass vdataClassCopy = vdataClass, - RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> - : MUBUF_Atomic_Pseudo<opName, addrKindCopy, + list<dag> pattern=[]> + : MUBUF_Atomic_Pseudo<opName, addrKind, (outs vdata_op:$vdata), - getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1, hasRestrictedSOffset>.ret, - getMUBUFAsmOps<addrKindCopy>.ret, + getMUBUFAtomicIns<addrKind, vdata_op, 1, hasRestrictedSOffset>.ret, + getMUBUFAsmOps<addrKind>.ret, pattern> { - let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret; + let PseudoInstr = opName # "_rtn_" # getAddrName<addrKind>.ret; let glc_value = 1; let dlc_value = 0; let sccb_value = 0; let IsAtomicRet = 1; let Constraints = "$vdata = $vdata_in"; - let DisableEncoding = "$vdata_in"; } multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, - RegisterClass vdataClass, + RegisterOperand vdataClass, ValueType vdataType> { let FPAtomic = vdataType.isFP in { def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>, @@ -795,7 +777,7 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, } multiclass MUBUF_Pseudo_Atomics_RTN <string opName, - RegisterClass vdataClass, + RegisterOperand vdataClass, ValueType vdataType, SDPatternOperator atomic> { let FPAtomic = vdataType.isFP in { @@ -834,7 +816,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, } multiclass MUBUF_Pseudo_Atomics <string opName, - RegisterClass vdataClass, + RegisterOperand vdataClass, ValueType vdataType, SDPatternOperator atomic = null_frag> : MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>, @@ -1029,87 +1011,87 @@ defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>; } defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < 
- "buffer_atomic_swap", VGPR_32, i32 + "buffer_atomic_swap", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_cmpswap", VReg_64, v2i32 + "buffer_atomic_cmpswap", AVLdSt_64, v2i32 >; defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics < - "buffer_atomic_add", VGPR_32, i32 + "buffer_atomic_add", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub", VGPR_32, i32 + "buffer_atomic_sub", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin", VGPR_32, i32 + "buffer_atomic_smin", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin", VGPR_32, i32 + "buffer_atomic_umin", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax", VGPR_32, i32 + "buffer_atomic_smax", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax", VGPR_32, i32 + "buffer_atomic_umax", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics < - "buffer_atomic_and", VGPR_32, i32 + "buffer_atomic_and", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics < - "buffer_atomic_or", VGPR_32, i32 + "buffer_atomic_or", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor", VGPR_32, i32 + "buffer_atomic_xor", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc", VGPR_32, i32 + "buffer_atomic_inc", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec", VGPR_32, i32 + "buffer_atomic_dec", AVLdSt_32, i32 >; defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap_x2", VReg_64, i64 + "buffer_atomic_swap_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_cmpswap_x2", VReg_128, v2i64 + "buffer_atomic_cmpswap_x2", AVLdSt_128, v2i64 >; defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_add_x2", VReg_64, i64 + 
"buffer_atomic_add_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub_x2", VReg_64, i64 + "buffer_atomic_sub_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin_x2", VReg_64, i64 + "buffer_atomic_smin_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin_x2", VReg_64, i64 + "buffer_atomic_umin_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax_x2", VReg_64, i64 + "buffer_atomic_smax_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax_x2", VReg_64, i64 + "buffer_atomic_umax_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_and_x2", VReg_64, i64 + "buffer_atomic_and_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_or_x2", VReg_64, i64 + "buffer_atomic_or_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor_x2", VReg_64, i64 + "buffer_atomic_xor_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc_x2", VReg_64, i64 + "buffer_atomic_inc_x2", AVLdSt_64, i64 >; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec_x2", VReg_64, i64 + "buffer_atomic_dec_x2", AVLdSt_64, i64 >; let OtherPredicates = [HasGFX10_BEncoding] in { defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub + "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub >; } @@ -1130,22 +1112,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag + "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag >; } let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { 
defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmin", VGPR_32, f32, null_frag + "buffer_atomic_fmin", AVLdSt_32, f32, null_frag >; defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmax", VGPR_32, f32, null_frag + "buffer_atomic_fmax", AVLdSt_32, f32, null_frag >; } let SubtargetPredicate = isGFX6GFX7GFX10 in { defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag + "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag >; } @@ -1204,34 +1186,34 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate < let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN< - "buffer_atomic_add_f32", VGPR_32, f32 + "buffer_atomic_add_f32", AVLdSt_32, f32 >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_pk_add_f16", VGPR_32, v2f16 + "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16 >; let SubtargetPredicate = HasAtomicFaddRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< - "buffer_atomic_add_f32", VGPR_32, f32, null_frag + "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag + "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag >; let SubtargetPredicate = isGFX12Plus in { defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics < - "buffer_atomic_cond_sub_u32", VGPR_32, i32 + "buffer_atomic_cond_sub_u32", VGPROp_32, i32 >; } let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in { let FPAtomic = 1 in defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics < - "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16 + "buffer_atomic_pk_add_bf16", AVLdSt_32, v2bf16 >; } @@ -1239,39 +1221,39 @@ defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics < // MTBUF Instructions 
//===----------------------------------------------------------------------===// let OtherPredicates = [HasMTBUFInsts] in { -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", AVLdSt_32, 1>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", AVLdSt_64, 2>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", AVLdSt_96, 3>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", AVLdSt_128, 4>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", AVLdSt_32, 1>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", AVLdSt_64, 2>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", AVLdSt_96, 3>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", AVLdSt_128, 4>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { let TiedSourceNotRead = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; - defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads 
<"tbuffer_load_format_d16_xyz", VReg_96, 3>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>; -} - defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; - defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>; - defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>; - defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>; + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", AVLdSt_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", AVLdSt_64, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", AVLdSt_96, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", AVLdSt_128, 4>; +} + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", AVLdSt_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", AVLdSt_64, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", AVLdSt_96, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", AVLdSt_128, 4>; } // End HasUnpackedD16VMem. 
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { let TiedSourceNotRead = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; - defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>; -} - defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; - defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>; - defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>; - defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>; + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", AVLdSt_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", AVLdSt_32, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", AVLdSt_64, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", AVLdSt_64, 4>; +} + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", AVLdSt_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", AVLdSt_32, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", AVLdSt_64, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", AVLdSt_64, 4>; } // End HasPackedD16VMem. } // End HasMTBUFInsts. 
@@ -1300,14 +1282,14 @@ let SubtargetPredicate = isGFX90APlus in { } // End SubtargetPredicate = isGFX90APlus let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in { - defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>; + defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", AVLdSt_64, f64>; } // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2 // depending on some subtargets. - defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>; - defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>; + defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", AVLdSt_64, f64>; + defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", AVLdSt_64, f64>; } def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> { @@ -2414,7 +2396,6 @@ class VBUFFER_Real <bits<8> op, BUF_Pseudo ps, string real_name> : let AsmMatchConverter = ps.AsmMatchConverter; let OtherPredicates = ps.OtherPredicates; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let SchedRW = ps.SchedRW; diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index dc9dd220130e..aae56eef73ed 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -6,7 +6,10 @@ tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler + --specialize-decoders-per-bitwidth + 
-ignore-non-decodable-operands + -ignore-fully-defined-operands) tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) @@ -71,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUImageIntrinsicOptimizer.cpp AMDGPULibFunc.cpp AMDGPULowerBufferFatPointers.cpp + AMDGPULowerIntrinsics.cpp AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp @@ -82,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp AMDGPUIGroupLP.cpp + AMDGPULowerVGPREncoding.cpp AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 3ff675d6e5e9..f2e432fa8d7f 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -8,7 +8,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : InstSI <outs, ins, "", pattern>, - SIMCInstr <opName, SIEncodingFamily.NONE> { + SIMCInstr <NAME, SIEncodingFamily.NONE> { let LGKM_CNT = 1; let DS = 1; @@ -19,6 +19,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt // Most instruction load and store data, so set this as the default. 
let mayLoad = 1; let mayStore = 1; + let FixedSize = true; let hasSideEffects = 0; let SchedRW = [WriteLDS]; @@ -76,7 +77,6 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> : let isConvergent = ps.isConvergent; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; // encoding fields bits<10> vdst; @@ -91,16 +91,33 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> : let offset0 = !if(ps.has_offset, offset{7-0}, ?); let offset1 = !if(ps.has_offset, offset{15-8}, ?); - bits<1> acc = !if(ps.has_vdst, vdst{9}, - !if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0)); + // Figure out if we should set the acc bit. Simple load and store + // instructions with a single data operand can use AV_* classes, in + // which case the encoding comes from the assigned register field. + + // For more compliated cases with multiple data operands, since the + // register fields are only 8-bit, so data operands must all be AGPR + // or VGPR. + defvar DstOpIsAV = !if(ps.has_vdst, + VDstOperandIsAV<ps.OutOperandList>.ret, 0); + defvar DstOpIsAGPR = !if(ps.has_vdst, + VDstOperandIsAGPR<ps.OutOperandList>.ret, 0); + defvar DataOpIsAV = !if(!or(ps.has_data0, ps.has_gws_data0), + Data0OperandIsAV<ps.InOperandList>.ret, 0); + defvar DataOpIsAGPR = !if(!or(ps.has_data0, ps.has_gws_data0), + Data0OperandIsAGPR<ps.InOperandList>.ret, 0); + + bits<1> acc = !if(ps.has_vdst, + !if(DstOpIsAV, vdst{9}, DstOpIsAGPR), + !if(DataOpIsAV, data0{9}, DataOpIsAGPR)); } // DS Pseudo instructions -class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32> +class DS_0A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32> : DS_Pseudo<opName, (outs), - (ins getLdStRegisterOperand<rc>.ret:$data0, Offset:$offset, gds:$gds), + (ins rc:$data0, Offset:$offset, gds:$gds), " $data0$offset$gds"> { let has_addr = 0; @@ -108,10 +125,10 @@ class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32> let has_vdst = 0; } -class DS_1A1D_NORET<string opName, RegisterClass rc 
= VGPR_32> +class DS_1A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, getLdStRegisterOperand<rc>.ret:$data0, Offset:$offset, gds:$gds), + (ins VGPR_32:$addr, rc:$data0, Offset:$offset, gds:$gds), " $addr, $data0$offset$gds"> { let has_data1 = 0; @@ -119,7 +136,7 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32> let IsAtomicNoRet = 1; } -multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> { +multiclass DS_1A1D_NORET_mc<string opName, RegisterOperand rc = AVLdSt_32> { def "" : DS_1A1D_NORET<opName, rc>; let has_m0_read = 0 in { @@ -127,23 +144,23 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> { } } -multiclass DS_1A1D_NORET_t16<string opName, RegisterClass rc = VGPR_32> +multiclass DS_1A1D_NORET_t16<string opName, RegisterOperand rc = AVLdSt_32> : DS_1A1D_NORET_mc<opName, rc> { let has_m0_read = 0 in { let True16Predicate = UseRealTrue16Insts in { - def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>; + def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPROp_16>, + True16D16Table<NAME#"_D16_HI", NAME#"_gfx9">; } } } -multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> { +multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterOperand rc = AVLdSt_32> { let has_m0_read = 0 in { def "" : DS_1A1D_NORET<opName, rc>; } } -class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32, - RegisterOperand data_op = getLdStRegisterOperand<rc>.ret> +class DS_1A2D_NORET<string opName, RegisterOperand data_op = VGPROp_32> : DS_Pseudo<opName, (outs), (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds), @@ -153,16 +170,24 @@ class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32, let IsAtomicNoRet = 1; } -multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> { +// DS_xx2D cases should only be instantiated with VGPR operand classes. 
+multiclass DS_1A2D_NORET_mc<string opName, RegisterOperand rc = VGPROp_32> { + assert OperandIsVGPR<rc>.ret, + "DS with 2 data operands should be declared with VGPRs"; + def "" : DS_1A2D_NORET<opName, rc>; let has_m0_read = 0 in { def _gfx9 : DS_1A2D_NORET<opName, rc>; + + // All data operands are replaced with AGPRs in this form. + let SubtargetPredicate = isGFX90APlus in { + def _agpr : DS_1A2D_NORET<opName, getEquivalentAGPROperand<rc>.ret>; + } } } -class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32, - RegisterOperand data_op = getLdStRegisterOperand<rc>.ret> +class DS_1A2D_Off8_NORET <string opName, RegisterOperand data_op = VGPROp_32> : DS_Pseudo<opName, (outs), (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, @@ -173,17 +198,23 @@ class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32, let has_offset = 0; } -multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> { +multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterOperand rc = VGPROp_32> { + assert OperandIsVGPR<rc>.ret, + "DS with 2 data operands should be declared with VGPRs"; + def "" : DS_1A2D_Off8_NORET<opName, rc>; let has_m0_read = 0 in { def _gfx9 : DS_1A2D_Off8_NORET<opName, rc>; + + let SubtargetPredicate = isGFX90APlus in { + def _agpr : DS_1A2D_Off8_NORET<opName, getEquivalentAGPROperand<rc>.ret>; + } } } -class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass src = rc, - RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret, - RegisterOperand src_op = getLdStRegisterOperand<src>.ret> +class DS_0A1D_RET_GDS<string opName, RegisterOperand dst_op = AVLdSt_32, + RegisterOperand src_op = dst_op> : DS_Pseudo<opName, (outs dst_op:$vdst), (ins src_op:$data0, Offset:$offset), @@ -196,8 +227,7 @@ class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass s let hasSideEffects = 1; } -class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32, - RegisterOperand data_op = 
getLdStRegisterOperand<rc>.ret> +class DS_1A1D_RET <string opName, RegisterOperand data_op = AVLdSt_32> : DS_Pseudo<opName, (outs data_op:$vdst), (ins VGPR_32:$addr, data_op:$data0, Offset:$offset, gds:$gds), @@ -207,76 +237,84 @@ class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32, let IsAtomicRet = 1; } -multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32> { +multiclass DS_1A1D_RET_mc <string opName, RegisterOperand rc = VGPROp_32> { + assert OperandIsVGPR<rc>.ret, + "DS with 2 data operands should be declared with VGPRs"; + def "" : DS_1A1D_RET<opName, rc>; let has_m0_read = 0 in { def _gfx9 : DS_1A1D_RET<opName, rc>; + def _agpr : DS_1A1D_RET<opName, getEquivalentAGPROperand<rc>.ret>; } } -multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32> { +multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterOperand rc = VGPROp_32> { let has_m0_read = 0 in { def "" : DS_1A1D_RET<opName, rc>; + def _agpr : DS_1A1D_RET<opName, getEquivalentAGPROperand<rc>.ret>; } } class DS_1A2D_RET<string opName, - RegisterClass rc = VGPR_32, - RegisterClass src = rc, - RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret, - RegisterOperand src_op = getLdStRegisterOperand<src>.ret> -: DS_Pseudo<opName, - (outs dst_op:$vdst), - (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, Offset:$offset, gds:$gds), + RegisterOperand dst_rc = VGPROp_32, + RegisterOperand src_rc = dst_rc>: DS_Pseudo<opName, + (outs dst_rc:$vdst), + (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds), " $vdst, $addr, $data0, $data1$offset$gds"> { let IsAtomicRet = 1; } multiclass DS_1A2D_RET_mc<string opName, - RegisterClass rc = VGPR_32, - RegisterClass src = rc> { - def "" : DS_1A2D_RET<opName, rc, src>; + RegisterOperand dst_rc = VGPROp_32, + RegisterOperand src_rc = dst_rc> { + assert !and(OperandIsVGPR<dst_rc>.ret, OperandIsVGPR<src_rc>.ret), + "DS with 2 data operands should be declared with VGPRs"; + + def "" : DS_1A2D_RET<opName, dst_rc, 
src_rc>; let has_m0_read = 0 in { - def _gfx9 : DS_1A2D_RET<opName, rc, src>; + def _gfx9 : DS_1A2D_RET<opName, dst_rc, src_rc>; + def _agpr : DS_1A2D_RET<opName, getEquivalentAGPROperand<dst_rc>.ret, + getEquivalentAGPROperand<src_rc>.ret>; } } class DS_1A2D_Off8_RET<string opName, - RegisterClass rc = VGPR_32, - RegisterClass src = rc, - RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret, - RegisterOperand src_op = getLdStRegisterOperand<src>.ret> + RegisterOperand dst_rc = VGPROp_32, + RegisterOperand src_rc = dst_rc> : DS_Pseudo<opName, - (outs dst_op:$vdst), - (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds), + (outs dst_rc:$vdst), + (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds), " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> { let has_offset = 0; } multiclass DS_1A2D_Off8_RET_mc<string opName, - RegisterClass rc = VGPR_32, - RegisterClass src = rc> { - def "" : DS_1A2D_Off8_RET<opName, rc, src>; + RegisterOperand dst_rc = VGPROp_32, + RegisterOperand src_rc = dst_rc> { + assert !and(OperandIsVGPR<dst_rc>.ret, OperandIsVGPR<src_rc>.ret) , + "DS with 2 data operands should be declared with VGPRs"; + + def "" : DS_1A2D_Off8_RET<opName, dst_rc, src_rc>; let has_m0_read = 0 in { - def _gfx9 : DS_1A2D_Off8_RET<opName, rc, src>; + def _gfx9 : DS_1A2D_Off8_RET<opName, dst_rc, src_rc>; + def _agpr : DS_1A2D_Off8_RET<opName, getEquivalentAGPROperand<dst_rc>.ret, + getEquivalentAGPROperand<src_rc>.ret>; } } class DS_BVH_STACK<string opName, - RegisterClass vdst_rc, - RegisterClass data1_rc> + RegisterOperand vdst_rc, + RegisterOperand data1_rc> : DS_Pseudo<opName, - (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst, VGPR_32:$addr), - (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0, - data1_rc:$data1, Offset:$offset), + (outs vdst_rc:$vdst, VGPR_32:$addr), + (ins VGPR_32:$addr_in, VGPR_32:$data0, data1_rc:$data1, Offset:$offset), " $vdst, $addr, 
$data0, $data1$offset"> { let Constraints = "$addr = $addr_in"; - let DisableEncoding = "$addr_in"; let has_gds = 0; let gdsValue = 0; // TODO: Use MMOs in the LDS address space instead of hasSideEffects = 1. @@ -284,8 +322,8 @@ class DS_BVH_STACK<string opName, let SchedRW = [WriteLDS, WriteLDS]; } -class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset, - RegisterOperand data_op = getLdStRegisterOperand<rc>.ret> +class DS_1A_RET<string opName, RegisterOperand data_op = AVLdSt_32, + bit HasTiedOutput = 0, Operand ofs = Offset> : DS_Pseudo<opName, (outs data_op:$vdst), !if(HasTiedOutput, @@ -293,12 +331,12 @@ class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0 (ins VGPR_32:$addr, ofs:$offset, gds:$gds)), " $vdst, $addr$offset$gds"> { let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); - let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); let has_data0 = 0; let has_data1 = 0; } -multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset> { +multiclass DS_1A_RET_mc<string opName, RegisterOperand rc = AVLdSt_32, + bit HasTiedOutput = 0, Operand ofs = Offset> { def "" : DS_1A_RET<opName, rc, HasTiedOutput, ofs>; let has_m0_read = 0 in { @@ -306,27 +344,28 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu } } -multiclass DS_1A_RET_t16<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset> +multiclass DS_1A_RET_t16<string opName, RegisterOperand rc = AVLdSt_32, + bit HasTiedOutput = 0, Operand ofs = Offset> : DS_1A_RET_mc<opName, rc, HasTiedOutput, ofs> { let has_m0_read = 0 in { let True16Predicate = UseRealTrue16Insts in { - def "_t16" : DS_1A_RET<opName#"_t16", VGPR_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">; + def "_t16" : DS_1A_RET<opName#"_t16", VGPROp_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">; } } } 
-multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> { +multiclass DS_1A_RET_NoM0<string opName, RegisterOperand rc = VGPROp_32> { let has_m0_read = 0 in { def "" : DS_1A_RET<opName, rc>; } } -class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> : +class DS_1A_RET_Tied<string opName, RegisterOperand rc = AVLdSt_32> : DS_1A_RET<opName, rc, 1>; -class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32> +class DS_1A_Off8_RET <string opName, RegisterOperand rc = AVLdSt_32> : DS_Pseudo<opName, - (outs getLdStRegisterOperand<rc>.ret:$vdst), + (outs rc:$vdst), (ins VGPR_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds), " $vdst, $addr$offset0$offset1$gds"> { @@ -335,7 +374,7 @@ class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32> let has_data1 = 0; } -multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> { +multiclass DS_1A_Off8_RET_mc <string opName, RegisterOperand rc = VGPROp_32> { def "" : DS_1A_Off8_RET<opName, rc>; let has_m0_read = 0 in { @@ -344,7 +383,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> { } class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, - (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst), + (outs AVLdSt_32:$vdst), (ins VGPR_32:$addr, Offset:$offset), " $vdst, $addr$offset gds"> { @@ -369,7 +408,7 @@ class DS_1A_Off16_NORET <string opName> } class DS_0A_RET <string opName> : DS_Pseudo<opName, - (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst), + (outs AVLdSt_32:$vdst), (ins Offset:$offset, gds:$gds), " $vdst$offset$gds"> { @@ -424,7 +463,7 @@ class DS_GWS_0D <string opName> class DS_GWS_1D <string opName> : DS_GWS<opName, - (ins getLdStRegisterOperand<VGPR_32>.ret:$data0, Offset:$offset), + (ins AVLdSt_32:$data0, Offset:$offset), " $data0$offset gds"> { let has_gws_data0 = 1; @@ -449,7 +488,7 @@ class DS_VOID <string opName> : DS_Pseudo<opName, } class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag, - RegisterOperand data_op 
= getLdStRegisterOperand<VGPR_32>.ret> + RegisterOperand data_op = AVLdSt_32> : DS_Pseudo<opName, (outs data_op:$vdst), (ins VGPR_32:$addr, data_op:$data0, Offset:$offset), @@ -465,12 +504,75 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag, let has_gds = 0; } +multiclass DS_1A1D_PERMUTE_mc <string opName, SDPatternOperator node = null_frag, + RegisterOperand data_op = VGPROp_32> { + assert OperandIsVGPR<data_op>.ret, + "DS with 2 data operands should be declared with VGPRs"; + def "" : DS_1A1D_PERMUTE<opName, node, data_op>; + + let SubtargetPredicate = isGFX90APlus in { + def _agpr : DS_1A1D_PERMUTE<opName, null_frag, + getEquivalentAGPROperand<data_op>.ret>; + } +} + + class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), (inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> { let AddedComplexity = complexity; } +multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, + !cast<PatFrag>(frag#"_local_"#vt)>; + } + + let OtherPredicates = [HasGDS] in { + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt), + /* complexity */ 0, /* gds */ 1>; + } +} + +multiclass DSAtomicRetNoRetPat_NoM0_mc<DS_Pseudo inst, DS_Pseudo noRetInst, + ValueType vt, string frag> { + def : DSAtomicRetPat<inst, vt, + !cast<PatFrag>(frag#"_local_"#vt)>; + def : DSAtomicRetPat<noRetInst, vt, + !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>; +} + +multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, + ValueType vt, string frag> { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicRetPat<inst, vt, + 
!cast<PatFrag>(frag#"_local_m0_"#vt)>; + def : DSAtomicRetPat<noRetInst, vt, + !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + defm : DSAtomicRetNoRetPat_NoM0_mc< + !cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), + !cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), + vt, frag>; + } + + let OtherPredicates = [HasGDS] in { + def : DSAtomicRetPat<inst, vt, + !cast<PatFrag>(frag#"_region_m0_"#vt), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicRetPat<noRetInst, vt, + !cast<PatFrag>(frag#"_region_m0_noret_"#vt), + /* complexity */ 1, /* gds */ 1>; + } +} + defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">; defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">; defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">; @@ -516,100 +618,100 @@ def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">; } // End mayLoad = 0 let SubtargetPredicate = HasLdsAtomicAddF64 in { - defm DS_ADD_F64 : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", VReg_64>; - defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64>; + defm DS_ADD_F64 : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", AVLdSt_64>; + defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VGPROp_64>; } // End SubtargetPredicate = HasLdsAtomicAddF64 let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { - defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc<"ds_pk_add_f16">; - defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_f16", VGPR_32>; - defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc<"ds_pk_add_bf16">; - defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_bf16", VGPR_32>; + defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">; + defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16">; + defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">; + defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16">; } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">; defm 
DS_CMPSTORE_F32 : DS_1A2D_NORET_mc<"ds_cmpstore_f32">; -defm DS_CMPSTORE_B64 : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VReg_64>; -defm DS_CMPSTORE_F64 : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VReg_64>; -defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32", VGPR_32>; -defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32", VGPR_32>; -defm DS_CMPSTORE_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VReg_64>; -defm DS_CMPSTORE_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VReg_64>; +defm DS_CMPSTORE_B64 : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VGPROp_64>; +defm DS_CMPSTORE_F64 : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VGPROp_64>; +defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32">; +defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32">; +defm DS_CMPSTORE_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VGPROp_64>; +defm DS_CMPSTORE_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VGPROp_64>; defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">; defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">; -defm DS_ADD_U64 : DS_1A1D_NORET_mc<"ds_add_u64", VReg_64>; -defm DS_SUB_U64 : DS_1A1D_NORET_mc<"ds_sub_u64", VReg_64>; -defm DS_RSUB_U64 : DS_1A1D_NORET_mc<"ds_rsub_u64", VReg_64>; -defm DS_INC_U64 : DS_1A1D_NORET_mc<"ds_inc_u64", VReg_64>; -defm DS_DEC_U64 : DS_1A1D_NORET_mc<"ds_dec_u64", VReg_64>; -defm DS_MIN_I64 : DS_1A1D_NORET_mc<"ds_min_i64", VReg_64>; -defm DS_MAX_I64 : DS_1A1D_NORET_mc<"ds_max_i64", VReg_64>; -defm DS_MIN_U64 : DS_1A1D_NORET_mc<"ds_min_u64", VReg_64>; -defm DS_MAX_U64 : DS_1A1D_NORET_mc<"ds_max_u64", VReg_64>; -defm DS_AND_B64 : DS_1A1D_NORET_mc<"ds_and_b64", VReg_64>; -defm DS_OR_B64 : DS_1A1D_NORET_mc<"ds_or_b64", VReg_64>; -defm DS_XOR_B64 : DS_1A1D_NORET_mc<"ds_xor_b64", VReg_64>; -defm DS_MSKOR_B64 : DS_1A2D_NORET_mc<"ds_mskor_b64", VReg_64>; +defm DS_ADD_U64 : DS_1A1D_NORET_mc<"ds_add_u64", AVLdSt_64>; +defm DS_SUB_U64 : DS_1A1D_NORET_mc<"ds_sub_u64", 
AVLdSt_64>; +defm DS_RSUB_U64 : DS_1A1D_NORET_mc<"ds_rsub_u64", AVLdSt_64>; +defm DS_INC_U64 : DS_1A1D_NORET_mc<"ds_inc_u64", AVLdSt_64>; +defm DS_DEC_U64 : DS_1A1D_NORET_mc<"ds_dec_u64", AVLdSt_64>; +defm DS_MIN_I64 : DS_1A1D_NORET_mc<"ds_min_i64", AVLdSt_64>; +defm DS_MAX_I64 : DS_1A1D_NORET_mc<"ds_max_i64", AVLdSt_64>; +defm DS_MIN_U64 : DS_1A1D_NORET_mc<"ds_min_u64", AVLdSt_64>; +defm DS_MAX_U64 : DS_1A1D_NORET_mc<"ds_max_u64", AVLdSt_64>; +defm DS_AND_B64 : DS_1A1D_NORET_mc<"ds_and_b64", AVLdSt_64>; +defm DS_OR_B64 : DS_1A1D_NORET_mc<"ds_or_b64", AVLdSt_64>; +defm DS_XOR_B64 : DS_1A1D_NORET_mc<"ds_xor_b64", AVLdSt_64>; +defm DS_MSKOR_B64 : DS_1A2D_NORET_mc<"ds_mskor_b64", VGPROp_64>; let mayLoad = 0 in { -defm DS_WRITE_B64 : DS_1A1D_NORET_mc<"ds_write_b64", VReg_64>; -defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET_mc<"ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b64", VReg_64>; +defm DS_WRITE_B64 : DS_1A1D_NORET_mc<"ds_write_b64", AVLdSt_64>; +defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET_mc<"ds_write2_b64", VGPROp_64>; +defm DS_WRITE2ST64_B64: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b64", VGPROp_64>; } -defm DS_CMPST_B64 : DS_1A2D_NORET_mc<"ds_cmpst_b64", VReg_64>; -defm DS_CMPST_F64 : DS_1A2D_NORET_mc<"ds_cmpst_f64", VReg_64>; -defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", VReg_64>; -defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>; +defm DS_CMPST_B64 : DS_1A2D_NORET_mc<"ds_cmpst_b64", VGPROp_64>; +defm DS_CMPST_F64 : DS_1A2D_NORET_mc<"ds_cmpst_f64", VGPROp_64>; +defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", AVLdSt_64>; +defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", AVLdSt_64>; -defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32>; +defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32">; let SubtargetPredicate = HasLDSFPAtomicAddF32 in { -defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32>; -} -defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32>; -defm DS_RSUB_RTN_U32 : 
DS_1A1D_RET_mc<"ds_rsub_rtn_u32", VGPR_32>; -defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32", VGPR_32>; -defm DS_DEC_RTN_U32 : DS_1A1D_RET_mc<"ds_dec_rtn_u32", VGPR_32>; -defm DS_MIN_RTN_I32 : DS_1A1D_RET_mc<"ds_min_rtn_i32", VGPR_32>; -defm DS_MAX_RTN_I32 : DS_1A1D_RET_mc<"ds_max_rtn_i32", VGPR_32>; -defm DS_MIN_RTN_U32 : DS_1A1D_RET_mc<"ds_min_rtn_u32", VGPR_32>; -defm DS_MAX_RTN_U32 : DS_1A1D_RET_mc<"ds_max_rtn_u32", VGPR_32>; -defm DS_AND_RTN_B32 : DS_1A1D_RET_mc<"ds_and_rtn_b32", VGPR_32>; -defm DS_OR_RTN_B32 : DS_1A1D_RET_mc<"ds_or_rtn_b32", VGPR_32>; -defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32>; -defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32>; -defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32>; -defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32>; -defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32>; -defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32>; +defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32">; +} +defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32">; +defm DS_RSUB_RTN_U32 : DS_1A1D_RET_mc<"ds_rsub_rtn_u32">; +defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32">; +defm DS_DEC_RTN_U32 : DS_1A1D_RET_mc<"ds_dec_rtn_u32">; +defm DS_MIN_RTN_I32 : DS_1A1D_RET_mc<"ds_min_rtn_i32">; +defm DS_MAX_RTN_I32 : DS_1A1D_RET_mc<"ds_max_rtn_i32">; +defm DS_MIN_RTN_U32 : DS_1A1D_RET_mc<"ds_min_rtn_u32">; +defm DS_MAX_RTN_U32 : DS_1A1D_RET_mc<"ds_max_rtn_u32">; +defm DS_AND_RTN_B32 : DS_1A1D_RET_mc<"ds_and_rtn_b32">; +defm DS_OR_RTN_B32 : DS_1A1D_RET_mc<"ds_or_rtn_b32">; +defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPROp_32>; +defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPROp_32>; +defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPROp_32>; +defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32">; +defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32">; defm 
DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">; -defm DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>; -defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>; - -defm DS_ADD_RTN_U64 : DS_1A1D_RET_mc<"ds_add_rtn_u64", VReg_64>; -defm DS_SUB_RTN_U64 : DS_1A1D_RET_mc<"ds_sub_rtn_u64", VReg_64>; -defm DS_RSUB_RTN_U64 : DS_1A1D_RET_mc<"ds_rsub_rtn_u64", VReg_64>; -defm DS_INC_RTN_U64 : DS_1A1D_RET_mc<"ds_inc_rtn_u64", VReg_64>; -defm DS_DEC_RTN_U64 : DS_1A1D_RET_mc<"ds_dec_rtn_u64", VReg_64>; -defm DS_MIN_RTN_I64 : DS_1A1D_RET_mc<"ds_min_rtn_i64", VReg_64>; -defm DS_MAX_RTN_I64 : DS_1A1D_RET_mc<"ds_max_rtn_i64", VReg_64>; -defm DS_MIN_RTN_U64 : DS_1A1D_RET_mc<"ds_min_rtn_u64", VReg_64>; -defm DS_MAX_RTN_U64 : DS_1A1D_RET_mc<"ds_max_rtn_u64", VReg_64>; -defm DS_AND_RTN_B64 : DS_1A1D_RET_mc<"ds_and_rtn_b64", VReg_64>; -defm DS_OR_RTN_B64 : DS_1A1D_RET_mc<"ds_or_rtn_b64", VReg_64>; -defm DS_XOR_RTN_B64 : DS_1A1D_RET_mc<"ds_xor_rtn_b64", VReg_64>; -defm DS_MSKOR_RTN_B64 : DS_1A2D_RET_mc<"ds_mskor_rtn_b64", VReg_64>; -defm DS_CMPST_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b64", VReg_64>; -defm DS_CMPST_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f64", VReg_64>; -defm DS_MIN_RTN_F64 : DS_1A1D_RET_mc<"ds_min_rtn_f64", VReg_64>; -defm DS_MAX_RTN_F64 : DS_1A1D_RET_mc<"ds_max_rtn_f64", VReg_64>; - -defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>; -defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>; -defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>; +defm DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b32", VGPROp_64, VGPROp_32>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b32", VGPROp_64, VGPROp_32>; + +defm DS_ADD_RTN_U64 : DS_1A1D_RET_mc<"ds_add_rtn_u64", VGPROp_64>; +defm DS_SUB_RTN_U64 : DS_1A1D_RET_mc<"ds_sub_rtn_u64", VGPROp_64>; +defm DS_RSUB_RTN_U64 : 
DS_1A1D_RET_mc<"ds_rsub_rtn_u64", VGPROp_64>; +defm DS_INC_RTN_U64 : DS_1A1D_RET_mc<"ds_inc_rtn_u64", VGPROp_64>; +defm DS_DEC_RTN_U64 : DS_1A1D_RET_mc<"ds_dec_rtn_u64", VGPROp_64>; +defm DS_MIN_RTN_I64 : DS_1A1D_RET_mc<"ds_min_rtn_i64", VGPROp_64>; +defm DS_MAX_RTN_I64 : DS_1A1D_RET_mc<"ds_max_rtn_i64", VGPROp_64>; +defm DS_MIN_RTN_U64 : DS_1A1D_RET_mc<"ds_min_rtn_u64", VGPROp_64>; +defm DS_MAX_RTN_U64 : DS_1A1D_RET_mc<"ds_max_rtn_u64", VGPROp_64>; +defm DS_AND_RTN_B64 : DS_1A1D_RET_mc<"ds_and_rtn_b64", VGPROp_64>; +defm DS_OR_RTN_B64 : DS_1A1D_RET_mc<"ds_or_rtn_b64", VGPROp_64>; +defm DS_XOR_RTN_B64 : DS_1A1D_RET_mc<"ds_xor_rtn_b64", VGPROp_64>; +defm DS_MSKOR_RTN_B64 : DS_1A2D_RET_mc<"ds_mskor_rtn_b64", VGPROp_64>; +defm DS_CMPST_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b64", VGPROp_64>; +defm DS_CMPST_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f64", VGPROp_64>; +defm DS_MIN_RTN_F64 : DS_1A1D_RET_mc<"ds_min_rtn_f64", VGPROp_64>; +defm DS_MAX_RTN_F64 : DS_1A1D_RET_mc<"ds_max_rtn_f64", VGPROp_64>; + +defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VGPROp_64>; +defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VGPROp_128, VGPROp_64>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VGPROp_128, VGPROp_64>; let isConvergent = 1, usesCustomInserter = 1 in { def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> { @@ -657,19 +759,19 @@ def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; } // End SubtargetPredicate = HasDsSrc2Insts let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { -def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>; +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", AVLdSt_32, 0, Swizzle>; } let mayStore = 0 in { defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">; defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">; -defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>; +defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", AVLdSt_64>; -defm DS_READ2_B32 : 
DS_1A_Off8_RET_mc<"ds_read2_b32", VReg_64>; -defm DS_READ2ST64_B32: DS_1A_Off8_RET_mc<"ds_read2st64_b32", VReg_64>; +defm DS_READ2_B32 : DS_1A_Off8_RET_mc<"ds_read2_b32", AVLdSt_64>; +defm DS_READ2ST64_B32: DS_1A_Off8_RET_mc<"ds_read2st64_b32", AVLdSt_64>; -defm DS_READ2_B64 : DS_1A_Off8_RET_mc<"ds_read2_b64", VReg_128>; -defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", VReg_128>; +defm DS_READ2_B64 : DS_1A_Off8_RET_mc<"ds_read2_b64", AVLdSt_128>; +defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", AVLdSt_128>; let has_m0_read = 0 in { let SubtargetPredicate = HasD16LoadStore, TiedSourceNotRead = 1 in { @@ -704,21 +806,21 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; let SubtargetPredicate = isGFX7Plus in { -defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>; -defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>; +defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPROp_32>; +defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VGPROp_64>; let isConvergent = 1, usesCustomInserter = 1 in { def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">; } let mayStore = 0 in { -defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>; -defm DS_READ_B128: DS_1A_RET_mc<"ds_read_b128", VReg_128>; +defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", AVLdSt_96>; +defm DS_READ_B128: DS_1A_RET_mc<"ds_read_b128", AVLdSt_128>; } // End mayStore = 0 let mayLoad = 0 in { -defm DS_WRITE_B96 : DS_1A1D_NORET_mc<"ds_write_b96", VReg_96>; -defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>; +defm DS_WRITE_B96 : DS_1A1D_NORET_mc<"ds_write_b96", AVLdSt_96>; +defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", AVLdSt_128>; } // End mayLoad = 0 def DS_NOP : DS_VOID<"ds_nop">; @@ -732,10 +834,10 @@ def DS_NOP : DS_VOID<"ds_nop">; let SubtargetPredicate = isGFX8Plus in { let Uses = [EXEC] in { -def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32", - int_amdgcn_ds_permute>; 
-def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", - int_amdgcn_ds_bpermute>; +defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_permute_b32", + int_amdgcn_ds_permute>; +defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_bpermute_b32", + int_amdgcn_ds_bpermute>; } } // let SubtargetPredicate = isGFX8Plus @@ -751,8 +853,8 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; let SubtargetPredicate = isGFX11Only in { -def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>; -def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>; +def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VGPROp_64, VGPROp_32>; +def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VGPROp_64, VGPROp_32>; } // let SubtargetPredicate = isGFX11Only @@ -760,7 +862,7 @@ let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasImageInsts] in def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32", - VGPR_32, VReg_128> ; + VGPROp_32, VGPROp_128> ; } // let SubtargetPredicate = isGFX11Plus @@ -772,15 +874,15 @@ let SubtargetPredicate = isGFX12Plus in { let OtherPredicates = [HasImageInsts] in { def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK< - "ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>; + "ds_bvh_stack_push8_pop1_rtn_b32", VGPROp_32, VGPROp_256>; def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK< - "ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>; + "ds_bvh_stack_push8_pop2_rtn_b64", VGPROp_64, VGPROp_256>; } // End OtherPredicates = [HasImageInsts]. 
-defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">; -defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>; -defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">; -defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32>; +defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc_gfx9<"ds_cond_sub_u32">; +defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_cond_sub_rtn_u32", VGPROp_32>; +defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc_gfx9<"ds_sub_clamp_u32">; +defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_32>; def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32", int_amdgcn_ds_bpermute_fi_b32>; @@ -801,11 +903,11 @@ let SubtargetPredicate = isGFX1250Plus in { let WaveSizePredicate = isWave32, mayStore = 0 in { let OtherPredicates = [HasTransposeLoadF4F6Insts] in { -defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>; -defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>; +defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VGPROp_64>; +defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VGPROp_96>; } // End OtherPredicates = [HasTransposeLoadF4F6Insts] -defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>; -defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>; +defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VGPROp_64>; +defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VGPROp_128>; } // End WaveSizePredicate = isWave32, mayStore = 0 let OtherPredicates = [HasLdsBarrierArriveAtomic] in { @@ -818,7 +920,7 @@ def : GCNPat < (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPR_32:$ptr, Offset:$offset, (i1 0)) >; -defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VReg_64>; +defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VGPROp_64>; def : GCNPat< (i64 (int_amdgcn_ds_atomic_barrier_arrive_rtn_b64 (DS1Addr1Offset i32:$ptr, 
i32:$offset), i64:$data)), @@ -829,10 +931,10 @@ def : GCNPat< } // End SubtargetPredicate = isGFX1250Plus let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in { - defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>; - defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>; - defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>; - defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>; + defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", AVLdSt_64>; + defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", AVLdSt_64>; + defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", AVLdSt_64>; + defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96>; } //===----------------------------------------------------------------------===// @@ -984,6 +1086,7 @@ class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : G (inst $ptr, $offset0, $offset1, (i1 0)) >; +// TODO: Should this use AVLdSt_64 for the class? 
class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat< (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i32:$offset0, i32:$offset1)), (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)), @@ -1091,50 +1194,6 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; } // End AddedComplexity = 100 -multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { - let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>; - } - - let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt)>; - } - - let OtherPredicates = [HasGDS] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt), - /* complexity */ 0, /* gds */ 1>; - } -} - -multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, - ValueType vt, string frag> { - let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_m0_"#vt)>; - def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>; - } - - let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt)>; - def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>; - } - - let OtherPredicates = [HasGDS] in { - def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_region_m0_"#vt), - /* complexity */ 0, /* gds */ 1>; - def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_region_m0_noret_"#vt), - /* complexity */ 1, /* gds */ 1>; - } -} - - - let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { // Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode. 
class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, @@ -1212,8 +1271,8 @@ defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { -defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">; -defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">; +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">; +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">; } let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { @@ -1265,7 +1324,7 @@ class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag, } // End SubtargetPredicate = HasLdsAtomicAddF64 let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { -defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">; +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">; } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts let OtherPredicates = [HasGDS] in @@ -1357,8 +1416,10 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef, // GFX12. 
//===----------------------------------------------------------------------===// -multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME)> { - defvar ps = !cast<DS_Pseudo>(NAME); +multiclass DS_Real_gfx12<bits<8> op, + DS_Pseudo ps = !cast<DS_Pseudo>(NAME), + string name = !tolower(NAME)> { + let AssemblerPredicate = isGFX12Plus in { let DecoderNamespace = "GFX12" in def _gfx12 : @@ -1369,14 +1430,20 @@ multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME)> { } // End AssemblerPredicate } -defm DS_MIN_F32 : DS_Real_gfx12<0x012, "ds_min_num_f32">; -defm DS_MAX_F32 : DS_Real_gfx12<0x013, "ds_max_num_f32">; -defm DS_MIN_RTN_F32 : DS_Real_gfx12<0x032, "ds_min_num_rtn_f32">; -defm DS_MAX_RTN_F32 : DS_Real_gfx12<0x033, "ds_max_num_rtn_f32">; -defm DS_MIN_F64 : DS_Real_gfx12<0x052, "ds_min_num_f64">; -defm DS_MAX_F64 : DS_Real_gfx12<0x053, "ds_max_num_f64">; -defm DS_MIN_RTN_F64 : DS_Real_gfx12<0x072, "ds_min_num_rtn_f64">; -defm DS_MAX_RTN_F64 : DS_Real_gfx12<0x073, "ds_max_num_rtn_f64">; +// Helper to avoid repeating the pseudo-name if we only need to set +// the gfx12 name. 
+multiclass DS_Real_gfx12_with_name<bits<8> op, string name> { + defm "" : DS_Real_gfx12<op, !cast<DS_Pseudo>(NAME#"_gfx9"), name>; +} + +defm DS_MIN_F32 : DS_Real_gfx12_with_name<0x012, "ds_min_num_f32">; +defm DS_MAX_F32 : DS_Real_gfx12_with_name<0x013, "ds_max_num_f32">; +defm DS_MIN_RTN_F32 : DS_Real_gfx12_with_name<0x032, "ds_min_num_rtn_f32">; +defm DS_MAX_RTN_F32 : DS_Real_gfx12_with_name<0x033, "ds_max_num_rtn_f32">; +defm DS_MIN_F64 : DS_Real_gfx12_with_name<0x052, "ds_min_num_f64">; +defm DS_MAX_F64 : DS_Real_gfx12_with_name<0x053, "ds_max_num_f64">; +defm DS_MIN_RTN_F64 : DS_Real_gfx12_with_name<0x072, "ds_min_num_rtn_f64">; +defm DS_MAX_RTN_F64 : DS_Real_gfx12_with_name<0x073, "ds_max_num_rtn_f64">; defm DS_COND_SUB_U32 : DS_Real_gfx12<0x098>; defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>; defm DS_COND_SUB_RTN_U32 : DS_Real_gfx12<0x0a8>; @@ -1392,8 +1459,8 @@ defm DS_LOAD_TR6_B96 : DS_Real_gfx12<0x0fb>; defm DS_LOAD_TR16_B128 : DS_Real_gfx12<0x0fc>; defm DS_LOAD_TR8_B64 : DS_Real_gfx12<0x0fd>; -defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, - "ds_bvh_stack_push4_pop1_rtn_b32">; +defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, DS_BVH_STACK_RTN_B32, + "ds_bvh_stack_push4_pop1_rtn_b32">; defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>; defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>; @@ -1421,8 +1488,8 @@ def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250 // GFX11. 
//===----------------------------------------------------------------------===// -multiclass DS_Real_gfx11<bits<8> op, string name = !tolower(NAME)> { - defvar ps = !cast<DS_Pseudo>(NAME); +multiclass DS_Real_gfx11<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9"), + string name = !tolower(NAME)> { let AssemblerPredicate = isGFX11Only in { let DecoderNamespace = "GFX11" in def _gfx11 : @@ -1433,8 +1500,11 @@ multiclass DS_Real_gfx11<bits<8> op, string name = !tolower(NAME)> { } // End AssemblerPredicate } -multiclass DS_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)> - : DS_Real_gfx11<op, name>, DS_Real_gfx12<op, name>; +multiclass DS_Real_gfx11_gfx12<bits<8> op, + string name = !tolower(NAME), + DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")> + : DS_Real_gfx11<op, ps, name>, + DS_Real_gfx12<op, ps, name>; defm DS_WRITE_B32 : DS_Real_gfx11_gfx12<0x00d, "ds_store_b32">; defm DS_WRITE2_B32 : DS_Real_gfx11_gfx12<0x00e, "ds_store_2addr_b32">; @@ -1460,16 +1530,16 @@ defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx11_gfx12<0x06f, "ds_storexchg_2addr_str defm DS_READ_B64 : DS_Real_gfx11_gfx12<0x076, "ds_load_b64">; defm DS_READ2_B64 : DS_Real_gfx11_gfx12<0x077, "ds_load_2addr_b64">; defm DS_READ2ST64_B64 : DS_Real_gfx11_gfx12<0x078, "ds_load_2addr_stride64_b64">; -defm DS_WRITE_B8_D16_HI : DS_Real_gfx11_gfx12<0x0a0, "ds_store_b8_d16_hi">; -defm DS_WRITE_B16_D16_HI : DS_Real_gfx11_gfx12<0x0a1, "ds_store_b16_d16_hi">; -defm DS_READ_U8_D16 : DS_Real_gfx11_gfx12<0x0a2, "ds_load_u8_d16">; -defm DS_READ_U8_D16_HI : DS_Real_gfx11_gfx12<0x0a3, "ds_load_u8_d16_hi">; -defm DS_READ_I8_D16 : DS_Real_gfx11_gfx12<0x0a4, "ds_load_i8_d16">; -defm DS_READ_I8_D16_HI : DS_Real_gfx11_gfx12<0x0a5, "ds_load_i8_d16_hi">; -defm DS_READ_U16_D16 : DS_Real_gfx11_gfx12<0x0a6, "ds_load_u16_d16">; -defm DS_READ_U16_D16_HI : DS_Real_gfx11_gfx12<0x0a7, "ds_load_u16_d16_hi">; -defm DS_WRITE_ADDTID_B32 : DS_Real_gfx11_gfx12<0x0b0, "ds_store_addtid_b32">; -defm DS_READ_ADDTID_B32 : 
DS_Real_gfx11_gfx12<0x0b1, "ds_load_addtid_b32">; +defm DS_WRITE_B8_D16_HI : DS_Real_gfx11_gfx12<0x0a0, "ds_store_b8_d16_hi", DS_WRITE_B8_D16_HI>; +defm DS_WRITE_B16_D16_HI : DS_Real_gfx11_gfx12<0x0a1, "ds_store_b16_d16_hi", DS_WRITE_B16_D16_HI>; +defm DS_READ_U8_D16 : DS_Real_gfx11_gfx12<0x0a2, "ds_load_u8_d16", DS_READ_U8_D16>; +defm DS_READ_U8_D16_HI : DS_Real_gfx11_gfx12<0x0a3, "ds_load_u8_d16_hi", DS_READ_U8_D16_HI>; +defm DS_READ_I8_D16 : DS_Real_gfx11_gfx12<0x0a4, "ds_load_i8_d16", DS_READ_I8_D16>; +defm DS_READ_I8_D16_HI : DS_Real_gfx11_gfx12<0x0a5, "ds_load_i8_d16_hi", DS_READ_I8_D16_HI>; +defm DS_READ_U16_D16 : DS_Real_gfx11_gfx12<0x0a6, "ds_load_u16_d16", DS_READ_U16_D16>; +defm DS_READ_U16_D16_HI : DS_Real_gfx11_gfx12<0x0a7, "ds_load_u16_d16_hi", DS_READ_U16_D16_HI>; +defm DS_WRITE_ADDTID_B32 : DS_Real_gfx11_gfx12<0x0b0, "ds_store_addtid_b32", DS_WRITE_ADDTID_B32>; +defm DS_READ_ADDTID_B32 : DS_Real_gfx11_gfx12<0x0b1, "ds_load_addtid_b32", DS_READ_ADDTID_B32>; defm DS_WRITE_B96 : DS_Real_gfx11_gfx12<0x0de, "ds_store_b96">; defm DS_WRITE_B128 : DS_Real_gfx11_gfx12<0x0df, "ds_store_b128">; defm DS_READ_B96 : DS_Real_gfx11_gfx12<0x0fe, "ds_load_b96">; @@ -1489,22 +1559,22 @@ defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11_gfx12<0x070>; defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>; defm DS_ADD_RTN_F32 : DS_Real_gfx11_gfx12<0x079>; -defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>; -defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>; -defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>; +defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a, DS_ADD_GS_REG_RTN>; +defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b, DS_SUB_GS_REG_RTN>; +defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad, DS_BVH_STACK_RTN_B32>; //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { - multiclass DS_Real_gfx10<bits<8> op> { + multiclass DS_Real_gfx10<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME)> { def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, - !cast<DS_Pseudo>(NAME), SIEncodingFamily.GFX10>; + ps, SIEncodingFamily.GFX10>; } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>; +defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055, DS_ADD_RTN_F32_gfx9>; defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>; defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>; defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>; @@ -1520,39 +1590,48 @@ defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>; // GFX10, GFX11, GFX12. //===----------------------------------------------------------------------===// -multiclass DS_Real_gfx10_gfx11_gfx12<bits<8> op> : - DS_Real_gfx10<op>, DS_Real_gfx11<op>, DS_Real_gfx12<op>; +multiclass DS_Real_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")> : + DS_Real_gfx10<op, ps>, + DS_Real_gfx11<op, ps>, + DS_Real_gfx12<op, ps>; -multiclass DS_Real_gfx10_gfx11<bits<8> op> : - DS_Real_gfx10<op>, DS_Real_gfx11<op>; +multiclass DS_Real_gfx10_gfx11<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")> : + DS_Real_gfx10<op, ps>, DS_Real_gfx11<op, ps>; defm DS_ADD_F32 : DS_Real_gfx10_gfx11_gfx12<0x015>; defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>; -defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b2>; -defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b3>; +defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b2, DS_PERMUTE_B32>; +defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b3, DS_BPERMUTE_B32>; //===----------------------------------------------------------------------===// // GFX7, GFX10, GFX11, GFX12. 
//===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { - multiclass DS_Real_gfx7<bits<8> op> { + multiclass DS_Real_gfx7<bits<8> op, DS_Pseudo ps> { def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, - !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>; + ps, SIEncodingFamily.SI>; } } // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" -multiclass DS_Real_gfx7_gfx10_gfx11_gfx12<bits<8> op> : - DS_Real_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>; +multiclass DS_Real_gfx7_gfx10_gfx11_gfx12<bits<8> op, + DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME), + DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> : + DS_Real_gfx7<op, ps_gfx6>, + DS_Real_gfx10_gfx11_gfx12<op, ps_gfx9>; -multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op> : - DS_Real_gfx7<op>, DS_Real_gfx10_gfx11<op>; +multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op, + DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME), + DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> : + DS_Real_gfx7<op, ps_gfx6>, DS_Real_gfx10_gfx11<op, ps_gfx9>; -multiclass DS_Real_gfx7_gfx10<bits<8> op> : - DS_Real_gfx7<op>, DS_Real_gfx10<op>; +multiclass DS_Real_gfx7_gfx10<bits<8> op, + DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME), + DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> : + DS_Real_gfx7<op, ps_gfx6>, DS_Real_gfx10<op, ps_gfx9>; // FIXME-GFX7: Add tests when upstreaming this part. 
-defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018>; +defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018, DS_GWS_SEMA_RELEASE_ALL, DS_GWS_SEMA_RELEASE_ALL>; defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10_gfx11<0x034>; defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11_gfx12<0x07e>; defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>; @@ -1565,20 +1644,27 @@ defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>; //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { - multiclass DS_Real_gfx6_gfx7<bits<8> op> { + multiclass DS_Real_gfx6_gfx7<bits<8> op, DS_Pseudo ps> { def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, - !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>; + ps, SIEncodingFamily.SI>; } } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" -multiclass DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> : - DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>; +multiclass DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, + DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME), + DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> : + DS_Real_gfx6_gfx7<op, ps_gfx6>, + DS_Real_gfx10_gfx11_gfx12<op, ps_gfx9>; -multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> : - DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11<op>; +multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op, + DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME), + DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> : + DS_Real_gfx6_gfx7<op, ps_gfx6>, DS_Real_gfx10_gfx11<op, ps_gfx9>; -multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> : - DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>; +multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op, + DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME), + DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> : + DS_Real_gfx6_gfx7<op, ps_gfx6>, DS_Real_gfx10<op, ps_gfx9>; defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>; defm DS_SUB_U32 : 
DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x001>; @@ -1602,12 +1688,12 @@ defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>; defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x012>; defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x013>; -defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014>; -defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019>; -defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a>; -defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b>; -defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c>; -defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d>; +defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014, DS_NOP, DS_NOP>; +defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019, DS_GWS_INIT, DS_GWS_INIT>; +defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a, DS_GWS_SEMA_V, DS_GWS_SEMA_V>; +defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b, DS_GWS_SEMA_BR, DS_GWS_SEMA_BR>; +defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c, DS_GWS_SEMA_P, DS_GWS_SEMA_P>; +defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d, DS_GWS_BARRIER, DS_GWS_BARRIER>; defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>; defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>; @@ -1634,7 +1720,7 @@ defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>; defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x032>; defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x033>; -defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x035>; +defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x035, DS_SWIZZLE_B32, DS_SWIZZLE_B32>; defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>; defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>; @@ -1644,9 +1730,9 @@ defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>; defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>; defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>; -defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03d>; -defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03e>; -defm 
DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f>; +defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03d, DS_CONSUME, DS_CONSUME>; +defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03e, DS_APPEND, DS_APPEND>; +defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f, DS_ORDERED_COUNT, DS_ORDERED_COUNT>; defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x040>; defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x041>; defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x042>; @@ -1695,42 +1781,42 @@ defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x073>; defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>; defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>; defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>; -defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080>; -defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081>; -defm DS_RSUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x082>; -defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083>; -defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084>; -defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085>; -defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086>; -defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087>; -defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088>; -defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089>; -defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a>; -defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b>; -defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d>; -defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092>; -defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093>; -defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0>; -defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1>; -defm DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2>; -defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3>; -defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4>; -defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5>; -defm DS_MAX_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c6>; -defm 
DS_MIN_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c7>; -defm DS_MAX_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c8>; -defm DS_AND_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0c9>; -defm DS_OR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0ca>; -defm DS_XOR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cb>; -defm DS_WRITE_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cd>; -defm DS_MIN_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d2>; -defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>; +defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080, DS_ADD_SRC2_U32, DS_ADD_SRC2_U32>; +defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081, DS_SUB_SRC2_U32, DS_SUB_SRC2_U32>; +defm DS_RSUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x082, DS_RSUB_SRC2_U32, DS_RSUB_SRC2_U32>; +defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083, DS_INC_SRC2_U32, DS_INC_SRC2_U32>; +defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084, DS_DEC_SRC2_U32, DS_DEC_SRC2_U32>; +defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085, DS_MIN_SRC2_I32, DS_MIN_SRC2_I32>; +defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086, DS_MAX_SRC2_I32, DS_MAX_SRC2_I32>; +defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087, DS_MIN_SRC2_U32, DS_MIN_SRC2_U32>; +defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088, DS_MAX_SRC2_U32, DS_MAX_SRC2_U32>; +defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089, DS_AND_SRC2_B32, DS_AND_SRC2_B32>; +defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a, DS_OR_SRC2_B32, DS_OR_SRC2_B32>; +defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b, DS_XOR_SRC2_B32, DS_XOR_SRC2_B32>; +defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d, DS_WRITE_SRC2_B32, DS_WRITE_SRC2_B32>; +defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092, DS_MIN_SRC2_F32, DS_MIN_SRC2_F32>; +defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093, DS_MAX_SRC2_F32, DS_MAX_SRC2_F32>; +defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0, DS_ADD_SRC2_U64, DS_ADD_SRC2_U64>; +defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1, DS_SUB_SRC2_U64, DS_SUB_SRC2_U64>; +defm 
DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2, DS_RSUB_SRC2_U64, DS_RSUB_SRC2_U64>; +defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3, DS_INC_SRC2_U64, DS_INC_SRC2_U64>; +defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4, DS_DEC_SRC2_U64, DS_DEC_SRC2_U64>; +defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5, DS_MIN_SRC2_I64, DS_MIN_SRC2_I64>; +defm DS_MAX_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c6, DS_MAX_SRC2_I64, DS_MAX_SRC2_I64>; +defm DS_MIN_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c7, DS_MIN_SRC2_U64, DS_MIN_SRC2_U64>; +defm DS_MAX_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c8, DS_MAX_SRC2_U64, DS_MAX_SRC2_U64>; +defm DS_AND_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0c9, DS_AND_SRC2_B64, DS_AND_SRC2_B64>; +defm DS_OR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0ca, DS_OR_SRC2_B64, DS_OR_SRC2_B64>; +defm DS_XOR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cb, DS_XOR_SRC2_B64, DS_XOR_SRC2_B64>; +defm DS_WRITE_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cd, DS_WRITE_SRC2_B64, DS_WRITE_SRC2_B64>; +defm DS_MIN_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d2, DS_MIN_SRC2_F64, DS_MIN_SRC2_F64>; +defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3, DS_MAX_SRC2_F64, DS_MAX_SRC2_F64>; //===----------------------------------------------------------------------===// // GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===// -class DS_Real_vi <bits<8> op, DS_Pseudo ps> : +class DS_Real_Base_vi <bits<8> op, DS_Pseudo ps> : DS_Real <ps>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { let AssemblerPredicate = isGFX8GFX9; @@ -1749,181 +1835,210 @@ class DS_Real_vi <bits<8> op, DS_Pseudo ps> : let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0); } -def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>; -def DS_SUB_U32_vi : DS_Real_vi<0x1, DS_SUB_U32>; -def DS_RSUB_U32_vi : DS_Real_vi<0x2, DS_RSUB_U32>; -def DS_INC_U32_vi : DS_Real_vi<0x3, DS_INC_U32>; -def DS_DEC_U32_vi : DS_Real_vi<0x4, DS_DEC_U32>; -def DS_MIN_I32_vi : DS_Real_vi<0x5, DS_MIN_I32>; -def DS_MAX_I32_vi : DS_Real_vi<0x6, DS_MAX_I32>; -def DS_MIN_U32_vi : DS_Real_vi<0x7, DS_MIN_U32>; -def DS_MAX_U32_vi : DS_Real_vi<0x8, DS_MAX_U32>; -def DS_AND_B32_vi : DS_Real_vi<0x9, DS_AND_B32>; -def DS_OR_B32_vi : DS_Real_vi<0xa, DS_OR_B32>; -def DS_XOR_B32_vi : DS_Real_vi<0xb, DS_XOR_B32>; -def DS_MSKOR_B32_vi : DS_Real_vi<0xc, DS_MSKOR_B32>; -def DS_WRITE_B32_vi : DS_Real_vi<0xd, DS_WRITE_B32>; -def DS_WRITE2_B32_vi : DS_Real_vi<0xe, DS_WRITE2_B32>; -def DS_WRITE2ST64_B32_vi : DS_Real_vi<0xf, DS_WRITE2ST64_B32>; -def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>; -def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>; -def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>; -def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>; -def DS_NOP_vi : DS_Real_vi<0x14, DS_NOP>; -def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>; -def DS_GWS_INIT_vi : DS_Real_vi<0x99, DS_GWS_INIT>; -def DS_GWS_SEMA_V_vi : DS_Real_vi<0x9a, DS_GWS_SEMA_V>; -def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>; -def DS_GWS_SEMA_P_vi : DS_Real_vi<0x9c, DS_GWS_SEMA_P>; -def DS_GWS_BARRIER_vi : DS_Real_vi<0x9d, DS_GWS_BARRIER>; -def DS_WRITE_ADDTID_B32_vi : DS_Real_vi<0x1d, DS_WRITE_ADDTID_B32>; -def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>; -def DS_WRITE_B16_vi : DS_Real_vi<0x1f, 
DS_WRITE_B16>; -def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>; -def DS_SUB_RTN_U32_vi : DS_Real_vi<0x21, DS_SUB_RTN_U32>; -def DS_RSUB_RTN_U32_vi : DS_Real_vi<0x22, DS_RSUB_RTN_U32>; -def DS_INC_RTN_U32_vi : DS_Real_vi<0x23, DS_INC_RTN_U32>; -def DS_DEC_RTN_U32_vi : DS_Real_vi<0x24, DS_DEC_RTN_U32>; -def DS_MIN_RTN_I32_vi : DS_Real_vi<0x25, DS_MIN_RTN_I32>; -def DS_MAX_RTN_I32_vi : DS_Real_vi<0x26, DS_MAX_RTN_I32>; -def DS_MIN_RTN_U32_vi : DS_Real_vi<0x27, DS_MIN_RTN_U32>; -def DS_MAX_RTN_U32_vi : DS_Real_vi<0x28, DS_MAX_RTN_U32>; -def DS_AND_RTN_B32_vi : DS_Real_vi<0x29, DS_AND_RTN_B32>; -def DS_OR_RTN_B32_vi : DS_Real_vi<0x2a, DS_OR_RTN_B32>; -def DS_XOR_RTN_B32_vi : DS_Real_vi<0x2b, DS_XOR_RTN_B32>; -def DS_MSKOR_RTN_B32_vi : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>; -def DS_WRXCHG_RTN_B32_vi : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>; -def DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>; -def DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>; -def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>; -def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>; -def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>; -def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>; -def DS_WRAP_RTN_B32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_B32>; -def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>; -def DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>; -def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>; -def DS_READ2ST64_B32_vi : DS_Real_vi<0x38, DS_READ2ST64_B32>; -def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>; -def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>; -def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>; -def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>; -def DS_READ_ADDTID_B32_vi : DS_Real_vi<0xb6, DS_READ_ADDTID_B32>; -def DS_CONSUME_vi : DS_Real_vi<0xbd, DS_CONSUME>; -def DS_APPEND_vi : DS_Real_vi<0xbe, DS_APPEND>; -def DS_ORDERED_COUNT_vi : DS_Real_vi<0xbf, DS_ORDERED_COUNT>; -def DS_SWIZZLE_B32_vi : 
DS_Real_vi<0x3d, DS_SWIZZLE_B32>; -def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>; -def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>; - -def DS_ADD_U64_vi : DS_Real_vi<0x40, DS_ADD_U64>; -def DS_SUB_U64_vi : DS_Real_vi<0x41, DS_SUB_U64>; -def DS_RSUB_U64_vi : DS_Real_vi<0x42, DS_RSUB_U64>; -def DS_INC_U64_vi : DS_Real_vi<0x43, DS_INC_U64>; -def DS_DEC_U64_vi : DS_Real_vi<0x44, DS_DEC_U64>; -def DS_MIN_I64_vi : DS_Real_vi<0x45, DS_MIN_I64>; -def DS_MAX_I64_vi : DS_Real_vi<0x46, DS_MAX_I64>; -def DS_MIN_U64_vi : DS_Real_vi<0x47, DS_MIN_U64>; -def DS_MAX_U64_vi : DS_Real_vi<0x48, DS_MAX_U64>; -def DS_AND_B64_vi : DS_Real_vi<0x49, DS_AND_B64>; -def DS_OR_B64_vi : DS_Real_vi<0x4a, DS_OR_B64>; -def DS_XOR_B64_vi : DS_Real_vi<0x4b, DS_XOR_B64>; -def DS_MSKOR_B64_vi : DS_Real_vi<0x4c, DS_MSKOR_B64>; -def DS_WRITE_B64_vi : DS_Real_vi<0x4d, DS_WRITE_B64>; -def DS_WRITE2_B64_vi : DS_Real_vi<0x4E, DS_WRITE2_B64>; -def DS_WRITE2ST64_B64_vi : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>; -def DS_CMPST_B64_vi : DS_Real_vi<0x50, DS_CMPST_B64>; -def DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>; -def DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>; -def DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>; - -def DS_WRITE_B8_D16_HI_vi : DS_Real_vi<0x54, DS_WRITE_B8_D16_HI>; -def DS_WRITE_B16_D16_HI_vi : DS_Real_vi<0x55, DS_WRITE_B16_D16_HI>; - -def DS_READ_U8_D16_vi : DS_Real_vi<0x56, DS_READ_U8_D16>; -def DS_READ_U8_D16_HI_vi : DS_Real_vi<0x57, DS_READ_U8_D16_HI>; -def DS_READ_I8_D16_vi : DS_Real_vi<0x58, DS_READ_I8_D16>; -def DS_READ_I8_D16_HI_vi : DS_Real_vi<0x59, DS_READ_I8_D16_HI>; -def DS_READ_U16_D16_vi : DS_Real_vi<0x5a, DS_READ_U16_D16>; -def DS_READ_U16_D16_HI_vi : DS_Real_vi<0x5b, DS_READ_U16_D16_HI>; - -def DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>; -def DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>; -def DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>; -def DS_INC_RTN_U64_vi : DS_Real_vi<0x63, DS_INC_RTN_U64>; -def DS_DEC_RTN_U64_vi : 
DS_Real_vi<0x64, DS_DEC_RTN_U64>; -def DS_MIN_RTN_I64_vi : DS_Real_vi<0x65, DS_MIN_RTN_I64>; -def DS_MAX_RTN_I64_vi : DS_Real_vi<0x66, DS_MAX_RTN_I64>; -def DS_MIN_RTN_U64_vi : DS_Real_vi<0x67, DS_MIN_RTN_U64>; -def DS_MAX_RTN_U64_vi : DS_Real_vi<0x68, DS_MAX_RTN_U64>; -def DS_AND_RTN_B64_vi : DS_Real_vi<0x69, DS_AND_RTN_B64>; -def DS_OR_RTN_B64_vi : DS_Real_vi<0x6a, DS_OR_RTN_B64>; -def DS_XOR_RTN_B64_vi : DS_Real_vi<0x6b, DS_XOR_RTN_B64>; -def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>; -def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>; -def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>; -def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>; -def DS_CONDXCHG32_RTN_B64_vi : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>; -def DS_GWS_SEMA_RELEASE_ALL_vi : DS_Real_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>; -def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>; -def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>; -def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>; -def DS_MAX_RTN_F64_vi : DS_Real_vi<0x73, DS_MAX_RTN_F64>; - -def DS_READ_B64_vi : DS_Real_vi<0x76, DS_READ_B64>; -def DS_READ2_B64_vi : DS_Real_vi<0x77, DS_READ2_B64>; -def DS_READ2ST64_B64_vi : DS_Real_vi<0x78, DS_READ2ST64_B64>; - -def DS_ADD_SRC2_U32_vi : DS_Real_vi<0x80, DS_ADD_SRC2_U32>; -def DS_SUB_SRC2_U32_vi : DS_Real_vi<0x81, DS_SUB_SRC2_U32>; -def DS_RSUB_SRC2_U32_vi : DS_Real_vi<0x82, DS_RSUB_SRC2_U32>; -def DS_INC_SRC2_U32_vi : DS_Real_vi<0x83, DS_INC_SRC2_U32>; -def DS_DEC_SRC2_U32_vi : DS_Real_vi<0x84, DS_DEC_SRC2_U32>; -def DS_MIN_SRC2_I32_vi : DS_Real_vi<0x85, DS_MIN_SRC2_I32>; -def DS_MAX_SRC2_I32_vi : DS_Real_vi<0x86, DS_MAX_SRC2_I32>; -def DS_MIN_SRC2_U32_vi : DS_Real_vi<0x87, DS_MIN_SRC2_U32>; -def DS_MAX_SRC2_U32_vi : DS_Real_vi<0x88, DS_MAX_SRC2_U32>; -def DS_AND_SRC2_B32_vi : DS_Real_vi<0x89, DS_AND_SRC2_B32>; -def DS_OR_SRC2_B32_vi : DS_Real_vi<0x8a, DS_OR_SRC2_B32>; -def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, 
DS_XOR_SRC2_B32>; -def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>; -def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>; -def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>; -def DS_ADD_SRC2_F32_vi : DS_Real_vi<0x95, DS_ADD_SRC2_F32>; -def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>; -def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>; -def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>; -def DS_INC_SRC2_U64_vi : DS_Real_vi<0xc3, DS_INC_SRC2_U64>; -def DS_DEC_SRC2_U64_vi : DS_Real_vi<0xc4, DS_DEC_SRC2_U64>; -def DS_MIN_SRC2_I64_vi : DS_Real_vi<0xc5, DS_MIN_SRC2_I64>; -def DS_MAX_SRC2_I64_vi : DS_Real_vi<0xc6, DS_MAX_SRC2_I64>; -def DS_MIN_SRC2_U64_vi : DS_Real_vi<0xc7, DS_MIN_SRC2_U64>; -def DS_MAX_SRC2_U64_vi : DS_Real_vi<0xc8, DS_MAX_SRC2_U64>; -def DS_AND_SRC2_B64_vi : DS_Real_vi<0xc9, DS_AND_SRC2_B64>; -def DS_OR_SRC2_B64_vi : DS_Real_vi<0xca, DS_OR_SRC2_B64>; -def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>; -def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>; -def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>; -def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>; -def DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>; -def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>; -def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>; -def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; + +multiclass DS_Real_vi <bits<8> op, DS_Pseudo base_pseudo, bit need_gfx9_suffix = true> { + def "" : DS_Real_Base_vi<op, base_pseudo>; + + if need_gfx9_suffix then { + def _gfx9 : DS_Real_Base_vi<op, !cast<DS_Pseudo>(!cast<string>(base_pseudo)#"_gfx9")> { + let DecoderNamespace = "GFX9"; + } + } + + // Handle cases that are available in all-AGPR or all-VGPR data + // operand forms. This should be used for all DS instructions with 2 + // data operands. 
+ defvar agpr_suffixed_name = !cast<string>(base_pseudo)#"_agpr"; + + if !exists<DS_Pseudo>(agpr_suffixed_name) then { + def _agpr : DS_Real_Base_vi<op, !cast<DS_Pseudo>(agpr_suffixed_name)> { + let DecoderNamespace = "GFX9"; + let AssemblerPredicate = isGFX90APlus; + } + } +} + +// Instructions which use m0 or not for both gfx8 and gfx9 (or did not +// exist on gfx8) +multiclass DS_Real_m0_vi<bits<8> op, DS_Pseudo ps> : DS_Real_vi<op, ps, false>; + +defm DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>; +defm DS_SUB_U32_vi : DS_Real_vi<0x1, DS_SUB_U32>; +defm DS_RSUB_U32_vi : DS_Real_vi<0x2, DS_RSUB_U32>; +defm DS_INC_U32_vi : DS_Real_vi<0x3, DS_INC_U32>; +defm DS_DEC_U32_vi : DS_Real_vi<0x4, DS_DEC_U32>; +defm DS_MIN_I32_vi : DS_Real_vi<0x5, DS_MIN_I32>; +defm DS_MAX_I32_vi : DS_Real_vi<0x6, DS_MAX_I32>; +defm DS_MIN_U32_vi : DS_Real_vi<0x7, DS_MIN_U32>; +defm DS_MAX_U32_vi : DS_Real_vi<0x8, DS_MAX_U32>; +defm DS_AND_B32_vi : DS_Real_vi<0x9, DS_AND_B32>; +defm DS_OR_B32_vi : DS_Real_vi<0xa, DS_OR_B32>; +defm DS_XOR_B32_vi : DS_Real_vi<0xb, DS_XOR_B32>; +defm DS_MSKOR_B32_vi : DS_Real_vi<0xc, DS_MSKOR_B32>; +defm DS_WRITE_B32_vi : DS_Real_vi<0xd, DS_WRITE_B32>; +defm DS_WRITE2_B32_vi : DS_Real_vi<0xe, DS_WRITE2_B32>; +defm DS_WRITE2ST64_B32_vi : DS_Real_vi<0xf, DS_WRITE2ST64_B32>; + +defm DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>; +defm DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>; +defm DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>; +defm DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>; +defm DS_NOP_vi : DS_Real_m0_vi<0x14, DS_NOP>; +defm DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>; +defm DS_GWS_INIT_vi : DS_Real_m0_vi<0x99, DS_GWS_INIT>; +defm DS_GWS_SEMA_V_vi : DS_Real_m0_vi<0x9a, DS_GWS_SEMA_V>; +defm DS_GWS_SEMA_BR_vi : DS_Real_m0_vi<0x9b, DS_GWS_SEMA_BR>; +defm DS_GWS_SEMA_P_vi : DS_Real_m0_vi<0x9c, DS_GWS_SEMA_P>; +defm DS_GWS_BARRIER_vi : DS_Real_m0_vi<0x9d, DS_GWS_BARRIER>; +defm DS_WRITE_ADDTID_B32_vi: DS_Real_m0_vi<0x1d, DS_WRITE_ADDTID_B32>; +defm 
DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>; +defm DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>; +defm DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>; +defm DS_SUB_RTN_U32_vi : DS_Real_vi<0x21, DS_SUB_RTN_U32>; +defm DS_RSUB_RTN_U32_vi : DS_Real_vi<0x22, DS_RSUB_RTN_U32>; +defm DS_INC_RTN_U32_vi : DS_Real_vi<0x23, DS_INC_RTN_U32>; +defm DS_DEC_RTN_U32_vi : DS_Real_vi<0x24, DS_DEC_RTN_U32>; +defm DS_MIN_RTN_I32_vi : DS_Real_vi<0x25, DS_MIN_RTN_I32>; +defm DS_MAX_RTN_I32_vi : DS_Real_vi<0x26, DS_MAX_RTN_I32>; +defm DS_MIN_RTN_U32_vi : DS_Real_vi<0x27, DS_MIN_RTN_U32>; +defm DS_MAX_RTN_U32_vi : DS_Real_vi<0x28, DS_MAX_RTN_U32>; +defm DS_AND_RTN_B32_vi : DS_Real_vi<0x29, DS_AND_RTN_B32>; +defm DS_OR_RTN_B32_vi : DS_Real_vi<0x2a, DS_OR_RTN_B32>; +defm DS_XOR_RTN_B32_vi : DS_Real_vi<0x2b, DS_XOR_RTN_B32>; +defm DS_MSKOR_RTN_B32_vi : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>; +defm DS_WRXCHG_RTN_B32_vi : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>; +defm DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>; +defm DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>; +defm DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>; +defm DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>; +defm DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>; +defm DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>; +defm DS_WRAP_RTN_B32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_B32>; +defm DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>; +defm DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>; +defm DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>; +defm DS_READ2ST64_B32_vi : DS_Real_vi<0x38, DS_READ2ST64_B32>; +defm DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>; +defm DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>; +defm DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>; +defm DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>; +defm DS_READ_ADDTID_B32_vi : DS_Real_m0_vi<0xb6, DS_READ_ADDTID_B32>; +defm DS_CONSUME_vi : DS_Real_m0_vi<0xbd, DS_CONSUME>; +defm DS_APPEND_vi 
: DS_Real_m0_vi<0xbe, DS_APPEND>; +defm DS_ORDERED_COUNT_vi : DS_Real_m0_vi<0xbf, DS_ORDERED_COUNT>; +defm DS_SWIZZLE_B32_vi : DS_Real_m0_vi<0x3d, DS_SWIZZLE_B32>; +defm DS_PERMUTE_B32_vi : DS_Real_m0_vi<0x3e, DS_PERMUTE_B32>; +defm DS_BPERMUTE_B32_vi : DS_Real_m0_vi<0x3f, DS_BPERMUTE_B32>; + +defm DS_ADD_U64_vi : DS_Real_vi<0x40, DS_ADD_U64>; +defm DS_SUB_U64_vi : DS_Real_vi<0x41, DS_SUB_U64>; +defm DS_RSUB_U64_vi : DS_Real_vi<0x42, DS_RSUB_U64>; +defm DS_INC_U64_vi : DS_Real_vi<0x43, DS_INC_U64>; +defm DS_DEC_U64_vi : DS_Real_vi<0x44, DS_DEC_U64>; +defm DS_MIN_I64_vi : DS_Real_vi<0x45, DS_MIN_I64>; +defm DS_MAX_I64_vi : DS_Real_vi<0x46, DS_MAX_I64>; +defm DS_MIN_U64_vi : DS_Real_vi<0x47, DS_MIN_U64>; +defm DS_MAX_U64_vi : DS_Real_vi<0x48, DS_MAX_U64>; +defm DS_AND_B64_vi : DS_Real_vi<0x49, DS_AND_B64>; +defm DS_OR_B64_vi : DS_Real_vi<0x4a, DS_OR_B64>; +defm DS_XOR_B64_vi : DS_Real_vi<0x4b, DS_XOR_B64>; +defm DS_MSKOR_B64_vi : DS_Real_vi<0x4c, DS_MSKOR_B64>; +defm DS_WRITE_B64_vi : DS_Real_vi<0x4d, DS_WRITE_B64>; +defm DS_WRITE2_B64_vi : DS_Real_vi<0x4E, DS_WRITE2_B64>; +defm DS_WRITE2ST64_B64_vi : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>; + +defm DS_CMPST_B64_vi : DS_Real_vi<0x50, DS_CMPST_B64>; +defm DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>; +defm DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>; +defm DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>; + +defm DS_WRITE_B8_D16_HI_vi : DS_Real_m0_vi<0x54, DS_WRITE_B8_D16_HI>; +defm DS_WRITE_B16_D16_HI_vi: DS_Real_m0_vi<0x55, DS_WRITE_B16_D16_HI>; + +defm DS_READ_U8_D16_vi : DS_Real_m0_vi<0x56, DS_READ_U8_D16>; +defm DS_READ_U8_D16_HI_vi : DS_Real_m0_vi<0x57, DS_READ_U8_D16_HI>; +defm DS_READ_I8_D16_vi : DS_Real_m0_vi<0x58, DS_READ_I8_D16>; +defm DS_READ_I8_D16_HI_vi : DS_Real_m0_vi<0x59, DS_READ_I8_D16_HI>; +defm DS_READ_U16_D16_vi : DS_Real_m0_vi<0x5a, DS_READ_U16_D16>; +defm DS_READ_U16_D16_HI_vi: DS_Real_m0_vi<0x5b, DS_READ_U16_D16_HI>; + +defm DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>; +defm 
DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>; +defm DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>; +defm DS_INC_RTN_U64_vi : DS_Real_vi<0x63, DS_INC_RTN_U64>; +defm DS_DEC_RTN_U64_vi : DS_Real_vi<0x64, DS_DEC_RTN_U64>; +defm DS_MIN_RTN_I64_vi : DS_Real_vi<0x65, DS_MIN_RTN_I64>; +defm DS_MAX_RTN_I64_vi : DS_Real_vi<0x66, DS_MAX_RTN_I64>; +defm DS_MIN_RTN_U64_vi : DS_Real_vi<0x67, DS_MIN_RTN_U64>; +defm DS_MAX_RTN_U64_vi : DS_Real_vi<0x68, DS_MAX_RTN_U64>; +defm DS_AND_RTN_B64_vi : DS_Real_vi<0x69, DS_AND_RTN_B64>; +defm DS_OR_RTN_B64_vi : DS_Real_vi<0x6a, DS_OR_RTN_B64>; +defm DS_XOR_RTN_B64_vi : DS_Real_vi<0x6b, DS_XOR_RTN_B64>; +defm DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>; +defm DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>; +defm DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>; +defm DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>; +defm DS_CONDXCHG32_RTN_B64_vi : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>; +defm DS_GWS_SEMA_RELEASE_ALL_vi: DS_Real_m0_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>; +defm DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>; +defm DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>; +defm DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>; +defm DS_MAX_RTN_F64_vi : DS_Real_vi<0x73, DS_MAX_RTN_F64>; + +defm DS_READ_B64_vi : DS_Real_vi<0x76, DS_READ_B64>; +defm DS_READ2_B64_vi : DS_Real_vi<0x77, DS_READ2_B64>; +defm DS_READ2ST64_B64_vi : DS_Real_vi<0x78, DS_READ2ST64_B64>; + +defm DS_ADD_SRC2_U32_vi : DS_Real_m0_vi<0x80, DS_ADD_SRC2_U32>; +defm DS_SUB_SRC2_U32_vi : DS_Real_m0_vi<0x81, DS_SUB_SRC2_U32>; +defm DS_RSUB_SRC2_U32_vi : DS_Real_m0_vi<0x82, DS_RSUB_SRC2_U32>; +defm DS_INC_SRC2_U32_vi : DS_Real_m0_vi<0x83, DS_INC_SRC2_U32>; +defm DS_DEC_SRC2_U32_vi : DS_Real_m0_vi<0x84, DS_DEC_SRC2_U32>; +defm DS_MIN_SRC2_I32_vi : DS_Real_m0_vi<0x85, DS_MIN_SRC2_I32>; +defm DS_MAX_SRC2_I32_vi : DS_Real_m0_vi<0x86, DS_MAX_SRC2_I32>; +defm DS_MIN_SRC2_U32_vi : 
DS_Real_m0_vi<0x87, DS_MIN_SRC2_U32>; +defm DS_MAX_SRC2_U32_vi : DS_Real_m0_vi<0x88, DS_MAX_SRC2_U32>; +defm DS_AND_SRC2_B32_vi : DS_Real_m0_vi<0x89, DS_AND_SRC2_B32>; +defm DS_OR_SRC2_B32_vi : DS_Real_m0_vi<0x8a, DS_OR_SRC2_B32>; +defm DS_XOR_SRC2_B32_vi : DS_Real_m0_vi<0x8b, DS_XOR_SRC2_B32>; +defm DS_WRITE_SRC2_B32_vi : DS_Real_m0_vi<0x8d, DS_WRITE_SRC2_B32>; +defm DS_MIN_SRC2_F32_vi : DS_Real_m0_vi<0x92, DS_MIN_SRC2_F32>; +defm DS_MAX_SRC2_F32_vi : DS_Real_m0_vi<0x93, DS_MAX_SRC2_F32>; +defm DS_ADD_SRC2_F32_vi : DS_Real_m0_vi<0x95, DS_ADD_SRC2_F32>; +defm DS_ADD_SRC2_U64_vi : DS_Real_m0_vi<0xc0, DS_ADD_SRC2_U64>; +defm DS_SUB_SRC2_U64_vi : DS_Real_m0_vi<0xc1, DS_SUB_SRC2_U64>; +defm DS_RSUB_SRC2_U64_vi : DS_Real_m0_vi<0xc2, DS_RSUB_SRC2_U64>; +defm DS_INC_SRC2_U64_vi : DS_Real_m0_vi<0xc3, DS_INC_SRC2_U64>; +defm DS_DEC_SRC2_U64_vi : DS_Real_m0_vi<0xc4, DS_DEC_SRC2_U64>; +defm DS_MIN_SRC2_I64_vi : DS_Real_m0_vi<0xc5, DS_MIN_SRC2_I64>; +defm DS_MAX_SRC2_I64_vi : DS_Real_m0_vi<0xc6, DS_MAX_SRC2_I64>; +defm DS_MIN_SRC2_U64_vi : DS_Real_m0_vi<0xc7, DS_MIN_SRC2_U64>; +defm DS_MAX_SRC2_U64_vi : DS_Real_m0_vi<0xc8, DS_MAX_SRC2_U64>; +defm DS_AND_SRC2_B64_vi : DS_Real_m0_vi<0xc9, DS_AND_SRC2_B64>; +defm DS_OR_SRC2_B64_vi : DS_Real_m0_vi<0xca, DS_OR_SRC2_B64>; +defm DS_XOR_SRC2_B64_vi : DS_Real_m0_vi<0xcb, DS_XOR_SRC2_B64>; +defm DS_WRITE_SRC2_B64_vi : DS_Real_m0_vi<0xcd, DS_WRITE_SRC2_B64>; +defm DS_MIN_SRC2_F64_vi : DS_Real_m0_vi<0xd2, DS_MIN_SRC2_F64>; +defm DS_MAX_SRC2_F64_vi : DS_Real_m0_vi<0xd3, DS_MAX_SRC2_F64>; +defm DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>; +defm DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>; +defm DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>; +defm DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; // GFX90A+. 
-def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; -def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; +defm DS_ADD_F64_vi : DS_Real_m0_vi<0x5c, DS_ADD_F64>; +defm DS_ADD_RTN_F64_vi: DS_Real_m0_vi<0x7c, DS_ADD_RTN_F64>; // GFX942+. -def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>; -def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>; -def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>; -def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>; +defm DS_PK_ADD_F16_vi : DS_Real_m0_vi<0x17, DS_PK_ADD_F16>; +defm DS_PK_ADD_RTN_F16_vi : DS_Real_m0_vi<0xb7, DS_PK_ADD_RTN_F16>; +defm DS_PK_ADD_BF16_vi : DS_Real_m0_vi<0x18, DS_PK_ADD_BF16>; +defm DS_PK_ADD_RTN_BF16_vi: DS_Real_m0_vi<0xb8, DS_PK_ADD_RTN_BF16>; //===----------------------------------------------------------------------===// // GFX950. //===----------------------------------------------------------------------===// -def DS_READ_B64_TR_B4_vi : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>; -def DS_READ_B96_TR_B6_vi : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>; -def DS_READ_B64_TR_B8_vi : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>; -def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>; +defm DS_READ_B64_TR_B4_vi : DS_Real_m0_vi<0x0e0, DS_READ_B64_TR_B4>; +defm DS_READ_B96_TR_B6_vi : DS_Real_m0_vi<0x0e1, DS_READ_B96_TR_B6>; +defm DS_READ_B64_TR_B8_vi : DS_Real_m0_vi<0x0e2, DS_READ_B64_TR_B8>; +defm DS_READ_B64_TR_B16_vi: DS_Real_m0_vi<0x0e3, DS_READ_B64_TR_B16>; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 6a2beeed41df..6f6039bf4ec2 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -38,6 +38,7 @@ #include "llvm/Support/Compiler.h" using namespace llvm; +using namespace llvm::MCD; #define DEBUG_TYPE "amdgpu-disassembler" @@ -446,6 +447,14 @@ static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned 
Imm, #include "AMDGPUGenDisassemblerTables.inc" +namespace { +// Define bitwidths for various types used to instantiate the decoder. +template <> constexpr uint32_t InsnBitWidth<uint32_t> = 32; +template <> constexpr uint32_t InsnBitWidth<uint64_t> = 64; +template <> constexpr uint32_t InsnBitWidth<std::bitset<96>> = 96; +template <> constexpr uint32_t InsnBitWidth<std::bitset<128>> = 128; +} // namespace + //===----------------------------------------------------------------------===// // //===----------------------------------------------------------------------===// @@ -498,26 +507,24 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) { return Res; } -static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) { +static inline std::bitset<96> eat12Bytes(ArrayRef<uint8_t> &Bytes) { + using namespace llvm::support::endian; assert(Bytes.size() >= 12); - uint64_t Lo = - support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data()); + std::bitset<96> Lo(read<uint64_t, endianness::little>(Bytes.data())); Bytes = Bytes.slice(8); - uint64_t Hi = - support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data()); + std::bitset<96> Hi(read<uint32_t, endianness::little>(Bytes.data())); Bytes = Bytes.slice(4); - return DecoderUInt128(Lo, Hi); + return (Hi << 64) | Lo; } -static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) { +static inline std::bitset<128> eat16Bytes(ArrayRef<uint8_t> &Bytes) { + using namespace llvm::support::endian; assert(Bytes.size() >= 16); - uint64_t Lo = - support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data()); + std::bitset<128> Lo(read<uint64_t, endianness::little>(Bytes.data())); Bytes = Bytes.slice(8); - uint64_t Hi = - support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data()); + std::bitset<128> Hi(read<uint64_t, endianness::little>(Bytes.data())); Bytes = Bytes.slice(8); - return DecoderUInt128(Lo, Hi); + return (Hi << 64) | Lo; } void 
AMDGPUDisassembler::decodeImmOperands(MCInst &MI, @@ -600,14 +607,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 // encodings if (isGFX1250() && Bytes.size() >= 16) { - DecoderUInt128 DecW = eat16Bytes(Bytes); + std::bitset<128> DecW = eat16Bytes(Bytes); if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS)) break; Bytes = Bytes_.slice(0, MaxInstBytesNum); } if (isGFX11Plus() && Bytes.size() >= 12) { - DecoderUInt128 DecW = eat12Bytes(Bytes); + std::bitset<96> DecW = eat12Bytes(Bytes); if (isGFX11() && tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI, @@ -642,7 +649,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } else if (Bytes.size() >= 16 && STI.hasFeature(AMDGPU::FeatureGFX950Insts)) { - DecoderUInt128 DecW = eat16Bytes(Bytes); + std::bitset<128> DecW = eat16Bytes(Bytes); if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS)) break; @@ -836,6 +843,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } + // Validate buffer instruction offsets for GFX12+ - must not be a negative. + if (isGFX12Plus() && isBufferInstruction(MI)) { + int OffsetIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::offset); + if (OffsetIdx != -1) { + uint32_t Imm = MI.getOperand(OffsetIdx).getImm(); + int64_t SignedOffset = SignExtend64<24>(Imm); + if (SignedOffset < 0) + return MCDisassembler::Fail; + } + } + if (MCII->get(MI.getOpcode()).TSFlags & (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) { int SWZOpIdx = @@ -1216,6 +1235,26 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { } } +// Given a wide tuple \p Reg check if it will overflow 256 registers. +// \returns \p Reg on success or NoRegister otherwise. 
+static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, + const MCRegisterInfo &MRI) { + unsigned NumRegs = RC.getSizeInBits() / 32; + MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0); + if (!Sub0) + return Reg; + + MCRegister BaseReg; + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(Sub0)) + BaseReg = AMDGPU::VGPR0; + else if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Sub0)) + BaseReg = AMDGPU::AGPR0; + + assert(BaseReg && "Only vector registers expected"); + + return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister; +} + // Note that before gfx10, the MIMG encoding provided no information about // VADDR size. Consequently, decoded instructions always show address as if it // has 1 dword, which could be not really so. @@ -1320,8 +1359,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { MCRegister VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0; - NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, - &MRI.getRegClass(DataRCID)); + const MCRegisterClass &NewRC = MRI.getRegClass(DataRCID); + NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, &NewRC); + NewVdata = CheckVGPROverflow(NewVdata, NewRC, MRI); if (!NewVdata) { // It's possible to encode this such that the low register + enabled // components exceeds the register count. @@ -1340,8 +1380,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { VAddrSA = VAddrSubSA ? 
VAddrSubSA : VAddrSA; auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass; - NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, - &MRI.getRegClass(AddrRCID)); + const MCRegisterClass &NewRC = MRI.getRegClass(AddrRCID); + NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &NewRC); + NewVAddrSA = CheckVGPROverflow(NewVAddrSA, NewRC, MRI); if (!NewVAddrSA) return; } @@ -2598,9 +2639,6 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); - if (isGFX1250()) - PRINT_DIRECTIVE(".amdhsa_uses_cu_stores", - KERNEL_CODE_PROPERTY_USES_CU_STORES); if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0, @@ -2743,6 +2781,20 @@ const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id, return MCSymbolRefExpr::create(Sym, Ctx); } +bool AMDGPUDisassembler::isBufferInstruction(const MCInst &MI) const { + const uint64_t TSFlags = MCII->get(MI.getOpcode()).TSFlags; + + // Check for MUBUF and MTBUF instructions + if (TSFlags & (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) + return true; + + // Check for SMEM buffer instructions (S_BUFFER_* instructions) + if ((TSFlags & SIInstrFlags::SMRD) && AMDGPU::getSMEMIsBuffer(MI.getOpcode())) + return true; + + return false; +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index f4d164bf10c3..c1131c2936fc 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -32,44 +32,6 @@ class MCOperand; class MCSubtargetInfo; 
class Twine; -// Exposes an interface expected by autogenerated code in -// FixedLenDecoderEmitter -class DecoderUInt128 { -private: - uint64_t Lo = 0; - uint64_t Hi = 0; - -public: - DecoderUInt128() = default; - DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {} - operator bool() const { return Lo || Hi; } - uint64_t extractBitsAsZExtValue(unsigned NumBits, - unsigned BitPosition) const { - assert(NumBits && NumBits <= 64); - assert(BitPosition < 128); - uint64_t Val; - if (BitPosition < 64) - Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition); - else - Val = Hi >> (BitPosition - 64); - return Val & ((uint64_t(2) << (NumBits - 1)) - 1); - } - DecoderUInt128 operator&(const DecoderUInt128 &RHS) const { - return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi); - } - DecoderUInt128 operator&(const uint64_t &RHS) const { - return *this & DecoderUInt128(RHS); - } - DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); } - bool operator==(const DecoderUInt128 &RHS) { - return Lo == RHS.Lo && Hi == RHS.Hi; - } - bool operator!=(const DecoderUInt128 &RHS) { - return Lo != RHS.Lo || Hi != RHS.Hi; - } - bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); } -}; - //===----------------------------------------------------------------------===// // AMDGPUDisassembler //===----------------------------------------------------------------------===// @@ -223,6 +185,9 @@ public: bool hasKernargPreload() const; bool isMacDPP(MCInst &MI) const; + + /// Check if the instruction is a buffer operation (MUBUF, MTBUF, or S_BUFFER) + bool isBufferInstruction(const MCInst &MI) const; }; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 280def5440c8..dadc7dcd7054 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -626,7 +626,6 @@ class 
R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS < let usesCustomInserter = 1; let LDS_1A = 1; - let DisableEncoding = "$dst"; } class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern, @@ -658,7 +657,6 @@ class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> : let BaseOp = name; let usesCustomInserter = 1; - let DisableEncoding = "$dst"; } class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern, @@ -688,7 +686,6 @@ class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> : let BaseOp = name; let usesCustomInserter = 1; - let DisableEncoding = "$dst"; } def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index f5d438436b29..a1306565bbe2 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -12,9 +12,11 @@ let WantsRoot = true in { def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>; def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>; + def GlobalSAddrNoIOffsetM0 : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffsetM0", [], [], -3>; def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; + def GlobalSAddrCPolM0 : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPolM0", [], [], -10>; def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>; def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>; } @@ -135,7 +137,18 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : // unsigned for flat accesses. 
bits<13> offset; // GFX90A+ only: instruction uses AccVGPR for data - bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0)); + defvar DstOpIsAV = !if(ps.has_vdst, + VDstOperandIsAV<ps.OutOperandList>.ret, 0); + defvar DstOpIsAGPR = !if(ps.has_vdst, + VDstOperandIsAGPR<ps.OutOperandList>.ret, 0); + defvar DataOpIsAV = !if(ps.has_data, + VDataOperandIsAV<ps.InOperandList>.ret, 0); + defvar DataOpIsAGPR = !if(ps.has_data, + VDataOperandIsAGPR<ps.InOperandList>.ret, 0); + + bits<1> acc = !if(ps.has_vdst, + !if(DstOpIsAV, vdst{9}, DstOpIsAGPR), + !if(DataOpIsAV, vdata{9}, DataOpIsAGPR)); // We don't use tfe right now, and it was removed in gfx9. bits<1> tfe = 0; @@ -214,11 +227,10 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> { // same encoding value as exec_hi, so it isn't possible to use that if // saddr is 32-bit (which isn't handled here yet). class FLAT_Load_Pseudo< - string opName, RegisterClass regClass, bit HasTiedOutput = 0, + string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<opName, (outs), (ins), ""> { - defvar vdata_op = getLdStRegisterOperand<regClass>.ret; let OutOperandList = (outs vdata_op:$vdst); let InOperandList = !con( !if(EnableSaddr, @@ -239,10 +251,9 @@ class FLAT_Load_Pseudo< let enabled_saddr = EnableSaddr; let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); - let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { +multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32, bit HasTiedInput = 0> { def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>, GlobalSaddrTable<0, opName>; let OtherPredicates = [HasFlatGVSMode] in @@ -251,19 +262,19 @@ multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasT } multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { - defm "" : 
FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>; + defm "" : FLAT_Flat_Load_Pseudo<opName, AVLdSt_32, 1>; let True16Predicate = UseRealTrue16Insts in - defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; + defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPROp_16>, True16D16Table<NAME#"_HI", NAME>; } -class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, +class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs), !con( !if(EnableSaddr, - (ins VGPR_32:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata, SReg_64_XEXEC_XNULL:$saddr), - (ins VReg_64:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata)), + (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64_XEXEC_XNULL:$saddr), + (ins VReg_64:$vaddr, vdataClass:$vdata)), (ins flat_offset:$offset, CPol_0:$cpol)), " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> { let mayLoad = 0; @@ -273,7 +284,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let enabled_saddr = EnableSaddr; } -multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> { +multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> { def "" : FLAT_Store_Pseudo<opName, regClass>, GlobalSaddrTable<0, opName>; let OtherPredicates = [HasFlatGVSMode] in @@ -282,21 +293,22 @@ multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> { } multiclass FLAT_Flat_Store_Pseudo_t16<string opName> { - defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>; + defm "" : FLAT_Flat_Store_Pseudo<opName, AVLdSt_32>; defvar Name16 = opName#"_t16"; let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in { - def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>, + def _t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1>, GlobalSaddrTable<0, Name16>, True16D16Table<NAME#"_D16_HI", NAME>; - def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>, 
+ def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1, 1>, GlobalSaddrTable<1, Name16>, True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">; } } -multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { - let is_flat_global = 1 in { +multiclass FLAT_Global_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32, + bit HasTiedInput = 0> { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, @@ -305,21 +317,21 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha } multiclass FLAT_Global_Load_Pseudo_t16<string opName> { - defm "" : FLAT_Global_Load_Pseudo<opName, VGPR_32, 1>; + defm "" : FLAT_Global_Load_Pseudo<opName, AVLdSt_32, 1>; defvar Name16 = opName#"_t16"; let OtherPredicates = [HasTrue16BitInsts], SubtargetPredicate = HasFlatGlobalInsts, is_flat_global = 1 in { - def _t16 : FLAT_Load_Pseudo<Name16, VGPR_16, 0, 1>, + def _t16 : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1>, GlobalSaddrTable<0, Name16>, True16D16Table<NAME#"_HI", NAME>; - def _SADDR_t16 : FLAT_Load_Pseudo<Name16, VGPR_16, 0, 1, 1>, + def _SADDR_t16 : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>, GlobalSaddrTable<1, Name16>, True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">; } } -class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass, +class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterOperand regClass, bit HasTiedOutput = 0, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs regClass:$vdst), @@ -335,10 +347,9 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass, let enabled_saddr = EnableSaddr; let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); - let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass 
regClass, +multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterOperand regClass, bit HasTiedOutput = 0> { def "" : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput>, GlobalSaddrTable<0, opName>; @@ -346,8 +357,8 @@ multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass, GlobalSaddrTable<1, opName>; } -multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> { - let is_flat_global = 1 in { +multiclass FLAT_Global_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Store_Pseudo<opName, regClass, 1>, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>, @@ -356,15 +367,15 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> { } multiclass FLAT_Global_Store_Pseudo_t16<string opName> { - defm "" : FLAT_Global_Store_Pseudo<opName, VGPR_32>; + defm "" : FLAT_Global_Store_Pseudo<opName, AVLdSt_32>; defvar Name16 = opName#"_t16"; let OtherPredicates = [HasTrue16BitInsts], SubtargetPredicate = HasFlatGlobalInsts, is_flat_global = 1 in { - def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>, + def _t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1>, GlobalSaddrTable<0, Name16>, True16D16Table<NAME#"_D16_HI", NAME>; - def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>, + def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1, 1>, GlobalSaddrTable<1, Name16>, True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">; } @@ -435,7 +446,7 @@ multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> { GlobalSaddrTable<1, opName>; } -class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass, +class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterOperand vdataClass, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs), @@ -451,7 +462,7 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass, let enabled_saddr = 
EnableSaddr; } -multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass> { +multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterOperand regClass> { def "" : FLAT_Global_Store_AddTid_Pseudo<opName, regClass>, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Global_Store_AddTid_Pseudo<opName, regClass, 1>, @@ -539,14 +550,14 @@ class FlatScratchInst <string sv_op, string mode> { string Mode = mode; } -class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, +class FLAT_Scratch_Load_Pseudo <string opName, RegisterOperand regClass = AVLdSt_32, bit HasTiedOutput = 0, bit EnableSaddr = 0, bit EnableSVE = 0, bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))> : FLAT_Pseudo< opName, - (outs getLdStRegisterOperand<regClass>.ret:$vdst), + (outs regClass:$vdst), !con( !if(EnableSVE, (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), @@ -555,7 +566,7 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, !if(EnableVaddr, (ins VGPR_32:$vaddr, flat_offset:$offset), (ins flat_offset:$offset)))), - !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in), + !if(HasTiedOutput, (ins CPol:$cpol, regClass:$vdst_in), (ins CPol_0:$cpol))), " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { let is_flat_scratch = 1; @@ -568,13 +579,11 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, let sve = EnableVaddr; let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); - let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0, +class FLAT_Scratch_Store_Pseudo <string opName, RegisterOperand vdata_op, bit EnableSaddr = 0, bit EnableSVE = 0, - bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr)), - RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo< + bit EnableVaddr = 
!or(EnableSVE, !not(EnableSaddr))> : FLAT_Pseudo< opName, (outs), !if(EnableSVE, @@ -596,7 +605,8 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En let sve = EnableVaddr; } -multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> { +multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32, + bit HasTiedOutput = 0> { def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>, FlatScratchInst<opName, "SV">; def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>, @@ -612,29 +622,29 @@ multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit H } multiclass FLAT_Scratch_Load_Pseudo_t16<string opName> { - defm "" : FLAT_Scratch_Load_Pseudo<opName, VGPR_32, 1>; + defm "" : FLAT_Scratch_Load_Pseudo<opName, AVLdSt_32, 1>; defvar Name16 = opName#"_t16"; let OtherPredicates = [HasTrue16BitInsts], is_flat_scratch = 1 in { - def _t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0>, + def _t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0>, FlatScratchInst<Name16, "SV">, True16D16Table<NAME#"_HI", NAME>; - def _SADDR_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 1>, + def _SADDR_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 1>, FlatScratchInst<Name16, "SS">, True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">; let SubtargetPredicate = HasFlatScratchSVSMode in - def _SVS_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 1, 1>, + def _SVS_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>, FlatScratchInst<Name16, "SVS">, True16D16Table<NAME#"_HI_SVS", NAME#"_SVS">; let SubtargetPredicate = HasFlatScratchSTMode in - def _ST_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 0, 0, 0>, + def _ST_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 0, 0, 0>, FlatScratchInst<Name16, "ST">, True16D16Table<NAME#"_HI_ST", NAME#"_ST">; } } -multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> { 
+multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> { def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>, FlatScratchInst<opName, "SV">; def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>, @@ -650,24 +660,24 @@ multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> { } multiclass FLAT_Scratch_Store_Pseudo_t16<string opName> { - defm "" : FLAT_Scratch_Store_Pseudo<opName, VGPR_32>; + defm "" : FLAT_Scratch_Store_Pseudo<opName, AVLdSt_32>; defvar Name16 = opName#"_t16"; let OtherPredicates = [HasTrue16BitInsts], is_flat_scratch = 1 in { - def _t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16>, + def _t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16>, FlatScratchInst<Name16, "SV">, True16D16Table<NAME#"_D16_HI", NAME>; - def _SADDR_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 1>, + def _SADDR_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 1>, FlatScratchInst<Name16, "SS">, True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">; let SubtargetPredicate = HasFlatScratchSVSMode in - def _SVS_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 1, 1>, + def _SVS_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 1, 1>, FlatScratchInst<Name16, "SVS">, True16D16Table<NAME#"_D16_HI_SVS", NAME#"_SVS">; let SubtargetPredicate = HasFlatScratchSTMode in - def _ST_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 0, 0, 0>, + def _ST_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 0, 0, 0>, FlatScratchInst<Name16, "ST">, True16D16Table<NAME#"_D16_HI_ST", NAME#"_ST">; } @@ -741,11 +751,10 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins, multiclass FLAT_Atomic_Pseudo_NO_RTN< string opName, - RegisterClass vdst_rc, + RegisterOperand vdst_op, ValueType vt, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { + RegisterOperand data_op = vdst_op> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), (ins VReg_64:$vaddr, 
data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), @@ -770,15 +779,17 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< multiclass FLAT_Atomic_Pseudo_RTN< string opName, - RegisterClass vdst_rc, + RegisterOperand vdst_op, ValueType vt, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret, - RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> { + RegisterOperand data_op = vdst_op> { + + defvar vdst_op_vgpr = getEquivalentVGPROperand<vdst_op>.ret; + defvar data_op_vgpr = getEquivalentVGPROperand<data_op>.ret; + def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs vdst_op:$vdst), - (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (outs vdst_op_vgpr:$vdst), + (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; @@ -786,8 +797,8 @@ multiclass FLAT_Atomic_Pseudo_RTN< } def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, - (outs vdst_op:$vdst), - (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + (outs vdst_op_vgpr:$vdst), + (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, GlobalSaddrTable<1, opName#"_rtn"> { let OtherPredicates = [HasFlatGVSMode]; @@ -797,26 +808,37 @@ multiclass FLAT_Atomic_Pseudo_RTN< let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + defvar vdst_op_agpr = getEquivalentAGPROperand<vdst_op>.ret; + defvar data_op_agpr = getEquivalentAGPROperand<data_op>.ret; + + def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_op_agpr:$vdst), + (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata$offset$cpol">, + GlobalSaddrTable<0, opName#"_rtn_agpr"> { + let FPAtomic = data_vt.isFP; + let 
AddedComplexity = -1; // Prefer global atomics if available + } + // No saddr agpr form. HasFlatGVSMode targets do not have AGPRs. } multiclass FLAT_Atomic_Pseudo< string opName, - RegisterClass vdst_rc, + RegisterOperand vdst_op, ValueType vt, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { - defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>; - defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>; + RegisterOperand data_op = vdst_op> { + defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_op, vt, data_vt, data_op>; + defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_op, vt, data_vt, data_op>; } multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< string opName, - RegisterClass vdst_rc, + RegisterOperand vdst_op, ValueType vt, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { + RegisterOperand data_op = vdst_op> { let is_flat_global = 1 in { def "" : FLAT_AtomicNoRet_Pseudo <opName, @@ -842,17 +864,18 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< multiclass FLAT_Global_Atomic_Pseudo_RTN< string opName, - RegisterClass vdst_rc, + RegisterOperand vdst_op, ValueType vt, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret, - RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> { + RegisterOperand data_op = vdst_op> { + + defvar vdst_op_vgpr = getEquivalentVGPROperand<vdst_op>.ret; + defvar data_op_vgpr = getEquivalentVGPROperand<data_op>.ret; let is_flat_global = 1 in { def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs vdst_op:$vdst), - (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (outs vdst_op_vgpr:$vdst), + (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata, off$offset$cpol">, GlobalSaddrTable<0, 
opName#"_rtn"> { let has_saddr = 1; @@ -860,23 +883,47 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< } def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, - (outs vdst_op:$vdst), - (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + (outs vdst_op_vgpr:$vdst), + (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, GlobalSaddrTable<1, opName#"_rtn"> { let has_saddr = 1; let enabled_saddr = 1; let FPAtomic = data_vt.isFP; } + + defvar vdst_op_agpr = getEquivalentAGPROperand<vdst_op>.ret; + defvar data_op_agpr = getEquivalentAGPROperand<data_op>.ret; + + let SubtargetPredicate = isGFX90APlus in { + def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_op_agpr:$vdst), + (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata, off$offset$cpol">, + GlobalSaddrTable<0, opName#"_rtn_agpr"> { + let has_saddr = 1; + let FPAtomic = data_vt.isFP; + } + + def _SADDR_RTN_agpr : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_op_agpr:$vdst), + (ins VGPR_32:$vaddr, data_op_agpr:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName#"_rtn_agpr"> { + let has_saddr = 1; + let enabled_saddr = 1; + let FPAtomic = data_vt.isFP; + } + } } } multiclass FLAT_Global_Atomic_Pseudo< string opName, - RegisterClass vdst_rc, + RegisterOperand vdst_rc, ValueType vt, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc> { + RegisterOperand data_rc = vdst_rc> { defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>; defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>; } @@ -885,119 +932,119 @@ multiclass FLAT_Global_Atomic_Pseudo< // Flat Instructions //===----------------------------------------------------------------------===// -defm 
FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>; -defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>; -defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>; -defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>; -defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>; -defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>; -defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>; -defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>; +defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte">; +defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte">; +defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort">; +defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort">; +defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword">; +defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", AVLdSt_64>; +defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", AVLdSt_128>; +defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", AVLdSt_96>; -defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>; -defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>; -defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>; -defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>; +defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword">; +defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", AVLdSt_64>; +defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", AVLdSt_128>; +defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", AVLdSt_96>; let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo 
<"flat_load_ubyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", AVLdSt_32, 1>; defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">; -defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", AVLdSt_32, 1>; defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">; -defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", AVLdSt_32, 1>; defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">; } -defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; -defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; +defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi">; +defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi">; } defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">; defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">; defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", - VGPR_32, i32, v2i32, VReg_64>; + AVLdSt_32, i32, v2i32, AVLdSt_64>; defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2", - VReg_64, i64, v2i64, VReg_128>; + AVLdSt_64, i64, v2i64, AVLdSt_128>; defm FLAT_ATOMIC_SWAP : FLAT_Atomic_Pseudo <"flat_atomic_swap", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_SWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_ADD : FLAT_Atomic_Pseudo <"flat_atomic_add", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_SUB : FLAT_Atomic_Pseudo <"flat_atomic_sub", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_SMIN : FLAT_Atomic_Pseudo <"flat_atomic_smin", - 
VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_UMIN : FLAT_Atomic_Pseudo <"flat_atomic_umin", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_SMAX : FLAT_Atomic_Pseudo <"flat_atomic_smax", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_UMAX : FLAT_Atomic_Pseudo <"flat_atomic_umax", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_AND : FLAT_Atomic_Pseudo <"flat_atomic_and", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_OR : FLAT_Atomic_Pseudo <"flat_atomic_or", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_XOR : FLAT_Atomic_Pseudo <"flat_atomic_xor", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_INC : FLAT_Atomic_Pseudo <"flat_atomic_inc", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_DEC : FLAT_Atomic_Pseudo <"flat_atomic_dec", - VGPR_32, i32>; + AVLdSt_32, i32>; defm FLAT_ATOMIC_ADD_X2 : FLAT_Atomic_Pseudo <"flat_atomic_add_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_SUB_X2 : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_SMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_UMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_SMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_UMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_AND_X2 : FLAT_Atomic_Pseudo <"flat_atomic_and_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_OR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_or_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_XOR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2", - VReg_64, i64>; + AVLdSt_64, i64>; // GFX7-, GFX10-only flat 
instructions. let SubtargetPredicate = isGFX7GFX10 in { defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2", - VReg_64, f64, v2f64, VReg_128>; + AVLdSt_64, f64, v2f64, AVLdSt_128>; } // End SubtargetPredicate = isGFX7GFX10 @@ -1005,169 +1052,173 @@ defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2", // choose this as the canonical name. let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in { defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo <"flat_atomic_min_f64", - VReg_64, f64>; + AVLdSt_64, f64>; defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo <"flat_atomic_max_f64", - VReg_64, f64>; + AVLdSt_64, f64>; } let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in { -defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>; -defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>; +defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", AVLdSt_64, f64>; +defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", AVLdSt_64, f64>; } let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in { - defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>; - defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>; + defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", AVLdSt_64, f64>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", AVLdSt_64, f64>; } // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in { - defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>; + defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", AVLdSt_32, v2f16>; let FPAtomic = 1 in - defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2i16>; + defm 
FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", AVLdSt_32, v2i16>; } // End SubtargetPredicate = HasAtomicFlatPkAdd16Insts let SubtargetPredicate = HasAtomicGlobalPkAddBF16Inst, FPAtomic = 1 in - defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2i16>; + defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", AVLdSt_32, v2i16>; // GFX7-, GFX10-, GFX11-only flat instructions. let SubtargetPredicate = isGFX7GFX10GFX11 in { defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap", - VGPR_32, f32, v2f32, VReg_64>; + AVLdSt_32, f32, v2f32, AVLdSt_64>; defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin", - VGPR_32, f32>; + AVLdSt_32, f32>; defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax", - VGPR_32, f32>; + AVLdSt_32, f32>; } // End SubtargetPredicate = isGFX7GFX10GFX11 // GFX942-, GFX11-only flat instructions. let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { - defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>; + defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", AVLdSt_32, f32>; } // End SubtargetPredicate = HasFlatAtomicFaddF32Inst let SubtargetPredicate = isGFX12Plus in { - defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>; - defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPR_32, i32>; -} // End SubtargetPredicate = isGFX12Plus + defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPROp_32, i32>; + defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>; +} + +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in { + defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_NO_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>; +} -defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; -defm GLOBAL_LOAD_SBYTE : 
FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; -defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; -defm GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; -defm GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; -defm GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; -defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; -defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; +defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte">; +defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte">; +defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort">; +defm GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort">; +defm GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword">; +defm GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", AVLdSt_64>; +defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", AVLdSt_96>; +defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", AVLdSt_128>; let TiedSourceNotRead = 1 in { -defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>; -defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>; -defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>; +defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", AVLdSt_32, 1>; +defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", AVLdSt_32, 1>; +defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", AVLdSt_32, 1>; defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_sbyte_d16">; defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_short_d16">; 
defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_ubyte_d16">; } -defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>; -defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>; +defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi">; +defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi">; let OtherPredicates = [HasGFX10_BEncoding] in -defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>; +defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPROp_32>; defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo_t16 <"global_store_byte">; defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo_t16 <"global_store_short">; -defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; -defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; -defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; -defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; +defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword">; +defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", AVLdSt_64>; +defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", AVLdSt_96>; +defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", AVLdSt_128>; let OtherPredicates = [HasGFX10_BEncoding] in -defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>; +defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPROp_32>; defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap", - VGPR_32, i32, v2i32, VReg_64>; + AVLdSt_32, i32, v2i32, AVLdSt_64>; defm 
GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2", - VReg_64, i64, v2i64, VReg_128>; + AVLdSt_64, i64, v2i64, AVLdSt_128>; defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec", - VGPR_32, i32>; + AVLdSt_32, i32>; defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo 
<"global_atomic_umin_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2", - VReg_64, i64>; + AVLdSt_64, i64>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", - VReg_64, i64>; + AVLdSt_64, i64>; let SubtargetPredicate = HasGFX10_BEncoding in { defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo <"global_atomic_csub", - VGPR_32, i32>; + VGPROp_32, i32>; } defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">; @@ -1182,10 +1233,10 @@ defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwo } let SubtargetPredicate = isGFX12PlusNot12_50 in - defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>; + defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VGPROp_64, i64>; let SubtargetPredicate = isGFX12Plus in { - defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>; + defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPROp_32, i32>; def GLOBAL_INV : FLAT_Global_Invalidate_Writeback<"global_inv">; def GLOBAL_WB : FLAT_Global_Invalidate_Writeback<"global_wb">; @@ -1194,6 +1245,12 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { 
+let Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 in { +defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b8", 1>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b32", 1>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b64", 1>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b128", 1>; +} // End Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; @@ -1207,33 +1264,33 @@ def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">; } // End SubtargetPredicate = isGFX1250Plus -defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>; -defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte", VGPR_32>; -defm SCRATCH_LOAD_USHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort", VGPR_32>; -defm SCRATCH_LOAD_SSHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_sshort", VGPR_32>; -defm SCRATCH_LOAD_DWORD : FLAT_Scratch_Load_Pseudo <"scratch_load_dword", VGPR_32>; -defm SCRATCH_LOAD_DWORDX2 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", VReg_64>; -defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", VReg_96>; -defm SCRATCH_LOAD_DWORDX4 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", VReg_128>; +defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte">; +defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte">; +defm SCRATCH_LOAD_USHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort">; +defm SCRATCH_LOAD_SSHORT : 
FLAT_Scratch_Load_Pseudo <"scratch_load_sshort">; +defm SCRATCH_LOAD_DWORD : FLAT_Scratch_Load_Pseudo <"scratch_load_dword">; +defm SCRATCH_LOAD_DWORDX2 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", AVLdSt_64>; +defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", AVLdSt_96>; +defm SCRATCH_LOAD_DWORDX4 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", AVLdSt_128>; let TiedSourceNotRead = 1 in { -defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32, 1>; -defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32, 1>; -defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32, 1>; +defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", AVLdSt_32, 1>; +defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", AVLdSt_32, 1>; +defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", AVLdSt_32, 1>; defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_ubyte_d16">; defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_sbyte_d16">; defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_short_d16">; } -defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>; -defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>; +defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi">; +defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi">; defm SCRATCH_STORE_BYTE : FLAT_Scratch_Store_Pseudo_t16 <"scratch_store_byte">; defm SCRATCH_STORE_SHORT : FLAT_Scratch_Store_Pseudo_t16 <"scratch_store_short">; -defm SCRATCH_STORE_DWORD : FLAT_Scratch_Store_Pseudo <"scratch_store_dword", VGPR_32>; -defm SCRATCH_STORE_DWORDX2 : 
FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", VReg_64>; -defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", VReg_96>; -defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", VReg_128>; +defm SCRATCH_STORE_DWORD : FLAT_Scratch_Store_Pseudo <"scratch_store_dword">; +defm SCRATCH_STORE_DWORDX2 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", AVLdSt_64>; +defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", AVLdSt_96>; +defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", AVLdSt_128>; defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ubyte">; defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sbyte">; @@ -1242,69 +1299,77 @@ defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_s defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; let SubtargetPredicate = isGFX125xOnly in { -defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>; -defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>; -defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>; +defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPROp_32>; +defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VGPROp_64>; +defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VGPROp_128>; -defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>; -defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>; -defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>; +defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPROp_32>; +defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo 
<"global_load_monitor_b64", VGPROp_64>; +defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VGPROp_128>; } // End SubtargetPredicate = isGFX125xOnly +let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in { +let Uses = [M0, EXEC] in { // Use M0 for broadcast workgroup mask. +defm CLUSTER_LOAD_B32 : FLAT_Global_Load_Pseudo <"cluster_load_b32", VGPROp_32>; +defm CLUSTER_LOAD_B64 : FLAT_Global_Load_Pseudo <"cluster_load_b64", VGPROp_64>; +defm CLUSTER_LOAD_B128 : FLAT_Global_Load_Pseudo <"cluster_load_b128", VGPROp_128>; +} // End Uses = [M0, EXEC] +} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 + let SubtargetPredicate = isGFX12Plus in { let Uses = [EXEC, M0] in { - defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; - defm GLOBAL_STORE_BLOCK : FLAT_Global_Store_Pseudo <"global_store_block", VReg_1024>; + defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VGPROp_1024>; + defm GLOBAL_STORE_BLOCK : FLAT_Global_Store_Pseudo <"global_store_block", VGPROp_1024>; } let Uses = [EXEC, FLAT_SCR, M0] in { - defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VReg_1024>; - defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VReg_1024>; + defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VGPROp_1024>; + defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VGPROp_1024>; } let WaveSizePredicate = isWave32 in { - defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128", VReg_128>; - defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64", VReg_64>; + defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128", VGPROp_128>; + defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64", VGPROp_64>; } } // End SubtargetPredicate = isGFX12Plus let WaveSizePredicate = isWave64, 
SubtargetPredicate = isGFX12PlusNot12_50 in { let Mnemonic = "global_load_tr_b128" in - defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>; + defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VGPROp_64>; let Mnemonic = "global_load_tr_b64" in - defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>; + defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPROp_32>; } let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in { - defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>; - defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>; + defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96>; + defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VGPROp_64>; } let SubtargetPredicate = isGFX10Plus in { defm GLOBAL_ATOMIC_FCMPSWAP : - FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", AVLdSt_32, f32, v2f32, AVLdSt_64>; defm GLOBAL_ATOMIC_FMIN : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", AVLdSt_32, f32>; defm GLOBAL_ATOMIC_FMAX : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", AVLdSt_32, f32>; defm GLOBAL_ATOMIC_FCMPSWAP_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", AVLdSt_64, f64, v2f64, AVLdSt_128>; } // End SubtargetPredicate = isGFX10Plus -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_add_f32", VGPR_32, f32 + 
"global_atomic_add_f32", AVLdSt_32, f32 >; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_pk_add_f16", VGPR_32, v2f16 + "global_atomic_pk_add_f16", AVLdSt_32, v2f16 >; -let OtherPredicates = [HasAtomicFaddRtnInsts] in +let SubtargetPredicate = HasAtomicFaddRtnInsts in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < - "global_atomic_add_f32", VGPR_32, f32 + "global_atomic_add_f32", AVLdSt_32, f32 >; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < - "global_atomic_pk_add_f16", VGPR_32, v2f16 + "global_atomic_pk_add_f16", AVLdSt_32, v2f16 >; let SubtargetPredicate = HasVmemPrefInsts in { @@ -1362,6 +1427,16 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0), + (inst $dsaddr, $vaddr, $offset, $cpol) +>; + +class GlobalLoadLDSSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0), + (inst $dsaddr, $saddr, $voffset, $offset, $cpol) +>; + class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), (inst $dsaddr, $vaddr, $offset, $cpol) @@ -1397,6 +1472,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, 
SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol), M0)), + (inst $vaddr, $offset, $cpol) +>; + +class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)), + (inst $saddr, $voffset, $offset, $cpol) +>; + class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))), (inst $vaddr, $offset, $cpol) @@ -1416,8 +1501,10 @@ class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), - (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol) ->; + (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; +} class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1443,19 +1530,24 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) >; -multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt, +multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType vt, ValueType data_vt = vt> { - + defvar inst = !cast<FLAT_Pseudo>(base_inst_name); + defvar inst_saddr = !cast<FLAT_Pseudo>(inst#"_SADDR"); defvar noRtnNode = !cast<PatFrags>(node); let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, 
getVregSrcForVT<data_vt>.ret:$data, $offset)>; + (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; + } - def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), + def : FlatAtomicSaddrPat<inst_saddr, !cast<SDPatternOperator>(node), GlobalSAddr, vt, data_vt> { let AddedComplexity = 9; - let SubtargetPredicate = HasFlatGVSMode; + let SubtargetPredicate = inst_saddr.SubtargetPredicate; + let OtherPredicates = inst_saddr.OtherPredicates; } } @@ -1468,17 +1560,22 @@ multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>; -multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt, +multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, ValueType data_vt = vt> { - + defvar inst = !cast<FLAT_Pseudo>(inst_name#"_RTN"); + defvar inst_saddr = !cast<FLAT_Pseudo>(inst_name#"_SADDR_RTN"); defvar rtnNode = !cast<SDPatternOperator>(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; + } - def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> { + def : FlatAtomicSaddrPat<inst_saddr, rtnNode, GlobalSAddrGLC, vt, data_vt> { let AddedComplexity = 8; - let SubtargetPredicate = HasFlatGVSMode; + let SubtargetPredicate = inst_saddr.SubtargetPredicate; + let OtherPredicates = inst_saddr.OtherPredicates; } } @@ -1514,8 +1611,10 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt, class FlatSignedAtomicPatBase <FLAT_Pseudo inst, 
SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) ->; + (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; +} multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt, ValueType data_vt = vt, int complexity = 0, @@ -1592,6 +1691,16 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va (inst $vaddr, $saddr, $offset, $cpol) >; +multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatLoadLDSSignedPat_M0 <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalLoadLDSSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { def : FlatLoadLDSSignedPat <inst, node> { let AddedComplexity = 10; @@ -1615,20 +1724,42 @@ multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadSignedPat <inst, node, vt> { let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; + } +} + +multiclass GlobalFLATLoadPats_M0<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadSignedPat_M0 <inst, node, vt> { + let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; + } + + def : GlobalLoadSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), 
node, vt> { + let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } } multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadSignedPat_CPOL<inst, node, vt> { let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } } @@ -1655,10 +1786,14 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatStoreSignedPat <inst, node, vt> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; let AddedComplexity = 10; } def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; let AddedComplexity = 11; } } @@ -1803,7 +1938,9 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu } multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { - def : FlatLoadPat <inst, node, vt>; + def : FlatLoadPat <inst, node, vt> { + let OtherPredicates = [HasFlatAddressSpace]; + } def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 9; @@ -1830,7 +1967,9 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT } multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { - def : FlatStorePat <inst, node, vt>; + def : FlatStorePat <inst, node, vt> { + let OtherPredicates = [HasFlatAddressSpace]; + } def : 
FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 9; @@ -1847,8 +1986,6 @@ multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType } } -let OtherPredicates = [HasFlatAddressSpace] in { - defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; @@ -1898,6 +2035,7 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX4, atomic_load_nonext_128_flat, v4i32>; defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>; defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>; @@ -1922,6 +2060,7 @@ defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>; defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX4, atomic_store_128_flat, v4i32>; defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; @@ -1970,12 +2109,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; -} // End OtherPredicates = [HasFlatAddressSpace] - -let OtherPredicates = [isGFX12Plus] in defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; - -let OtherPredicates = [isGFX12Plus, 
HasAtomicCSubNoRtnInsts] in defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; let OtherPredicates = [HasD16LoadStore] in { @@ -2000,8 +2134,6 @@ defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } -let OtherPredicates = [HasFlatGlobalInsts] in { - defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_aext_16_global, i32>; @@ -2015,7 +2147,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in { +let True16Predicate = p in { defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>; @@ -2029,7 +2161,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16 defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>; } -let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { +let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>; @@ -2068,6 +2200,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, 
i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>; +defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX4, atomic_load_nonext_128_global, v4i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>; @@ -2108,6 +2241,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>; +defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, atomic_store_128_global, v4i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; @@ -2124,7 +2258,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_glo defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; -let OtherPredicates = [HasAtomicCSubNoRtnInsts] in +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; @@ -2144,7 +2278,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i let SubtargetPredicate = isGFX12Plus in { defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm 
: GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; } @@ -2179,6 +2313,15 @@ let OtherPredicates = [isGFX125xOnly] in { } // End SubtargetPredicate = isGFX125xOnly let OtherPredicates = [isGFX1250Plus] in { + defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B32, int_amdgcn_cluster_load_b32, i32>; + defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B64, int_amdgcn_cluster_load_b64, v2i32>; + defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B128, int_amdgcn_cluster_load_b128, v4i32>; + + defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_cluster_load_async_to_lds_b8>; + defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_cluster_load_async_to_lds_b32>; + defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_cluster_load_async_to_lds_b64>; + defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_cluster_load_async_to_lds_b128>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>; defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>; defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>; @@ -2190,62 +2333,38 @@ let OtherPredicates = [isGFX1250Plus] in { defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>; } -let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; -} - -let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in { defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; -} -let OtherPredicates = [isGFX12Only] in { - // FIXME: Remove these 
intrinsics - defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; - defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; - defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; - defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; +// FIXME: Remove these intrinsics +let SubtargetPredicate = isGFX12Only in { +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; } -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -} -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; -} -let OtherPredicates = [HasAtomicFaddRtnInsts] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -} -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; -} -let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -} -let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; -} -let 
OtherPredicates = [HasFlatAtomicFaddF32Inst] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -} - -let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; -} -let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; -} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { @@ -2566,6 +2685,7 @@ multiclass FLAT_Real_Atomics_vi <bits<7> op, defvar ps = !cast<FLAT_Pseudo>(NAME); def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>; def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>; + def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>; } multiclass FLAT_Global_Real_Atomics_vi<bits<7> op, @@ -2573,8 +2693,10 @@ multiclass FLAT_Global_Real_Atomics_vi<bits<7> op, FLAT_Real_AllAddr_vi<op, has_sccb> { def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>; def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>; -} + def _RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>; + def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>; +} defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40>; defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_vi <0x41>; @@ -3473,6 +3595,14 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; +defm CLUSTER_LOAD_B32 : VFLAT_Real_AllAddr_gfx1250<0x067>; +defm CLUSTER_LOAD_B64 : VFLAT_Real_AllAddr_gfx1250<0x068>; +defm 
CLUSTER_LOAD_B128 : VFLAT_Real_AllAddr_gfx1250<0x069>; + +defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x6a>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x6b>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x6c>; +defm CLUSTER_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x6d>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 184929a5a50f..8821558bb023 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -193,16 +193,6 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { return &OldOpnd; } -[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx, - MachineRegisterInfo &MRI) { - int16_t RegClass = MI.getDesc().operands()[Idx].RegClass; - if (RegClass == -1) - return 0; - - const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); - return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass)); -} - MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR, @@ -250,7 +240,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, ++NumOperands; } if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) { - if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) { + if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::sdst)) { DPPInst.add(*SDst); ++NumOperands; } @@ -295,12 +285,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); assert(Src0); - int Src0Idx = NumOperands; - if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) { - LLVM_DEBUG(dbgs() << " failed: src0 
is illegal\n"); - Fail = true; - break; - } + [[maybe_unused]] int Src0Idx = NumOperands; + DPPInst.add(*Src0); DPPInst->getOperand(NumOperands).setIsKill(false); ++NumOperands; @@ -319,21 +305,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); if (Src1) { - int OpNum = NumOperands; + assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1) && + "dpp version of instruction missing src1"); // If subtarget does not support SGPRs for src1 operand then the // requirements are the same as for src0. We check src0 instead because // pseudos are shared between subtargets and allow SGPR for src1 on all. if (!ST->hasDPPSrc1SGPR()) { - assert(getOperandSize(*DPPInst, Src0Idx, *MRI) == - getOperandSize(*DPPInst, NumOperands, *MRI) && + assert(TII->getOpSize(*DPPInst, Src0Idx) == + TII->getOpSize(*DPPInst, NumOperands) && "Src0 and Src1 operands should have the same size"); - OpNum = Src0Idx; - } - if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) { - LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); - Fail = true; - break; } + DPPInst.add(*Src1); ++NumOperands; } @@ -349,9 +331,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); if (Src2) { - if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) || - !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { - LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); + if (!AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src2)) { + LLVM_DEBUG(dbgs() << " failed: dpp does not have src2\n"); Fail = true; break; } @@ -431,6 +412,24 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); DPPInst.addImm(CombBCZ ? 
1 : 0); + + constexpr AMDGPU::OpName Srcs[] = { + AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; + + // FIXME: isOperandLegal expects to operate on an completely built + // instruction. We should have better legality APIs to check if the + // candidate operands will be legal without building the instruction first. + for (auto [I, OpName] : enumerate(Srcs)) { + int OpIdx = AMDGPU::getNamedOperandIdx(DPPOp, OpName); + if (OpIdx == -1) + break; + + if (!TII->isOperandLegal(*DPPInst, OpIdx)) { + LLVM_DEBUG(dbgs() << " failed: src" << I << " operand is illegal\n"); + Fail = true; + break; + } + } } while (false); if (Fail) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 931966b6df1d..7b94ea3ffbf1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -577,6 +577,7 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { unsigned MaxNumVGPRs = MaxVectorRegs; unsigned MaxNumAGPRs = 0; + unsigned NumArchVGPRs = has1024AddressableVGPRs() ? 1024 : 256; // On GFX90A, the number of VGPRs and AGPRs need not be equal. 
Theoretically, // a wave may have up to 512 total vector registers combining together both @@ -589,7 +590,6 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { if (hasGFX90AInsts()) { unsigned MinNumAGPRs = 0; const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); - const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; @@ -614,11 +614,11 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); - MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); + MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs); MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && - MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && + MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs && "invalid register counts"); } else if (hasMAIInsts()) { // On gfx908 the number of AGPRs always equals the number of VGPRs. diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 2a8385df3f93..cbd6f64976d2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -198,6 +198,7 @@ protected: bool DynamicVGPR = false; bool DynamicVGPRBlockSize32 = false; bool HasVMemToLDSLoad = false; + bool RequiresAlignVGPR = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for SRAMECC. 
@@ -235,6 +236,7 @@ protected: bool HasPseudoScalarTrans = false; bool HasRestrictedSOffset = false; bool Has64BitLiterals = false; + bool Has1024AddressableVGPRs = false; bool HasBitOp3Insts = false; bool HasTanhInsts = false; bool HasTensorCvtLutInsts = false; @@ -250,7 +252,6 @@ protected: bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; bool HasSafeCUPrefetch = false; - bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -1015,8 +1016,6 @@ public: bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } - bool hasCUStores() const { return HasCUStores; } - // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1350,7 +1349,7 @@ public: } /// Return if operations acting on VGPR tuples require even alignment. - bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } + bool needsAlignedVGPRs() const { return RequiresAlignVGPR; } /// Return true if the target has the S_PACK_HL_B32_B16 instruction. bool hasSPackHL() const { return GFX11Insts; } @@ -1436,6 +1435,8 @@ public: bool hasAddPC64Inst() const { return GFX1250Insts; } + bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; } + bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; } @@ -1831,6 +1832,13 @@ public: bool hasScratchBaseForwardingHazard() const { return GFX1250Insts && getGeneration() == GFX12; } + + /// \returns true if the subtarget supports clusters of workgroups. + bool hasClusters() const { return GFX1250Insts; } + + /// \returns true if the subtarget requires a wait for xcnt before atomic + /// flat/global stores & rmw. 
+ bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index aafbdc2e86a9..f098e7a3c6c6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -80,12 +80,9 @@ void AMDGPUInstPrinter::printFP64ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { // KIMM64 - // This part needs to align with AMDGPUInstPrinter::printImmediate64. + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); uint64_t Imm = MI->getOperand(OpNo).getImm(); - if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Imm)) - O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')'; - else - O << formatHex(static_cast<uint64_t>(Hi_32(Imm))); + printLiteral64(Desc, Imm, STI, O, /*IsFP=*/true); } void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo, @@ -327,6 +324,54 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, } } +// \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or +// \p Reg itself otherwise. +static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { + unsigned Enc = MRI.getEncodingValue(Reg); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + if (Idx < 0x100) + return Reg; + + const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); + return RC->getRegister(Idx % 0x100); +} + +// Restore MSBs of a VGPR above 255 from the MCInstrAnalysis. 
+static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo, + const MCInstrDesc &Desc, + const MCRegisterInfo &MRI, + const AMDGPUMCInstrAnalysis &MIA) { + unsigned VgprMSBs = MIA.getVgprMSBs(); + if (!VgprMSBs) + return Reg; + + unsigned Enc = MRI.getEncodingValue(Reg); + if (!(Enc & AMDGPU::HWEncoding::IS_VGPR)) + return Reg; + + auto Ops = AMDGPU::getVGPRLoweringOperandTables(Desc); + if (!Ops.first) + return Reg; + unsigned Opc = Desc.getOpcode(); + unsigned I; + for (I = 0; I < 4; ++I) { + if (Ops.first[I] != AMDGPU::OpName::NUM_OPERAND_NAMES && + (unsigned)AMDGPU::getNamedOperandIdx(Opc, Ops.first[I]) == OpNo) + break; + if (Ops.second && Ops.second[I] != AMDGPU::OpName::NUM_OPERAND_NAMES && + (unsigned)AMDGPU::getNamedOperandIdx(Opc, Ops.second[I]) == OpNo) + break; + } + if (I == 4) + return Reg; + unsigned OpMSBs = (VgprMSBs >> (I * 2)) & 3; + if (!OpMSBs) + return Reg; + if (MCRegister NewReg = AMDGPU::getVGPRWithMSBs(Reg, OpMSBs, MRI)) + return NewReg; + return Reg; +} + void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI) { #if !defined(NDEBUG) @@ -340,7 +385,20 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, } #endif - O << getRegisterName(Reg); + unsigned PrintReg = getRegForPrinting(Reg, MRI); + O << getRegisterName(PrintReg); + + if (PrintReg != Reg.id()) + O << " /*" << getRegisterName(Reg) << "*/"; +} + +void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, unsigned Opc, + unsigned OpNo, raw_ostream &O, + const MCRegisterInfo &MRI) { + if (MIA) + Reg = getRegFromMIA(Reg, OpNo, MII.get(Opc), MRI, + *static_cast<const AMDGPUMCInstrAnalysis *>(MIA)); + printRegOperand(Reg, O, MRI); } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, @@ -594,7 +652,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, O << formatHex(static_cast<uint64_t>(Imm)); } -void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, +void AMDGPUInstPrinter::printImmediate64(const 
MCInstrDesc &Desc, uint64_t Imm, const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP) { int64_t SImm = static_cast<int64_t>(Imm); @@ -624,18 +682,24 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, else if (Imm == 0x3fc45f306dc9c882 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494309189532"; - else { - // This part needs to align with AMDGPUOperand::addLiteralImmOperand. - if (IsFP) { - if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Imm)) - O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')'; - else - O << formatHex(static_cast<uint64_t>(Hi_32(Imm))); - return; - } + else + printLiteral64(Desc, Imm, STI, O, IsFP); +} - if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && - (!isInt<32>(Imm) || !isUInt<32>(Imm))) +void AMDGPUInstPrinter::printLiteral64(const MCInstrDesc &Desc, uint64_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O, bool IsFP) { + // This part needs to align with AMDGPUOperand::addLiteralImmOperand. + bool CanUse64BitLiterals = + STI.hasFeature(AMDGPU::Feature64BitLiterals) && + !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); + if (IsFP) { + if (CanUse64BitLiterals && Lo_32(Imm)) + O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')'; + else + O << formatHex(static_cast<uint64_t>(Hi_32(Imm))); + } else { + if (CanUse64BitLiterals && (!isInt<32>(Imm) || !isUInt<32>(Imm))) O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')'; else O << formatHex(static_cast<uint64_t>(Imm)); @@ -719,7 +783,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - printRegOperand(Op.getReg(), O, MRI); + printRegOperand(Op.getReg(), MI->getOpcode(), OpNo, O, MRI); // Check if operand register class contains register used. 
// Intention: print disassembler message when invalid code is decoded, @@ -750,12 +814,12 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, break; case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - printImmediate64(Op.getImm(), STI, O, false); + printImmediate64(Desc, Op.getImm(), STI, O, false); break; case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - printImmediate64(Op.getImm(), STI, O, true); + printImmediate64(Desc, Op.getImm(), STI, O, true); break; case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_IMM_INT16: @@ -793,22 +857,6 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // custom printer. llvm_unreachable("unexpected immediate operand type"); } - } else if (Op.isDFPImm()) { - double Value = bit_cast<double>(Op.getDFPImm()); - // We special case 0.0 because otherwise it will be printed as an integer. - if (Value == 0.0) - O << "0.0"; - else { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - int RCID = Desc.operands()[OpNo].RegClass; - unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID)); - if (RCBits == 32) - printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O); - else if (RCBits == 64) - printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O, true); - else - llvm_unreachable("Invalid register class size"); - } } else if (Op.isExpr()) { const MCExpr *Exp = Op.getExpr(); MAI.printExpr(O, *Exp); @@ -891,7 +939,7 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, if (OpNo + 1 < MI->getNumOperands() && (InputModifiers & SISrcMods::ABS) == 0) { const MCOperand &Op = MI->getOperand(OpNo + 1); - NegMnemo = Op.isImm() || Op.isDFPImm(); + NegMnemo = Op.isImm(); } if (NegMnemo) { O << "neg("; @@ -1146,7 +1194,7 @@ void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo, OpNo = OpNo - N + N / 2; if (En & (1 << N)) - 
printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI); + printRegOperand(MI->getOperand(OpNo).getReg(), Opc, OpNo, O, MRI); else O << "off"; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index be32061c6453..21cc2f229de9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -35,6 +35,8 @@ public: const MCSubtargetInfo &STI, raw_ostream &O) override; static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI); + void printRegOperand(MCRegister Reg, unsigned Opc, unsigned OpNo, + raw_ostream &O, const MCRegisterInfo &MRI); private: void printU16ImmOperand(const MCInst *MI, unsigned OpNo, @@ -70,7 +72,7 @@ private: void printSymbolicFormat(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O); - void printRegOperand(unsigned RegNo, raw_ostream &O); + void printRegOperand(MCRegister Reg, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -87,8 +89,10 @@ private: raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); - void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O, bool IsFP); + void printImmediate64(const MCInstrDesc &Desc, uint64_t Imm, + const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP); + void printLiteral64(const MCInstrDesc &Desc, uint64_t Imm, + const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP); void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printRegularOperand(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 61f673221739..fd65f95334f7 100644 --- 
a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -88,7 +88,7 @@ private: /// Encode an fp or int literal. std::optional<uint64_t> - getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo, + getLitEncoding(const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo, const MCSubtargetInfo &STI, bool HasMandatoryLiteral = false) const; @@ -219,8 +219,8 @@ static uint32_t getLit16IntEncoding(uint32_t Val, const MCSubtargetInfo &STI) { return getLit32Encoding(Val, STI); } -static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI, - bool IsFP) { +static uint32_t getLit64Encoding(const MCInstrDesc &Desc, uint64_t Val, + const MCSubtargetInfo &STI, bool IsFP) { uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val)); if (IntImm != 0) return IntImm; @@ -253,29 +253,27 @@ static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI, STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) return 248; - // The rest part needs to align with AMDGPUInstPrinter::printImmediate64. + // The rest part needs to align with AMDGPUInstPrinter::printLiteral64. + bool CanUse64BitLiterals = + STI.hasFeature(AMDGPU::Feature64BitLiterals) && + !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); if (IsFP) { - return STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Val) ? 254 - : 255; + return CanUse64BitLiterals && Lo_32(Val) ? 254 : 255; } - return STI.hasFeature(AMDGPU::Feature64BitLiterals) && - (!isInt<32>(Val) || !isUInt<32>(Val)) - ? 254 - : 255; + return CanUse64BitLiterals && (!isInt<32>(Val) || !isUInt<32>(Val)) ? 
254 + : 255; } std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( - const MCOperand &MO, const MCOperandInfo &OpInfo, + const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo, const MCSubtargetInfo &STI, bool HasMandatoryLiteral) const { + const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; int64_t Imm; if (MO.isExpr()) { if (!MO.getExpr()->evaluateAsAbsolute(Imm)) - return (STI.hasFeature(AMDGPU::Feature64BitLiterals) && - OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64) - ? 254 - : 255; + return AMDGPU::getOperandSize(OpInfo) == 8 ? 254 : 255; } else { assert(!MO.isDFPImm()); @@ -299,14 +297,14 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - return getLit64Encoding(static_cast<uint64_t>(Imm), STI, false); + return getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, false); case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - return getLit64Encoding(static_cast<uint64_t>(Imm), STI, true); + return getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, true); case AMDGPU::OPERAND_REG_IMM_FP64: { - auto Enc = getLit64Encoding(static_cast<uint64_t>(Imm), STI, true); + auto Enc = getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, true); return (HasMandatoryLiteral && Enc == 255) ? 254 : Enc; } @@ -405,7 +403,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) { assert((Encoding & 0xFF) == 0); Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) & - AMDGPU::HWEncoding::REG_IDX_MASK; + AMDGPU::HWEncoding::LO256_REG_IDX_MASK; } for (unsigned i = 0; i < bytes; i++) { @@ -447,7 +445,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, // Is this operand a literal immediate? 
const MCOperand &Op = MI.getOperand(i); - auto Enc = getLitEncoding(Op, Desc.operands()[i], STI); + auto Enc = getLitEncoding(Desc, Op, i, STI); if (!Enc || (*Enc != 255 && *Enc != 254)) continue; @@ -521,7 +519,7 @@ void AMDGPUMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, return; } else { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI); + auto Enc = getLitEncoding(Desc, MO, OpNo, STI); if (Enc && *Enc != 255) { Op = *Enc | SDWA9EncValues::SRC_SGPR_MASK; return; @@ -554,7 +552,7 @@ void AMDGPUMCCodeEmitter::getAVOperandEncoding( SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { MCRegister Reg = MI.getOperand(OpNo).getReg(); unsigned Enc = MRI.getEncodingValue(Reg); - unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + unsigned Idx = Enc & AMDGPU::HWEncoding::LO256_REG_IDX_MASK; bool IsVGPROrAGPR = Enc & (AMDGPU::HWEncoding::IS_VGPR | AMDGPU::HWEncoding::IS_AGPR); @@ -596,7 +594,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCSubtargetInfo &STI) const { if (MO.isReg()){ unsigned Enc = MRI.getEncodingValue(MO.getReg()); - unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + unsigned Idx = Enc & AMDGPU::HWEncoding::LO256_REG_IDX_MASK; bool IsVGPROrAGPR = Enc & (AMDGPU::HWEncoding::IS_VGPR | AMDGPU::HWEncoding::IS_AGPR); Op = Idx | (IsVGPROrAGPR << 8); @@ -659,7 +657,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128( const MCOperand &MO = MI.getOperand(OpNo); if (MO.isReg()) { uint16_t Encoding = MRI.getEncodingValue(MO.getReg()); - unsigned RegIdx = Encoding & AMDGPU::HWEncoding::REG_IDX_MASK; + unsigned RegIdx = Encoding & AMDGPU::HWEncoding::LO256_REG_IDX_MASK; bool IsHi = Encoding & AMDGPU::HWEncoding::IS_HI16; bool IsVGPR = Encoding & AMDGPU::HWEncoding::IS_VGPR; assert((!IsVGPR || isUInt<7>(RegIdx)) && "VGPR0-VGPR127 expected!"); @@ -695,11 +693,8 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon( const MCInstrDesc 
&Desc = MCII.get(MI.getOpcode()); uint32_t Offset = Desc.getSize(); assert(Offset == 4 || Offset == 8); - auto OpType = Desc.operands()[OpNo].OperandType; - MCFixupKind Kind = (STI.hasFeature(AMDGPU::Feature64BitLiterals) && - OpType == AMDGPU::OPERAND_REG_IMM_INT64) - ? FK_Data_8 - : FK_Data_4; + unsigned Size = AMDGPU::getOperandSize(Desc, OpNo); + MCFixupKind Kind = MCFixup::getDataKindForSize(Size); addFixup(Fixups, Offset, MO.getExpr(), Kind, PCRel); } @@ -707,8 +702,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon( if (AMDGPU::isSISrcOperand(Desc, OpNo)) { bool HasMandatoryLiteral = AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::imm); - if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI, - HasMandatoryLiteral)) { + if (auto Enc = getLitEncoding(Desc, MO, OpNo, STI, HasMandatoryLiteral)) { Op = *Enc; return; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index d66725d3a6c4..90c56f690146 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -21,9 +21,9 @@ #include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" @@ -130,31 +130,35 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, std::move(Emitter)); } -namespace { - -class AMDGPUMCInstrAnalysis : public MCInstrAnalysis { -public: - explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info) - : MCInstrAnalysis(Info) {} - - bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, - uint64_t &Target) const override { - if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() || - 
Info->get(Inst.getOpcode()).operands()[0].OperandType != - MCOI::OPERAND_PCREL) - return false; +namespace llvm { +namespace AMDGPU { + +bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, + uint64_t Size, + uint64_t &Target) const { + if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() || + Info->get(Inst.getOpcode()).operands()[0].OperandType != + MCOI::OPERAND_PCREL) + return false; + + int64_t Imm = Inst.getOperand(0).getImm(); + // Our branches take a simm16. + Target = SignExtend64<16>(Imm) * 4 + Addr + Size; + return true; +} - int64_t Imm = Inst.getOperand(0).getImm(); - // Our branches take a simm16. - Target = SignExtend64<16>(Imm) * 4 + Addr + Size; - return true; - } -}; +void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { + if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) + VgprMSBs = Inst.getOperand(0).getImm(); + else if (isTerminator(Inst)) + VgprMSBs = 0; +} -} // end anonymous namespace +} // end namespace AMDGPU +} // end namespace llvm static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) { - return new AMDGPUMCInstrAnalysis(Info); + return new AMDGPU::AMDGPUMCInstrAnalysis(Info); } extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 9c0b2da0fcb0..986388414096 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#include "llvm/MC/MCInstrAnalysis.h" #include <cstdint> #include <memory> @@ -44,6 +45,28 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, std::unique_ptr<MCObjectTargetWriter> createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend); + +namespace AMDGPU { 
+class AMDGPUMCInstrAnalysis : public MCInstrAnalysis { +private: + unsigned VgprMSBs = 0; + +public: + explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info) + : MCInstrAnalysis(Info) {} + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override; + + void resetState() override { VgprMSBs = 0; } + + void updateState(const MCInst &Inst, uint64_t Addr) override; + + unsigned getVgprMSBs() const { return VgprMSBs; } +}; + +} // namespace AMDGPU + } // namespace llvm #define GET_REGINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 0bbab29dbda1..ff6a21239345 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -448,11 +448,6 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); - if (isGFX1250(STI)) - PrintField(KD.kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, - amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, - ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index ff5321df6452..bf787b230067 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -420,7 +420,7 @@ class VSAMPLE_gfx12<int op, dag outs, int num_addrs, string dns="", } class MIMG_NoSampler_Helper <mimgopc op, string asm, - RegisterClass dst_rc, + RegisterOperand dst_rc, RegisterClass addr_rc, string dns=""> : MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> { @@ -433,10 +433,10 @@ class MIMG_NoSampler_Helper 
<mimgopc op, string asm, } class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm, - RegisterClass dst_rc, + RegisterOperand dst_rc, RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { + : MIMG_gfx90a <op.GFX10M, (outs getAlign2RegOp<dst_rc>.ret:$vdata), dns> { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), @@ -446,7 +446,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm, } class MIMG_NoSampler_gfx10<mimgopc op, string opcode, - RegisterClass DataRC, RegisterClass AddrRC, + RegisterOperand DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -458,7 +458,7 @@ class MIMG_NoSampler_gfx10<mimgopc op, string opcode, } class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, string dns=""> : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, @@ -471,7 +471,7 @@ class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode, } class MIMG_NoSampler_gfx11<mimgopc op, string opcode, - RegisterClass DataRC, RegisterClass AddrRC, + RegisterOperand DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -483,7 +483,7 @@ class MIMG_NoSampler_gfx11<mimgopc op, string opcode, } class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, string dns=""> : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, @@ -496,7 +496,7 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode, } class 
VIMAGE_NoSampler_gfx12<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, string dns=""> : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, @@ -507,7 +507,7 @@ class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode, #!if(BaseOpcode.HasD16, "$d16", ""); } -class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterClass DataRC, +class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, RegisterClass Addr3RC = VGPR_32, string dns=""> : VSAMPLE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns, Addr3RC> { @@ -544,7 +544,7 @@ class VSAMPLE_Sampler_nortn_gfx12<mimgopc op, string opcode, } multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, - RegisterClass dst_rc, bit enableDisasm, + RegisterOperand dst_rc, bit enableDisasm, bit ExtendedImageInst = 1, bit isVSample = 0> { let VAddrDwords = 1 in { @@ -578,7 +578,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, if op.HAS_GFX10M then { def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; if !not(ExtendedImageInst) then - def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>; + def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64_Align2>; def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>; def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>; } @@ -602,7 +602,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, if op.HAS_GFX10M then { def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; if !not(ExtendedImageInst) then - def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>; + def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96_Align2>; def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>; def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>; } @@ -626,7 +626,7 @@ multiclass MIMG_NoSampler_Src_Helper 
<mimgopc op, string asm, if op.HAS_GFX10M then { def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; if !not(ExtendedImageInst) then - def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>; + def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128_Align2>; def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>; def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4, !if(enableDisasm, "GFX10", "")>; @@ -664,20 +664,20 @@ multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0, let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), mayLoad = !not(isResInfo) in { let VDataDwords = 1 in - defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1, msaa>; + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_32, 1, msaa>; let VDataDwords = 2 in - defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0, msaa>; + defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_64, 0, msaa>; let VDataDwords = 3 in - defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0, msaa>; + defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_96, 0, msaa>; let VDataDwords = 4 in - defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0, msaa>; + defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_128, 0, msaa>; let VDataDwords = 5 in - defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0, msaa>; + defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_160, 0, msaa>; } } class MIMG_Store_Helper <mimgopc op, string asm, - RegisterClass data_rc, + RegisterOperand data_rc, RegisterClass addr_rc, string dns = ""> : MIMG_gfx6789<op.GFX10M, (outs), dns> { @@ -690,11 +690,11 @@ class MIMG_Store_Helper <mimgopc op, string asm, } class MIMG_Store_Helper_gfx90a <mimgopc op, string asm, - RegisterClass data_rc, + RegisterOperand data_rc, RegisterClass addr_rc, string dns = ""> : MIMG_gfx90a<op.GFX10M, (outs), dns> { - let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata, + let InOperandList = 
!con((ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), @@ -704,7 +704,7 @@ class MIMG_Store_Helper_gfx90a <mimgopc op, string asm, } class MIMG_Store_gfx10<mimgopc op, string opcode, - RegisterClass DataRC, RegisterClass AddrRC, + RegisterOperand DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx10<op.GFX10M, (outs), dns> { let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, @@ -716,7 +716,7 @@ class MIMG_Store_gfx10<mimgopc op, string opcode, } class MIMG_Store_nsa_gfx10<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, string dns=""> : MIMG_nsa_gfx10<op.GFX10M, (outs), num_addrs, dns> { let InOperandList = !con((ins DataRC:$vdata), @@ -730,7 +730,7 @@ class MIMG_Store_nsa_gfx10<mimgopc op, string opcode, } class MIMG_Store_gfx11<mimgopc op, string opcode, - RegisterClass DataRC, RegisterClass AddrRC, + RegisterOperand DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx11<op.GFX11, (outs), dns> { let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, @@ -742,7 +742,7 @@ class MIMG_Store_gfx11<mimgopc op, string opcode, } class MIMG_Store_nsa_gfx11<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, string dns=""> : MIMG_nsa_gfx11<op.GFX11, (outs), num_addrs, dns> { let InOperandList = !con((ins DataRC:$vdata), @@ -756,7 +756,7 @@ class MIMG_Store_nsa_gfx11<mimgopc op, string opcode, } class VIMAGE_Store_gfx12<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, string dns=""> : VIMAGE_gfx12<op.GFX12, (outs), num_addrs, dns> { let InOperandList = !con((ins DataRC:$vdata), @@ -769,7 +769,7 @@ class VIMAGE_Store_gfx12<mimgopc op, string opcode, } multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm, - RegisterClass data_rc, + 
RegisterOperand data_rc, bit enableDisasm> { let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0, DisableWQM = 1 in { @@ -797,7 +797,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm, let ssamp = 0 in { if op.HAS_GFX10M then { def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; - def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>; + def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64_Align2>; def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>; def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>; } @@ -814,7 +814,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm, let ssamp = 0 in { if op.HAS_GFX10M then { def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; - def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>; + def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96_Align2>; def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>; def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>; } @@ -831,7 +831,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm, let ssamp = 0 in { if op.HAS_GFX10M then { def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; - def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>; + def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128_Align2>; def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>; def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4, !if(enableDisasm, "GFX10", "")>; @@ -860,19 +860,19 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { let VDataDwords = 1 in - defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>; + defm _V1 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_32, 1>; let VDataDwords = 2 in - defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 0>; + defm _V2 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_64, 0>; let VDataDwords 
= 3 in - defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>; + defm _V3 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_96, 0>; let VDataDwords = 4 in - defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>; + defm _V4 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_128, 0>; let VDataDwords = 5 in - defm _V5 : MIMG_Store_Addr_Helper <op, asm, VReg_160, 0>; + defm _V5 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_160, 0>; } } -class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc, +class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc, RegisterClass addr_rc, string dns=""> : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { let Constraints = "$vdst = $vdata"; @@ -883,33 +883,33 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc, let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } -class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc, +class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc, RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> { + : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> { let Constraints = "$vdst = $vdata"; - let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata, + let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } -class MIMG_Atomic_si<mimgopc op, string asm, RegisterClass data_rc, +class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc, RegisterClass addr_rc, bit enableDasm = 0> : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } -class MIMG_Atomic_vi<mimgopc op, string asm, RegisterClass data_rc, +class 
MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc, RegisterClass addr_rc, bit enableDasm = 0> : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } -class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc, +class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc, RegisterClass addr_rc, bit enableDasm = 0> : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { let AssemblerPredicate = isGFX90APlus; @@ -917,7 +917,7 @@ class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc, } class MIMG_Atomic_gfx10<mimgopc op, string opcode, - RegisterClass DataRC, RegisterClass AddrRC, + RegisterOperand DataRC, RegisterClass AddrRC, bit enableDisasm = 0> : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst), !if(enableDisasm, "GFX10", "")> { @@ -930,7 +930,7 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode, } class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, bit enableDisasm = 0> : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs, !if(enableDisasm, "GFX10", "")> { @@ -945,7 +945,7 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, } class MIMG_Atomic_gfx11<mimgopc op, string opcode, - RegisterClass DataRC, RegisterClass AddrRC, + RegisterOperand DataRC, RegisterClass AddrRC, bit enableDisasm = 0> : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst), !if(enableDisasm, "GFX11", "")> { @@ -958,7 +958,7 @@ class MIMG_Atomic_gfx11<mimgopc op, string opcode, } class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, bit enableDisasm = 0> : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs, !if(enableDisasm, "GFX11", "")> { @@ -972,7 +972,7 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, let AsmString = 
opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } -class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC, +class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, string renamed, bit enableDisasm = 0> : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs, !if(enableDisasm, "GFX12", "")> { @@ -987,7 +987,7 @@ class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC, } multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, - RegisterClass data_rc, + RegisterOperand data_rc, bit enableDasm = 0, bit isFP = 0, string renamed = ""> { @@ -1022,7 +1022,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } if op.HAS_VI then { def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>; } if op.HAS_GFX10M then { def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; @@ -1044,7 +1044,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } if op.HAS_VI then { def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>; } if op.HAS_GFX10M then { def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; @@ -1066,7 +1066,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } if op.HAS_VI then { def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>; } if op.HAS_GFX10M then { def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; @@ -1105,19 +1105,19 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, // Other variants are 
reconstructed by disassembler using dmask and tfe. if !not(isCmpSwap) then { let VDataDwords = 1 in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, VGPR_32, 1, isFP, renamed>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>; } let VDataDwords = 2 in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_64, isCmpSwap, isFP, renamed>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_96, 0, isFP, renamed>; + defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_128, 0, isFP, renamed>; + defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_160, 0, isFP, renamed>; + defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>; } } } // End IsAtomicRet = 1 @@ -1127,7 +1127,7 @@ multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, bit isCmpSwap = 0, bit isFP = 0> : MIMG_Atomic <op, asm, isCmpSwap, isFP, renamed>; -class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, +class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterOperand dst_rc, RegisterClass src_rc, string dns=""> : MIMG_gfx6789 <op.VI, (outs dst_rc:$vdata), dns> { let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, @@ -1138,9 +1138,9 @@ class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, #!if(BaseOpcode.HasD16, "$d16", ""); } -class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc, +class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterOperand dst_rc, RegisterClass src_rc, string dns=""> - : MIMG_gfx90a<op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { + : MIMG_gfx90a<op.GFX10M, (outs 
dst_rc:$vdata), dns> { let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), @@ -1164,7 +1164,7 @@ class MIMG_Sampler_Asm_gfx10p<string opcode, string AsmPrefix, bit HasD16> { } class MIMG_Sampler_gfx10<mimgopc op, string opcode, - RegisterClass DataRC, RegisterClass AddrRC, + RegisterOperand DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> { let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret; @@ -1172,7 +1172,7 @@ class MIMG_Sampler_gfx10<mimgopc op, string opcode, } class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, string dns=""> : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = MIMG_Sampler_OpList_gfx10p<AddrIns, BaseOpcode.HasD16>.ret; @@ -1200,7 +1200,7 @@ class MIMG_Sampler_nortn_nsa_gfx10<mimgopc op, string opcode, } class MIMG_Sampler_gfx11<mimgopc op, string opcode, - RegisterClass DataRC, RegisterClass AddrRC, + RegisterOperand DataRC, RegisterClass AddrRC, string dns=""> : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> { let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret; @@ -1208,7 +1208,7 @@ class MIMG_Sampler_gfx11<mimgopc op, string opcode, } class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode, - RegisterClass DataRC, int num_addrs, + RegisterOperand DataRC, int num_addrs, RegisterClass LastVAddrSize, string dns=""> : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns, [], LastVAddrSize> { @@ -1345,7 +1345,7 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16, } multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm, - AMDGPUSampleVariant sample, RegisterClass dst_rc, + AMDGPUSampleVariant sample, RegisterOperand dst_rc, bit enableDisasm = 0, bit 
ExtendedImageInst = 1, bit isG16 = 0> { foreach addr = MIMG_Sampler_AddrSizes<sample, isG16>.MachineInstrs in { @@ -1473,15 +1473,15 @@ multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit isPointSamp let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, mayLoad = !not(isGetLod) in { let VDataDwords = 1 in - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1, ExtendedImageInst, isG16>; + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_32, 1, ExtendedImageInst, isG16>; let VDataDwords = 2 in - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 0, ExtendedImageInst, isG16>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_64, 0, ExtendedImageInst, isG16>; let VDataDwords = 3 in - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96, 0, ExtendedImageInst, isG16>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_96, 0, ExtendedImageInst, isG16>; let VDataDwords = 4 in - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 0, ExtendedImageInst, isG16>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_128, 0, ExtendedImageInst, isG16>; let VDataDwords = 5 in - defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160, 0, ExtendedImageInst, isG16>; + defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_160, 0, ExtendedImageInst, isG16>; } if !not(isGetLod) then @@ -1501,11 +1501,11 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0, let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, Gather4 = 1 in { let VDataDwords = 2 in - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, /*enableDisasm*/ true>; /* for packed D16 only */ + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_64, /*enableDisasm*/ true>; /* for packed D16 only */ let VDataDwords = 4 in - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_128>; let VDataDwords = 5 in - defm 
_V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>; + defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_160>; } } @@ -1632,13 +1632,13 @@ multiclass MIMG_MSAA_Load <mimgopc op, string asm> { let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), Gather4 = 1, hasPostISelHook = 0, mayLoad = 1 in { let VDataDwords = 2 in - defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0, 0, 1>; /* packed D16 */ + defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_64, 0, 0, 1>; /* packed D16 */ let VDataDwords = 3 in - defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0, 0, 1>; /* packed D16 + tfe */ + defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_96, 0, 0, 1>; /* packed D16 + tfe */ let VDataDwords = 4 in - defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1, 0, 1>; + defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_128, 1, 0, 1>; let VDataDwords = 5 in - defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0, 0, 1>; + defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_160, 0, 0, 1>; } } diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 8d27153fcfcd..3e256cce97af 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(-1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index f82bd55beccc..dda0cf6a3218 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -123,7 +123,6 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern, let HasNativeOperands = 1; let Op1 = 1; let ALUInst = 1; - let 
DisableEncoding = "$literal"; let UseNamedOperandTable = 1; let Inst{31-0} = Word0; @@ -161,7 +160,6 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern, let HasNativeOperands = 1; let Op2 = 1; let ALUInst = 1; - let DisableEncoding = "$literal"; let UseNamedOperandTable = 1; let Inst{31-0} = Word0; @@ -201,7 +199,6 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern, R600ALU_Word1_OP3<inst>{ let HasNativeOperands = 1; - let DisableEncoding = "$literal"; let Op3 = 1; let UseNamedOperandTable = 1; let ALUInst = 1; @@ -1783,7 +1780,7 @@ def : DwordAddrPat <i32, R600_Reg32>; def getLDSNoRetOp : InstrMapping { let FilterClass = "R600_LDS_1A1D"; let RowFields = ["BaseOp"]; - let ColFields = ["DisableEncoding"]; - let KeyCol = ["$dst"]; - let ValueCols = [[""""]]; + let ColFields = ["usesCustomInserter"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 268b153c6c92..ecc4659ee0e8 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -237,16 +237,16 @@ enum OperandType : unsigned { OPERAND_REG_INLINE_AC_FP32, OPERAND_REG_INLINE_AC_FP64, + // Operand for AV_MOV_B64_IMM_PSEUDO, which is a pair of 32-bit inline + // constants. Does not accept registers. + OPERAND_INLINE_C_AV64_PSEUDO, + // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, // Operand for SDWA instructions OPERAND_SDWA_VOPC_DST, - // Operand for AV_MOV_B64_IMM_PSEUDO, which is a pair of 32-bit inline - // constants. 
- OPERAND_INLINE_C_AV64_PSEUDO, - OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32, @@ -254,7 +254,7 @@ enum OperandType : unsigned { OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_FP64, OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT32, - OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_FP64, + OPERAND_REG_INLINE_AC_LAST = OPERAND_INLINE_C_AV64_PSEUDO, OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, @@ -354,10 +354,11 @@ enum : unsigned { // Register codes as defined in the TableGen's HWEncoding field. namespace HWEncoding { enum : unsigned { - REG_IDX_MASK = 0xff, - IS_VGPR = 1 << 8, - IS_AGPR = 1 << 9, - IS_HI16 = 1 << 10, + REG_IDX_MASK = 0x3ff, + LO256_REG_IDX_MASK = 0xff, + IS_VGPR = 1 << 10, + IS_AGPR = 1 << 11, + IS_HI16 = 1 << 12, }; } // namespace HWEncoding @@ -457,6 +458,8 @@ enum Id { // Message ID, width(4) [3:0]. ID_RTN_GET_TBA_TO_PC = 134, ID_RTN_GET_SE_AID_ID = 135, + ID_RTN_GET_CLUSTER_BARRIER_STATE = 136, // added in GFX1250 + ID_MASK_PreGFX11_ = 0xF, ID_MASK_GFX11Plus_ = 0xFF }; @@ -572,7 +575,17 @@ enum ModeRegisterMasks : uint32_t { GPR_IDX_EN_MASK = 1 << 27, VSKIP_MASK = 1 << 28, - CSP_MASK = 0x7u << 29 // Bits 29..31 + CSP_MASK = 0x7u << 29, // Bits 29..31 + + // GFX1250 + DST_VGPR_MSB = 1 << 12, + SRC0_VGPR_MSB = 1 << 13, + SRC1_VGPR_MSB = 1 << 14, + SRC2_VGPR_MSB = 1 << 15, + VGPR_MSB_MASK = 0xf << 12, // Bits 12..15 + + REPLAY_MODE = 1 << 25, + FLAT_SCRATCH_IS_NV = 1 << 26, }; } // namespace Hwreg diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index dce4e6f99300..6533d4c8eca3 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -627,6 +627,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); + // Instructions to re-legalize after changing register classes + SmallVector<MachineInstr 
*, 8> Relegalize; + for (MachineBasicBlock &MBB : MF) { for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { @@ -634,6 +637,11 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { switch (MI.getOpcode()) { default: + // scale_src has a register class restricted to low 256 VGPRs, changing + // registers to VGPR may not take it into account. + if (TII->isWMMA(MI) && + AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::scale_src0)) + Relegalize.push_back(&MI); continue; case AMDGPU::COPY: { const TargetRegisterClass *SrcRC, *DstRC; @@ -791,6 +799,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { for (auto *MI : PHINodes) { processPHINode(*MI); } + while (!Relegalize.empty()) + TII->legalizeOperands(*Relegalize.pop_back_val(), MDT); + if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge) hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 962c276bc212..5297816ec1f2 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -173,6 +173,7 @@ struct FoldCandidate { class SIFoldOperandsImpl { public: + MachineFunction *MF; MachineRegisterInfo *MRI; const SIInstrInfo *TII; const SIRegisterInfo *TRI; @@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { } MachineOperand *New = Fold.Def.OpToFold; + + // Verify the register is compatible with the operand. 
+ if (const TargetRegisterClass *OpRC = + TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) { + const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg()); + const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg()); + unsigned NewSubReg = New->getSubReg(); + unsigned OldSubReg = Old.getSubReg(); + + const TargetRegisterClass *ConstrainRC = OpRC; + if (NewSubReg && OldSubReg) { + unsigned PreA, PreB; + ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC, + NewSubReg, PreA, PreB); + } else if (OldSubReg) { + ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg); + } else if (NewSubReg) { + ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg); + } + + if (!ConstrainRC) + return false; + + if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) { + LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI) + << TRI->getRegClassName(ConstrainRC) << '\n'); + return false; + } + } + // Rework once the VS_16 register class is updated to include proper // 16-bit SGPRs instead of 32-bit ones. 
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg())) @@ -1248,6 +1279,7 @@ void SIFoldOperandsImpl::foldOperand( if (FoldingImmLike && UseMI->isCopy()) { Register DestReg = UseMI->getOperand(0).getReg(); Register SrcReg = UseMI->getOperand(1).getReg(); + unsigned UseSubReg = UseMI->getOperand(1).getSubReg(); assert(SrcReg.isVirtual()); const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); @@ -1259,63 +1291,74 @@ void SIFoldOperandsImpl::foldOperand( return; const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg); - if (!DestReg.isPhysical() && DestRC == &AMDGPU::AGPR_32RegClass) { - std::optional<int64_t> UseImmVal = OpToFold.getEffectiveImmVal(); - if (UseImmVal && TII->isInlineConstant( - *UseImmVal, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64)); - UseMI->getOperand(1).ChangeToImmediate(*UseImmVal); - CopiesToReplace.push_back(UseMI); - return; + // In order to fold immediates into copies, we need to change the copy to a + // MOV. Find a compatible mov instruction with the value. + for (unsigned MovOp : + {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64, + AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64, + AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO, + AMDGPU::AV_MOV_B64_IMM_PSEUDO}) { + const MCInstrDesc &MovDesc = TII->get(MovOp); + assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1); + + const TargetRegisterClass *MovDstRC = + TRI->getRegClass(MovDesc.operands()[0].RegClass); + + // Fold if the destination register class of the MOV instruction (ResRC) + // is a superclass of (or equal to) the destination register class of the + // COPY (DestRC). If this condition fails, folding would be illegal. + if (!DestRC->hasSuperClassEq(MovDstRC)) + continue; + + const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 
2 : 1; + const TargetRegisterClass *MovSrcRC = + TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass); + if (MovSrcRC) { + if (UseSubReg) + MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); + if (!MRI->constrainRegClass(SrcReg, MovSrcRC)) + break; + + // FIXME: This is mutating the instruction only and deferring the actual + // fold of the immediate + } else { + // For the _IMM_PSEUDO cases, there can be value restrictions on the + // immediate to verify. Technically we should always verify this, but it + // only matters for these concrete cases. + // TODO: Handle non-imm case if it's useful. + if (!OpToFold.isImm() || + !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal())) + break; } - } - // Allow immediates COPYd into sgpr_lo16 to be further folded while - // still being legal if not further folded - if (DestRC == &AMDGPU::SGPR_LO16RegClass) { - assert(ST->useRealTrue16Insts()); - MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass); - DestRC = &AMDGPU::SGPR_32RegClass; + MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin(); + MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end(); + while (ImpOpI != ImpOpE) { + MachineInstr::mop_iterator Tmp = ImpOpI; + ImpOpI++; + UseMI->removeOperand(UseMI->getOperandNo(Tmp)); + } + UseMI->setDesc(MovDesc); + + if (MovOp == AMDGPU::V_MOV_B16_t16_e64) { + const auto &SrcOp = UseMI->getOperand(UseOpIdx); + MachineOperand NewSrcOp(SrcOp); + MachineFunction *MF = UseMI->getParent()->getParent(); + UseMI->removeOperand(1); + UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers + UseMI->addOperand(NewSrcOp); // src0 + UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel + UseOpIdx = SrcIdx; + UseOp = &UseMI->getOperand(UseOpIdx); + } + CopiesToReplace.push_back(UseMI); + break; } - // In order to fold immediates into copies, we need to change the - // copy to a MOV. 
- - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - return; - - // Fold if the destination register class of the MOV instruction (ResRC) - // is a superclass of (or equal to) the destination register class of the - // COPY (DestRC). If this condition fails, folding would be illegal. - const MCInstrDesc &MovDesc = TII->get(MovOp); - assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1); - const TargetRegisterClass *ResRC = - TRI->getRegClass(MovDesc.operands()[0].RegClass); - if (!DestRC->hasSuperClassEq(ResRC)) + // We failed to replace the copy, so give up. + if (UseMI->getOpcode() == AMDGPU::COPY) return; - MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin(); - MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end(); - while (ImpOpI != ImpOpE) { - MachineInstr::mop_iterator Tmp = ImpOpI; - ImpOpI++; - UseMI->removeOperand(UseMI->getOperandNo(Tmp)); - } - UseMI->setDesc(TII->get(MovOp)); - - if (MovOp == AMDGPU::V_MOV_B16_t16_e64) { - const auto &SrcOp = UseMI->getOperand(UseOpIdx); - MachineOperand NewSrcOp(SrcOp); - MachineFunction *MF = UseMI->getParent()->getParent(); - UseMI->removeOperand(1); - UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers - UseMI->addOperand(NewSrcOp); // src0 - UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel - UseOpIdx = 2; - UseOp = &UseMI->getOperand(UseOpIdx); - } - CopiesToReplace.push_back(UseMI); } else { if (UseMI->isCopy() && OpToFold.isReg() && UseMI->getOperand(0).getReg().isVirtual() && @@ -1430,30 +1473,9 @@ void SIFoldOperandsImpl::foldOperand( return; } - if (!FoldingImmLike) { - if (OpToFold.isReg() && ST->needsAlignedVGPRs()) { - // Don't fold if OpToFold doesn't hold an aligned register. 
- const TargetRegisterClass *RC = - TRI->getRegClassForReg(*MRI, OpToFold.getReg()); - assert(RC); - if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) { - unsigned SubReg = OpToFold.getSubReg(); - if (const TargetRegisterClass *SubRC = - TRI->getSubRegisterClass(RC, SubReg)) - RC = SubRC; - } - - if (!RC || !TRI->isProperlyAlignedRC(*RC)) - return; - } - - tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunities. The shrink operands pass - // already does this. - return; - } + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunities. The shrink operands pass + // already does this. tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold); } @@ -1931,8 +1953,10 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const { // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() // later, create a copy here and track if we already have such a copy. 
- if (TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg) != - VGPRUseSubRC) { + const TargetRegisterClass *SubRC = + TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg); + if (!VGPRUseSubRC->hasSubClassEq(SubRC)) { + // TODO: Try to reconstrain class VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC); BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def); B.addReg(VGPRCopy); @@ -2748,6 +2772,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) { } bool SIFoldOperandsImpl::run(MachineFunction &MF) { + this->MF = &MF; MRI = &MF.getRegInfo(); ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 9b348d46fec4..ce25bf499c41 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1132,9 +1132,18 @@ void SIFrameLowering::emitCSRSpillRestores( RestoreWWMRegisters(WWMCalleeSavedRegs); // The original EXEC is the first operand of the return instruction. - const MachineInstr &Return = MBB.instr_back(); - assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN && - "Unexpected return inst"); + MachineInstr &Return = MBB.instr_back(); + unsigned Opcode = Return.getOpcode(); + switch (Opcode) { + case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: + Opcode = AMDGPU::SI_RETURN; + break; + case AMDGPU::SI_TCRETURN_GFX_WholeWave: + Opcode = AMDGPU::SI_TCRETURN_GFX; + break; + default: + llvm_unreachable("Unexpected return inst"); + } Register OrigExec = Return.getOperand(0).getReg(); if (!WWMScratchRegs.empty()) { @@ -1148,6 +1157,11 @@ void SIFrameLowering::emitCSRSpillRestores( // Restore original EXEC. unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec); + + // Drop the first operand and update the opcode. 
+ Return.removeOperand(0); + Return.setDesc(TII->get(Opcode)); + return; } @@ -1728,7 +1742,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, "Whole wave functions can use the reg mapped for their i1 argument"); // FIXME: Be more efficient! - for (MCRegister Reg : AMDGPU::VGPR_32RegClass) + unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256; + for (MCRegister Reg : + AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs)) if (MF.getRegInfo().isPhysRegModified(Reg)) { MFI->reserveWWMRegister(Reg); MF.begin()->addLiveIn(Reg); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 66c1dfc71c2f..2a977247bc2c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1263,22 +1263,61 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { static unsigned getIntrMemWidth(unsigned IntrID) { switch (IntrID) { case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: case Intrinsic::amdgcn_global_store_async_from_lds_b8: return 8; case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: return 32; case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: + case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: return 64; case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: case Intrinsic::amdgcn_global_store_async_from_lds_b128: + case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: + case 
Intrinsic::amdgcn_cooperative_atomic_store_8x16B: return 128; default: llvm_unreachable("Unknown width"); } } +static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, + TargetLoweringBase::IntrinsicInfo &Info) { + Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2); + unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue(); + switch (AtomicOrderingCABI(Ord)) { + case AtomicOrderingCABI::acquire: + Info.order = AtomicOrdering::Acquire; + break; + case AtomicOrderingCABI::release: + Info.order = AtomicOrdering::Release; + break; + case AtomicOrderingCABI::seq_cst: + Info.order = AtomicOrdering::SequentiallyConsistent; + break; + default: + Info.order = AtomicOrdering::Monotonic; + break; + } + + Info.flags = + (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore); + Info.flags |= MOCooperative; + + MDNode *ScopeMD = cast<MDNode>( + cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata()); + StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString(); + Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope); +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1506,6 +1545,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_load_monitor_b32: case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_monitor_b128: + case Intrinsic::amdgcn_cluster_load_b32: + case Intrinsic::amdgcn_cluster_load_b64: + case Intrinsic::amdgcn_cluster_load_b128: case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1525,6 +1567,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad; return true; } + case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: + case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: { + Info.opc = 
ISD::INTRINSIC_W_CHAIN; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info); + return true; + } + case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: + case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(0); + Info.align.reset(); + getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info); + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1553,7 +1615,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_load_async_to_lds_b8: case Intrinsic::amdgcn_global_load_async_to_lds_b32: case Intrinsic::amdgcn_global_load_async_to_lds_b64: - case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); Info.ptrVal = CI.getArgOperand(1); @@ -1636,6 +1702,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Value *Ptr = nullptr; switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_cond_sub_u32: + case Intrinsic::amdgcn_cluster_load_b128: + case Intrinsic::amdgcn_cluster_load_b64: + case Intrinsic::amdgcn_cluster_load_b32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1678,6 +1747,10 @@ bool SITargetLowering::getAddrModeArguments(const 
IntrinsicInst *II, case Intrinsic::amdgcn_global_load_async_to_lds_b32: case Intrinsic::amdgcn_global_load_async_to_lds_b64: case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: + case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default: @@ -4260,6 +4333,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, break; } + // If the caller is a whole wave function, we need to use a special opcode + // so we can patch up EXEC. + if (Info->isWholeWaveFunction()) + OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave; + return DAG.getNode(OPC, DL, MVT::Other, Ops); } @@ -5192,7 +5270,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, return LoopBB; } -static uint32_t getIdentityValueForWaveReduction(unsigned Opc) { +static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI, + MachineBasicBlock *BB) { + // For targets older than GFX12, we emit a sequence of 32-bit operations. + // For GFX12, we emit s_add_u64 and s_sub_u64. + MachineFunction *MF = BB->getParent(); + const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &Src0 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(2); + bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); + if (ST.hasScalarAddSub64()) { + unsigned Opc = IsAdd ? 
AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; + // clang-format off + BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) + .add(Src0) + .add(Src1); + // clang-format on + } else { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *BoolRC = TRI->getBoolRC(); + + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); + + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); + + unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned HiOpc = IsAdd ? 
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0); + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + } + MI.eraseFromParent(); + return BB; +} + +static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { switch (Opc) { case AMDGPU::S_MIN_U32: return std::numeric_limits<uint32_t>::max(); @@ -5210,10 +5339,42 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) { case AMDGPU::S_AND_B32: return std::numeric_limits<uint32_t>::max(); default: - llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction"); + llvm_unreachable( + "Unexpected opcode in getIdentityValueFor32BitWaveReduction"); } } +static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) { + switch (Opc) { + case AMDGPU::V_CMP_LT_U64_e64: // umin.u64 + return std::numeric_limits<uint64_t>::max(); + case AMDGPU::V_CMP_LT_I64_e64: // min.i64 + return std::numeric_limits<int64_t>::max(); + case AMDGPU::V_CMP_GT_U64_e64: // umax.u64 + return std::numeric_limits<uint64_t>::min(); + case AMDGPU::V_CMP_GT_I64_e64: // max.i64 + return std::numeric_limits<int64_t>::min(); + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: + case AMDGPU::S_OR_B64: + case AMDGPU::S_XOR_B64: + return std::numeric_limits<uint64_t>::min(); + case AMDGPU::S_AND_B64: + return std::numeric_limits<uint64_t>::max(); + default: + llvm_unreachable( + "Unexpected opcode in getIdentityValueFor64BitWaveReduction"); + } +} + +static bool is32bitWaveReduceOperation(unsigned Opc) { + return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 || + Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 || + Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 || + Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 || + Opc == 
AMDGPU::S_XOR_B32; +} + static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, @@ -5241,53 +5402,99 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, RetBB = &BB; break; } + case AMDGPU::V_CMP_LT_U64_e64: // umin + case AMDGPU::V_CMP_LT_I64_e64: // min + case AMDGPU::V_CMP_GT_U64_e64: // umax + case AMDGPU::V_CMP_GT_I64_e64: // max + case AMDGPU::S_AND_B64: + case AMDGPU::S_OR_B64: { + // Idempotent operations. + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg); + RetBB = &BB; + break; + } case AMDGPU::S_XOR_B32: + case AMDGPU::S_XOR_B64: case AMDGPU::S_ADD_I32: - case AMDGPU::S_SUB_I32: { + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_I32: + case AMDGPU::S_SUB_U64_PSEUDO: { const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); - Register ActiveLanes = MRI.createVirtualRegister(DstRegClass); + Register NumActiveLanes = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); bool IsWave32 = ST.isWave32(); unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned CountReg = + unsigned BitCountOpc = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64; - auto Exec = - BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg); + BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg); - auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes) - .addReg(Exec->getOperand(0).getReg()); + auto NewAccumulator = + BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes) + .addReg(ExecMask); switch (Opc) { - case AMDGPU::S_XOR_B32: { + case AMDGPU::S_XOR_B32: + case AMDGPU::S_XOR_B64: { // Performing an XOR operation on a uniform value // depends on the parity of the number of active lanes. 
// For even parity, the result will be 0, for odd // parity the result will be the same as the input value. - Register ParityRegister = MRI.createVirtualRegister(DstRegClass); - - auto ParityReg = - BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister) - .addReg(NewAccumulator->getOperand(0).getReg()) - .addImm(1); - BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) - .addReg(SrcReg) - .addReg(ParityReg->getOperand(0).getReg()); + Register ParityRegister = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister) + .addReg(NewAccumulator->getOperand(0).getReg()) + .addImm(1) + .setOperandDead(3); // Dead scc + if (Opc == AMDGPU::S_XOR_B32) { + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) + .addReg(SrcReg) + .addReg(ParityRegister); + } else { + Register DestSub0 = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *SrcSubRC = + TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0); + + MachineOperand Op1L = TII->buildExtractSubRegOrImm( + MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC); + MachineOperand Op1H = TII->buildExtractSubRegOrImm( + MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC); + + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0) + .add(Op1L) + .addReg(ParityRegister); + + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1) + .add(Op1H) + .addReg(ParityRegister); + + BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + } break; } case AMDGPU::S_SUB_I32: { Register NegatedVal = MRI.createVirtualRegister(DstRegClass); // Take the negation of the source operand. 
- auto InvertedValReg = - BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal) - .addImm(-1) - .addReg(SrcReg); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal) + .addImm(0) + .addReg(SrcReg); BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg) - .addReg(InvertedValReg->getOperand(0).getReg()) + .addReg(NegatedVal) .addReg(NewAccumulator->getOperand(0).getReg()); break; } @@ -5297,6 +5504,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addReg(NewAccumulator->getOperand(0).getReg()); break; } + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: { + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Op1H_Op0L_Reg = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Op1L_Op0H_Reg = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register NegatedValLo = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register NegatedValHi = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *Src1SubRC = + TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0); + + MachineOperand Op1L = TII->buildExtractSubRegOrImm( + MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC); + MachineOperand Op1H = TII->buildExtractSubRegOrImm( + MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC); + + if (Opc == AMDGPU::S_SUB_U64_PSEUDO) { + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo) + .addImm(0) + .addReg(NewAccumulator->getOperand(0).getReg()) + .setOperandDead(3); // Dead scc + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi) + .addReg(NegatedValLo) + .addImm(31) + .setOperandDead(3); // Dead scc + BuildMI(BB, MI, DL, 
TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg) + .add(Op1L) + .addReg(NegatedValHi); + } + Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO + ? NegatedValLo + : NewAccumulator->getOperand(0).getReg(); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0) + .add(Op1L) + .addReg(LowOpcode); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg) + .add(Op1L) + .addReg(LowOpcode); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg) + .add(Op1H) + .addReg(LowOpcode); + + Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1; + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal) + .addReg(CarryReg) + .addReg(Op1H_Op0L_Reg) + .setOperandDead(3); // Dead scc + + if (Opc == AMDGPU::S_SUB_U64_PSEUDO) { + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1) + .addReg(HiVal) + .addReg(Op1L_Op0H_Reg) + .setOperandDead(3); // Dead scc + } + BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + break; + } } RetBB = &BB; } @@ -5313,6 +5589,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, // so that we will get the next active lane for next iteration. 
MachineBasicBlock::iterator I = BB.end(); Register SrcReg = MI.getOperand(1).getReg(); + bool is32BitOpc = is32bitWaveReduceOperation(Opc); // Create Control flow for loop // Split MI's Machine Basic block into For loop @@ -5322,73 +5599,160 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass); - Register InitalValReg = MRI.createVirtualRegister(DstRegClass); - + Register IdentityValReg = MRI.createVirtualRegister(DstRegClass); Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); - - Register FF1Reg = MRI.createVirtualRegister(DstRegClass); - Register LaneValueReg = - MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register LaneValueReg = MRI.createVirtualRegister(DstRegClass); bool IsWave32 = ST.isWave32(); - unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; unsigned ExecReg = IsWave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; // Create initial values of induction variable from Exec, Accumulator and // insert branch instr to newly created ComputeBlock - uint32_t InitalValue = getIdentityValueForWaveReduction(Opc); - auto TmpSReg = - BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); - BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) - .addImm(InitalValue); + BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg); + if (is32BitOpc) { + uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc); + BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg) + .addImm(IdentityValue); + } else { + uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc); + BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg) + .addImm(IdentityValue); + } // clang-format off BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)) .addMBB(ComputeLoop); // clang-format on // Start constructing ComputeLoop - I = ComputeLoop->end(); + I = ComputeLoop->begin(); auto Accumulator = BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) - .addReg(InitalValReg) + .addReg(IdentityValReg) .addMBB(&BB); auto ActiveBits = BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) - .addReg(TmpSReg->getOperand(0).getReg()) + .addReg(LoopIterator) .addMBB(&BB); + I = ComputeLoop->end(); + MachineInstr *NewAccumulator; // Perform the computations unsigned SFFOpc = IsWave32 ? 
AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; - auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) - .addReg(ActiveBits->getOperand(0).getReg()); - auto LaneValue = BuildMI(*ComputeLoop, I, DL, - TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) - .addReg(SrcReg) - .addReg(FF1->getOperand(0).getReg()); - auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) - .addReg(Accumulator->getOperand(0).getReg()) - .addReg(LaneValue->getOperand(0).getReg()); - + BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) + .addReg(ActiveBitsReg); + if (is32BitOpc) { + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), + LaneValueReg) + .addReg(SrcReg) + .addReg(FF1Reg); + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValueReg); + } else { + Register LaneValueLoReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValueHiReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *SrcSubRC = + TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0); + MachineOperand Op1L = TII->buildExtractSubRegOrImm( + MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC); + MachineOperand Op1H = TII->buildExtractSubRegOrImm( + MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC); + // lane value input should be in an sgpr + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), + LaneValueLoReg) + .add(Op1L) + .addReg(FF1Reg); + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), + LaneValueHiReg) + .add(Op1H) + .addReg(FF1Reg); + auto LaneValue = BuildMI(*ComputeLoop, I, DL, + TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg) + .addReg(LaneValueLoReg) + .addImm(AMDGPU::sub0) + .addReg(LaneValueHiReg) + .addImm(AMDGPU::sub1); + switch (Opc) { + case 
AMDGPU::S_OR_B64: + case AMDGPU::S_AND_B64: + case AMDGPU::S_XOR_B64: { + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValue->getOperand(0).getReg()) + .setOperandDead(3); // Dead scc + break; + } + case AMDGPU::V_CMP_GT_I64_e64: + case AMDGPU::V_CMP_GT_U64_e64: + case AMDGPU::V_CMP_LT_I64_e64: + case AMDGPU::V_CMP_LT_U64_e64: { + Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass); + Register ComparisonResultReg = + MRI.createVirtualRegister(WaveMaskRegClass); + const TargetRegisterClass *VregClass = TRI->getVGPR64Class(); + const TargetRegisterClass *VSubRegClass = + TRI->getSubRegisterClass(VregClass, AMDGPU::sub0); + Register AccumulatorVReg = MRI.createVirtualRegister(VregClass); + MachineOperand SrcReg0Sub0 = + TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0), + VregClass, AMDGPU::sub0, VSubRegClass); + MachineOperand SrcReg0Sub1 = + TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0), + VregClass, AMDGPU::sub1, VSubRegClass); + BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), + AccumulatorVReg) + .add(SrcReg0Sub0) + .addImm(AMDGPU::sub0) + .add(SrcReg0Sub1) + .addImm(AMDGPU::sub1); + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg) + .addReg(LaneValue->getOperand(0).getReg()) + .addReg(AccumulatorVReg); + + unsigned AndOpc = IsWave32 ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg) + .addReg(LaneMaskReg) + .addReg(ActiveBitsReg); + + NewAccumulator = BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::S_CSELECT_B64), DstReg) + .addReg(LaneValue->getOperand(0).getReg()) + .addReg(Accumulator->getOperand(0).getReg()); + break; + } + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: { + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValue->getOperand(0).getReg()); + ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop); + break; + } + } + } // Manipulate the iterator to get the next active lane unsigned BITSETOpc = IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; - auto NewActiveBits = - BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) - .addReg(FF1->getOperand(0).getReg()) - .addReg(ActiveBits->getOperand(0).getReg()); + BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) + .addReg(FF1Reg) + .addReg(ActiveBitsReg); // Add phi nodes - Accumulator.addReg(NewAccumulator->getOperand(0).getReg()) - .addMBB(ComputeLoop); - ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()) - .addMBB(ComputeLoop); + Accumulator.addReg(DstReg).addMBB(ComputeLoop); + ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop); // Creating branching unsigned CMPOpc = IsWave32 ? 
AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) - .addReg(NewActiveBits->getOperand(0).getReg()) + .addReg(NewActiveBitsReg) .addImm(0); BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) .addMBB(ComputeLoop); @@ -5410,22 +5774,40 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, switch (MI.getOpcode()) { case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); + case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64); case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32); + case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); + case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64); case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32); + case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); + case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); + case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); + case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64); case 
AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64); case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32); + case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64); case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { const DebugLoc &DL = MI.getDebugLoc(); @@ -5452,55 +5834,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: { - // For targets older than GFX12, we emit a sequence of 32-bit operations. - // For GFX12, we emit s_add_u64 and s_sub_u64. - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const DebugLoc &DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); - MachineOperand &Src0 = MI.getOperand(1); - MachineOperand &Src1 = MI.getOperand(2); - bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); - if (Subtarget->hasScalarAddSub64()) { - unsigned Opc = IsAdd ? 
AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; - // clang-format off - BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) - .add(Src0) - .add(Src1); - // clang-format on - } else { - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const TargetRegisterClass *BoolRC = TRI->getBoolRC(); - - Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - - MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( - MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); - MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( - MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - - MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( - MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); - MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( - MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - - unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; - unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) - .add(Src0Sub0) - .add(Src1Sub0); - BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) - .add(Src0Sub1) - .add(Src1Sub1); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - } - MI.eraseFromParent(); - return BB; + return Expand64BitScalarArithmetic(MI, BB); } case AMDGPU::V_ADD_U64_PSEUDO: case AMDGPU::V_SUB_U64_PSEUDO: { @@ -6023,14 +6357,15 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); return SplitBB; } + case AMDGPU::SI_TCRETURN_GFX_WholeWave: case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: { assert(MFI->isWholeWaveFunction()); // During ISel, it's difficult to propagate the original EXEC mask to use as // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead. 
MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent()); - Register OriginalExec = Setup->getOperand(0).getReg(); assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC"); + Register OriginalExec = Setup->getOperand(0).getReg(); MF->getRegInfo().clearKillFlags(OriginalExec); MI.getOperand(0).setReg(OriginalExec); return BB; @@ -10246,6 +10581,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } + case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: + case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: { + MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op); + SDValue Chain = Op->getOperand(0); + SDValue Ptr = Op->getOperand(2); + EVT VT = Op->getValueType(0); + return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT, + Chain, Ptr, MII->getMemOperand()); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = @@ -10421,41 +10766,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE; return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); } - case Intrinsic::amdgcn_s_barrier: - case Intrinsic::amdgcn_s_barrier_signal: - case Intrinsic::amdgcn_s_barrier_wait: { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { - unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; - if (WGSize <= ST.getWavefrontSize()) { - // If the workgroup fits in a wave, remove s_barrier_signal and lower - // s_barrier/s_barrier_wait to wave_barrier. 
- if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal) - return Op.getOperand(0); - else - return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, - MVT::Other, Op.getOperand(0)), - 0); - } - } - - if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { - // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait - SDValue K = - DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); - SDValue BarSignal = - SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, - MVT::Other, K, Op.getOperand(0)), - 0); - SDValue BarWait = - SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K, - BarSignal.getValue(0)), - 0); - return BarWait; - } - - return SDValue(); - }; case Intrinsic::amdgcn_struct_tbuffer_store: case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { @@ -10913,6 +11223,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: + case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: + case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: { + MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op); + SDValue Chain = Op->getOperand(0); + SDValue Ptr = Op->getOperand(2); + SDValue Val = Op->getOperand(3); + return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val, + Ptr, MII->getMemOperand()); + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -16933,10 +17253,12 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, switch (BitWidth) { case 16: RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass - : &AMDGPU::VGPR_32RegClass; + : &AMDGPU::VGPR_32_Lo256RegClass; break; default: - RC = TRI->getVGPRClassForBitWidth(BitWidth); + RC = Subtarget->has1024AddressableVGPRs() + ? 
TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth) + : TRI->getVGPRClassForBitWidth(BitWidth); if (!RC) return std::pair(0U, nullptr); break; @@ -16980,7 +17302,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint); if (Kind != '\0') { if (Kind == 'v') { - RC = &AMDGPU::VGPR_32RegClass; + RC = &AMDGPU::VGPR_32_Lo256RegClass; } else if (Kind == 's') { RC = &AMDGPU::SGPR_32RegClass; } else if (Kind == 'a') { @@ -17022,6 +17344,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, return std::pair(0U, nullptr); if (Idx < RC->getNumRegs()) return std::pair(RC->getRegister(Idx), RC); + return std::pair(0U, nullptr); } } @@ -17808,11 +18131,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) { !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS); } +static TargetLowering::AtomicExpansionKind +getPrivateAtomicExpansionKind(const GCNSubtarget &STI) { + // For GAS, lower to flat atomic. + return STI.hasGloballyAddressableScratch() + ? TargetLowering::AtomicExpansionKind::CustomExpand + : TargetLowering::AtomicExpansionKind::NotAtomic; +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); if (AS == AMDGPUAS::PRIVATE_ADDRESS) - return AtomicExpansionKind::NotAtomic; + return getPrivateAtomicExpansionKind(*getSubtarget()); // 64-bit flat atomics that dynamically reside in private memory will silently // be dropped. 
@@ -17823,7 +18154,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (AS == AMDGPUAS::FLAT_ADDRESS && DL.getTypeSizeInBits(RMW->getType()) == 64 && flatInstrMayAccessPrivate(RMW)) - return AtomicExpansionKind::Expand; + return AtomicExpansionKind::CustomExpand; auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) { OptimizationRemarkEmitter ORE(RMW->getFunction()); @@ -17898,7 +18229,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // does. InstCombine transforms these with 0 to or, so undo that. if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); ConstVal && ConstVal->isNullValue()) - return AtomicExpansionKind::Expand; + return AtomicExpansionKind::CustomExpand; } // If the allocation could be in remote, fine-grained memory, the rmw @@ -18027,9 +18358,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // fadd. if (Subtarget->hasLDSFPAtomicAddF32()) { if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) - return AtomicExpansionKind::Expand; + return AtomicExpansionKind::CustomExpand; if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) - return AtomicExpansionKind::Expand; + return AtomicExpansionKind::CustomExpand; } } } @@ -18083,14 +18414,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS - ? AtomicExpansionKind::NotAtomic + ? getPrivateAtomicExpansionKind(*getSubtarget()) : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS - ? AtomicExpansionKind::NotAtomic + ? 
getPrivateAtomicExpansionKind(*getSubtarget()) : AtomicExpansionKind::None; } @@ -18098,7 +18429,7 @@ TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { unsigned AddrSpace = CmpX->getPointerAddressSpace(); if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) - return AtomicExpansionKind::NotAtomic; + return getPrivateAtomicExpansionKind(*getSubtarget()); if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX)) return AtomicExpansionKind::None; @@ -18109,7 +18440,7 @@ SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { // If a 64-bit flat atomic may alias private, we need to avoid using the // atomic in the private case. - return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand + return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand : AtomicExpansionKind::None; } @@ -18468,9 +18799,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate( Builder.CreateBr(ExitBB); } +static void convertScratchAtomicToFlatAtomic(Instruction *I, + unsigned PtrOpIdx) { + Value *PtrOp = I->getOperand(PtrOpIdx); + assert(PtrOp->getType()->getPointerAddressSpace() == + AMDGPUAS::PRIVATE_ADDRESS); + + Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS); + Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast", + I->getIterator()); + I->setOperand(PtrOpIdx, ASCast); +} + void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { AtomicRMWInst::BinOp Op = AI->getOperation(); + if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex()); + if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor) { if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand()); @@ -18493,9 +18839,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { } void 
SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const { + if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex()); + emitExpandAtomicAddrSpacePredicate(CI); } +void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const { + if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex()); + + llvm_unreachable( + "Expand Atomic Load only handles SCRATCH -> FLAT conversion"); +} + +void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const { + if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex()); + + llvm_unreachable( + "Expand Atomic Store only handles SCRATCH -> FLAT conversion"); +} + LoadInst * SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { IRBuilder<> Builder(AI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index dedd9ae17077..728c6490bdfd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -14,8 +14,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H #define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H -#include "AMDGPUISelLowering.h" #include "AMDGPUArgumentUsageInfo.h" +#include "AMDGPUISelLowering.h" #include "llvm/CodeGen/MachineFunction.h" namespace llvm { @@ -562,6 +562,8 @@ public: void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const; void emitExpandAtomicRMW(AtomicRMWInst *AI) const override; void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override; + void emitExpandAtomicLoad(LoadInst *LI) const override; + void emitExpandAtomicStore(StoreInst *SI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index e3a2efdd3856..b163a274396f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -152,7 +152,7 @@ static constexpr StringLiteral WaitEventTypeName[] = { // We reserve a fixed number of VGPR slots in the scoring tables for // special tokens like SCMEM_LDS (needed for buffer load to LDS). enum RegisterMapping { - SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets. + SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets. AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets. SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets. // Artificial register slots to track LDS writes into specific LDS locations @@ -831,7 +831,6 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); - assert(isUInt<8>(RegIdx)); const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); unsigned Size = TRI->getRegSizeInBits(*RC); @@ -839,7 +838,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits if (TRI->isVectorRegister(*MRI, Op.getReg())) { unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 
1 : 0); - assert(Reg < AGPR_OFFSET); + assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET); Result.first = Reg; if (TRI->isAGPR(*MRI, Op.getReg())) Result.first += AGPR_OFFSET; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 69708c47f6c9..398c99b3bd12 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -62,8 +62,8 @@ static cl::opt<bool> Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), - RI(ST), ST(ST) { + : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + RI(ST), ST(ST) { SchedModel.init(&ST); } @@ -2493,7 +2493,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } - case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: case AMDGPU::SI_RETURN: { const MachineFunction *MF = MBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); @@ -3444,12 +3443,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { case AMDGPU::V_ACCVGPR_READ_B32_e64: case AMDGPU::V_ACCVGPR_MOV_B32: case AMDGPU::AV_MOV_B32_IMM_PSEUDO: - return true; case AMDGPU::AV_MOV_B64_IMM_PSEUDO: - // TODO: We could fold this, but it's a strange case. The immediate value - // can't be directly folded into any real use. We would have to spread new - // immediate legality checks around and only accept subregister extracts for - // profitability. 
+ return true; default: return false; } @@ -3559,13 +3554,12 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const { - if (!MRI->hasOneNonDBGUse(Reg)) - return false; - int64_t Imm; if (!getConstValDefinedInReg(DefMI, Reg, Imm)) return false; + const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg); + assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form"); unsigned Opc = UseMI.getOpcode(); @@ -3577,6 +3571,25 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg); + if (HasMultipleUses) { + // TODO: This should fold in more cases with multiple use, but we need to + // more carefully consider what those uses are. + unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg)); + + // Avoid breaking up a 64-bit inline immediate into a subregister extract. + if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64) + return false; + + // Most of the time folding a 32-bit inline constant is free (though this + // might not be true if we can't later fold it into a real user). + // + // FIXME: This isInlineConstant check is imprecise if + // getConstValDefinedInReg handled the tricky non-mov cases. 
+ if (ImmDefSize == 32 && + !isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32)) + return false; + } + bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister && RI.getSubRegIdxSize(UseSubReg) == 16; @@ -3664,6 +3677,9 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return true; } + if (HasMultipleUses) + return false; + if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || @@ -4572,34 +4588,43 @@ static bool compareMachineOp(const MachineOperand &Op0, } } -bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, - const MachineOperand &MO) const { - const MCInstrDesc &InstDesc = MI.getDesc(); - const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; - - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); - +bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc, + const MCOperandInfo &OpInfo) const { if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) return true; - if (OpInfo.RegClass < 0) + if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) return false; - if (MO.isImm() && isInlineConstant(MO, OpInfo)) { - if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && - OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::src2)) + if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo)) + return true; + + return ST.hasVOP3Literal(); +} + +bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, + int64_t ImmVal) const { + const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; + if (isInlineConstant(ImmVal, OpInfo.OperandType)) { + if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() && + OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(), + AMDGPU::OpName::src2)) return false; return RI.opCanUseInlineConstant(OpInfo.OperandType); } - if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) - return 
false; + return isLiteralOperandLegal(InstDesc, OpInfo); +} - if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) - return true; +bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, + const MachineOperand &MO) const { + if (MO.isImm()) + return isImmOperandLegal(InstDesc, OpNo, MO.getImm()); - return ST.hasVOP3Literal(); + assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) && + "unexpected imm-like operand kind"); + const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; + return isLiteralOperandLegal(InstDesc, OpInfo); } bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const { @@ -4759,6 +4784,31 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, return Inst32; } +bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const { + // Null is free + Register Reg = RegOp.getReg(); + if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64) + return false; + + // SGPRs use the constant bus + + // FIXME: implicit registers that are not part of the MCInstrDesc's implicit + // physical register operands should also count, except for exec. + if (RegOp.isImplicit()) + return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0; + + // SGPRs use the constant bus + return AMDGPU::SReg_32RegClass.contains(Reg) || + AMDGPU::SReg_64RegClass.contains(Reg); +} + +bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp, + const MachineRegisterInfo &MRI) const { + Register Reg = RegOp.getReg(); + return Reg.isVirtual() ? 
RI.isSGPRClass(MRI.getRegClass(Reg)) + : physRegUsesConstantBus(RegOp); +} + bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const { @@ -4766,23 +4816,9 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, if (!MO.isReg()) return !isInlineConstant(MO, OpInfo); - if (!MO.isUse()) - return false; - - if (MO.getReg().isVirtual()) - return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); - - // Null is free - if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64) - return false; - - // SGPRs use the constant bus - if (MO.isImplicit()) { - return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || - MO.getReg() == AMDGPU::VCC_LO; - } - return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || - AMDGPU::SReg_64RegClass.contains(MO.getReg()); + Register Reg = MO.getReg(); + return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg)) + : physRegUsesConstantBus(MO); } static Register findImplicitSGPRRead(const MachineInstr &MI) { @@ -4933,7 +4969,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, int RegClass = Desc.operands()[i].RegClass; - switch (Desc.operands()[i].OperandType) { + const MCOperandInfo &OpInfo = Desc.operands()[i]; + switch (OpInfo.OperandType) { case MCOI::OPERAND_REGISTER: if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { ErrInfo = "Illegal immediate value for operand."; @@ -4941,15 +4978,31 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } break; case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_IMM_BF16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2BF16: + break; 
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + break; break; + case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_INT32: - case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: - case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { @@ -4965,6 +5018,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } break; + case AMDGPU::OPERAND_INPUT_MODS: + case AMDGPU::OPERAND_SDWA_VOPC_DST: + case AMDGPU::OPERAND_KIMM16: + break; case MCOI::OPERAND_IMMEDIATE: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM64: @@ -4976,9 +5033,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Expected immediate, but got non-immediate"; return false; } - [[fallthrough]]; + break; + case MCOI::OPERAND_UNKNOWN: + case MCOI::OPERAND_MEMORY: + case MCOI::OPERAND_PCREL: + break; default: - continue; + if (OpInfo.isGenericType()) + continue; + break; } if (!MO.isReg()) @@ -4991,7 +5054,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // aligned register constraint. 
// FIXME: We do not verify inline asm operands, but custom inline asm // verification is broken anyway - if (ST.needsAlignedVGPRs()) { + if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) { const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { if (const TargetRegisterClass *SubRC = @@ -5912,13 +5975,12 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, - const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable) { - if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && + if ((IsAllocatable || !ST.hasGFX90AInsts()) && (((TID.mayLoad() || TID.mayStore()) && !(TID.TSFlags & SIInstrFlags::Spill)) || - (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { + (TID.TSFlags & SIInstrFlags::MIMG))) { switch (RCID) { case AMDGPU::AV_32RegClassID: RCID = AMDGPU::VGPR_32RegClassID; @@ -5953,44 +6015,31 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, if (OpNum >= TID.getNumOperands()) return nullptr; auto RegClass = TID.operands()[OpNum].RegClass; - bool IsAllocatable = false; - if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { - // vdst and vdata should be both VGPR or AGPR, same for the DS instructions - // with two data operands. Request register class constrained to VGPR only - // of both operands present as Machine Copy Propagation can not check this - // constraint and possibly other passes too. - // - // The check is limited to FLAT and DS because atomics in non-flat encoding - // have their vdst and vdata tied to be the same register. - const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, - AMDGPU::OpName::vdst); - const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, - (TID.TSFlags & SIInstrFlags::DS) ? 
AMDGPU::OpName::data0 - : AMDGPU::OpName::vdata); - if (DataIdx != -1) { - IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand( - TID.Opcode, AMDGPU::OpName::data1); - } + if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO) { + // Special pseudos have no alignment requirement + return RI.getRegClass(RegClass); } - return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass, - IsAllocatable); + + return adjustAllocatableRegClass(ST, RI, TID, RegClass, false); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); const MCInstrDesc &Desc = get(MI.getOpcode()); if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || Desc.operands()[OpNo].RegClass == -1) { Register Reg = MI.getOperand(OpNo).getReg(); - if (Reg.isVirtual()) + if (Reg.isVirtual()) { + const MachineRegisterInfo &MRI = + MI.getParent()->getParent()->getRegInfo(); return MRI.getRegClass(Reg); + } return RI.getPhysRegBaseClass(Reg); } unsigned RCID = Desc.operands()[OpNo].RegClass; - return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true); + return adjustAllocatableRegClass(ST, RI, Desc, RCID, true); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6224,15 +6273,14 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, continue; const MachineOperand &Op = MI.getOperand(i); if (Op.isReg()) { - RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); - if (!SGPRsUsed.count(SGPR) && - // FIXME: This can access off the end of the operands() array. 
- usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) { - if (--ConstantBusLimit <= 0) - return false; - SGPRsUsed.insert(SGPR); + if (Op.isUse()) { + RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); + if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) { + if (--ConstantBusLimit <= 0) + return false; + } } - } else if (AMDGPU::isSISrcOperand(InstDesc, i) && + } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) && !isInlineConstant(Op, InstDesc.operands()[i])) { // The same literal may be used multiple times. if (!UsedLiteral) @@ -6526,6 +6574,21 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg())) legalizeOpWithMove(MI, VOP3Idx[2]); + if (isWMMA(MI)) { + // scale_src has a register class restricted to low 256 VGPRs, we may need + // to insert a copy to the restricted VGPR class. + int ScaleSrc0Idx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0); + if (ScaleSrc0Idx != -1) { + int ScaleSrc1Idx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1); + if (!isOperandLegal(MI, ScaleSrc0Idx)) + legalizeOpWithMove(MI, ScaleSrc0Idx); + if (!isOperandLegal(MI, ScaleSrc1Idx)) + legalizeOpWithMove(MI, ScaleSrc1Idx); + } + } + // Fix the register class of packed FP32 instructions on gfx12+. See // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information. if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) { @@ -8036,12 +8099,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, MRI.replaceRegWith(DstReg, NewDstReg); MRI.clearKillFlags(NewDstReg); Inst.getOperand(0).setReg(DstReg); - // Make sure we don't leave around a dead VGPR->SGPR copy. Normally - // these are deleted later, but at -O0 it would leave a suspicious - // looking illegal copy of an undef register. 
- for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) - Inst.removeOperand(I); - Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); + Inst.eraseFromParent(); // Legalize t16 operand since replaceReg is called after addUsersToVALU for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(NewDstReg))) { @@ -9235,6 +9293,9 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const { + if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES) + return nullptr; + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); if (Idx == -1) return nullptr; @@ -9532,6 +9593,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { { {MONoClobber, "amdgpu-noclobber"}, {MOLastUse, "amdgpu-last-use"}, + {MOCooperative, "amdgpu-cooperative"}, }; return ArrayRef(TargetFlags); @@ -10219,7 +10281,7 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - unsigned opcode = MI.getOpcode(); + unsigned Opcode = MI.getOpcode(); auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); @@ -10239,7 +10301,7 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { // If the target supports globally addressable scratch, the mapping from // scratch memory to the flat aperture changes therefore an address space cast // is no longer uniform. 
- if (opcode == TargetOpcode::G_ADDRSPACE_CAST) + if (Opcode == TargetOpcode::G_ADDRSPACE_CAST) return HandleAddrSpaceCast(MI); if (auto *GI = dyn_cast<GIntrinsic>(&MI)) { @@ -10267,7 +10329,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { // // All other loads are not divergent, because if threads issue loads with the // same arguments, they will always get the same result. - if (opcode == AMDGPU::G_LOAD) { + if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD || + Opcode == AMDGPU::G_SEXTLOAD) { if (MI.memoperands_empty()) return InstructionUniformity::NeverUniform; // conservative assumption @@ -10281,10 +10344,10 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::Default; } - if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) || - opcode == AMDGPU::G_ATOMIC_CMPXCHG || - opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS || - AMDGPU::isGenericAtomic(opcode)) { + if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) || + Opcode == AMDGPU::G_ATOMIC_CMPXCHG || + Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS || + AMDGPU::isGenericAtomic(Opcode)) { return InstructionUniformity::NeverUniform; } return InstructionUniformity::Default; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index fdbd9ce4a66b..f7dde2b90b68 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -48,6 +48,10 @@ static const MachineMemOperand::Flags MONoClobber = static const MachineMemOperand::Flags MOLastUse = MachineMemOperand::MOTargetFlag2; +/// Mark the MMO of cooperative load/store atomics. +static const MachineMemOperand::Flags MOCooperative = + MachineMemOperand::MOTargetFlag3; + /// Utility to store machine instructions worklist. 
struct SIInstrWorklist { SIInstrWorklist() = default; @@ -533,13 +537,13 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VOP2; } - static bool isVOP3(const MachineInstr &MI) { - return MI.getDesc().TSFlags & SIInstrFlags::VOP3; + static bool isVOP3(const MCInstrDesc &Desc) { + return Desc.TSFlags & SIInstrFlags::VOP3; } - bool isVOP3(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP3; - } + static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); } + + bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); } static bool isSDWA(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SDWA; @@ -841,13 +845,13 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VINTRP; } - static bool isMAI(const MachineInstr &MI) { - return MI.getDesc().TSFlags & SIInstrFlags::IsMAI; + static bool isMAI(const MCInstrDesc &Desc) { + return Desc.TSFlags & SIInstrFlags::IsMAI; } - bool isMAI(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::IsMAI; - } + static bool isMAI(const MachineInstr &MI) { return isMAI(MI.getDesc()); } + + bool isMAI(uint16_t Opcode) const { return isMAI(get(Opcode)); } static bool isMFMA(const MachineInstr &MI) { return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && @@ -983,13 +987,19 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform; } - bool isBarrier(unsigned Opcode) const { + // Check to see if opcode is for a barrier start. 
Pre gfx12 this is just the + // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want + // to check for the barrier start (S_BARRIER_SIGNAL*) + bool isBarrierStart(unsigned Opcode) const { return Opcode == AMDGPU::S_BARRIER || Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 || Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 || Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || - Opcode == AMDGPU::S_BARRIER_WAIT || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM; + } + + bool isBarrier(unsigned Opcode) const { + return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT || Opcode == AMDGPU::S_BARRIER_INIT_M0 || Opcode == AMDGPU::S_BARRIER_INIT_IMM || Opcode == AMDGPU::S_BARRIER_JOIN_IMM || @@ -1045,6 +1055,8 @@ public: return AMDGPU::S_WAIT_DSCNT; case AMDGPU::S_WAIT_KMCNT_soft: return AMDGPU::S_WAIT_KMCNT; + case AMDGPU::S_WAIT_XCNT_soft: + return AMDGPU::S_WAIT_XCNT; default: return Opcode; } @@ -1174,9 +1186,20 @@ public: return isInlineConstant(*MO.getParent(), MO.getOperandNo()); } - bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, + bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const; + bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, + const MCOperandInfo &OpInfo) const; + + bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, + int64_t ImmVal) const; + + bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, + const MachineOperand &MO) const { + return isImmOperandLegal(MI.getDesc(), OpNo, MO); + } + /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO. bool isLegalAV64PseudoImm(uint64_t Imm) const; @@ -1184,6 +1207,10 @@ public: /// This function will return false if you pass it a 32-bit instruction. 
bool hasVALU32BitEncoding(unsigned Opcode) const; + bool physRegUsesConstantBus(const MachineOperand &Reg) const; + bool regUsesConstantBus(const MachineOperand &Reg, + const MachineRegisterInfo &MRI) const; + /// Returns true if this operand uses the constant bus. bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 0374526e35c4..aa5dae09ca18 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1806,15 +1806,15 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { VOPDstOperand_t16Lo128), VOPDstOperand<VGPR_32>); RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>, - !eq(VT.Size, 512) : VOPDstOperand<VReg_512>, - !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, - !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, - !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, + !eq(VT.Size, 512) : VOPDstOperand<VReg_512>, + !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, + !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, + !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, !eq(VT.Size, 96) : VOPDstOperand<VReg_96>, - !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, - !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, - !eq(VT.Size, 16) : op16, - 1 : VOPDstS64orS32); // else VT == i1 + !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, + !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, + !eq(VT.Size, 16) : op16, + 1 : VOPDstS64orS32); // else VT == i1 } class getVALUDstForVT_fake16<ValueType VT> { @@ -1898,7 +1898,7 @@ class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> { !eq(VT.Size, 64) : RegisterOperand<VReg_64>, !eq(VT.Size, 48) : RegisterOperand<VReg_64>, !eq(VT.Size, 16) : !if(IsTrue16, - !if(IsFake16, VGPRSrc_32_Lo128, VGPRSrc_16_Lo128), + !if(IsFake16, VGPROp_32_Lo128, VGPROp_16_Lo128), RegisterOperand<VGPR_32>), 1 : RegisterOperand<VGPR_32>); } @@ -1950,6 +1950,20 @@ class 
getVOP3VRegSrcForVT<ValueType VT> { 1 : VRegSrc_32); } +// VGPR only VOP3 src with 8 bit encoding e.g. VOP3DPP src0. +class getVGPRSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> { + RegisterOperand ret = + !cond(!eq(VT.Size, 128) : VGPROp_128, + !eq(VT.Size, 96) : VGPROp_96, + !eq(VT.Size, 64) : VGPROp_64, + !eq(VT.Size, 48) : VGPROp_64, + !eq(VT.Size, 16) : !if(IsTrue16, + !if(IsFake16, VGPROp_32, + VGPROp_16), + VGPROp_32), + 1 : VGPROp_32); +} + // Src2 of VOP3 DPP instructions cannot be a literal class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { RegisterOperand ret = @@ -2578,22 +2592,50 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret); } -// Return an AGPR+VGPR operand class for the given VGPR register class. -class getLdStRegisterOperand<RegisterClass RC> { - // This type of operands is only used in pseudo instructions helping - // code generation and thus doesn't need encoding and decoding methods. - // It also doesn't need to support AGPRs, because GFX908/A/40 do not - // support True16. 
- defvar VLdSt_16 = RegisterOperand<VGPR_16>; +class getAlign2RegOp<RegisterOperand RC> { + RegisterOperand ret = + !cond(!eq(RC, VGPROp_16) : VGPROp_16, + !eq(RC, VGPROp_32) : VGPROp_32, + !eq(RC, VGPROp_64) : VGPROp_64_Align2, + !eq(RC, VGPROp_64_Align1) : VGPROp_64_Align2, + !eq(RC, VGPROp_96) : VGPROp_96_Align2, + !eq(RC, VGPROp_96_Align1) : VGPROp_96_Align2, + !eq(RC, VGPROp_128) : VGPROp_128_Align2, + !eq(RC, VGPROp_128_Align1) : VGPROp_128_Align2, + !eq(RC, VGPROp_160) : VGPROp_160_Align2, + !eq(RC, VGPROp_160_Align1) : VGPROp_160_Align2, + !eq(RC, VGPROp_1024) : VGPROp_1024_Align2, + !eq(RC, VGPROp_1024_Align1) : VGPROp_1024_Align2, + !eq(RC, AVLdSt_32) : AVLdSt_32, + !eq(RC, AVLdSt_64) : AVLdSt_64_Align2, + !eq(RC, AVLdSt_96) : AVLdSt_96_Align2, + !eq(RC, AVLdSt_96_Align1) : AVLdSt_96_Align2, + !eq(RC, AVLdSt_128) : AVLdSt_128_Align2, + !eq(RC, AVLdSt_128_Align1) : AVLdSt_128_Align2, + !eq(RC, AVLdSt_160) : AVLdSt_160_Align2, + !eq(RC, AVLdSt_160_Align1) : AVLdSt_160_Align2); +} + +class getEquivalentAGPROperand<RegisterOperand RC> { + defvar Size = RC.RegClass.Size; + RegisterOperand ret = + !cond(!eq(Size, 32) : RegisterOperand<AGPR_32>, + !eq(Size, 64) : RegisterOperand<AReg_64>, + !eq(Size, 96) : RegisterOperand<AReg_96>, + !eq(Size, 128) : RegisterOperand<AReg_128>, + !eq(Size, 160) : RegisterOperand<AReg_160>, + !eq(Size, 1024) : RegisterOperand<AReg_1024>); +} +class getEquivalentVGPROperand<RegisterOperand RC> { + defvar Size = RC.RegClass.Size; RegisterOperand ret = - !cond(!eq(RC.Size, 16) : VLdSt_16, - !eq(RC.Size, 32) : AVLdSt_32, - !eq(RC.Size, 64) : AVLdSt_64, - !eq(RC.Size, 96) : AVLdSt_96, - !eq(RC.Size, 128) : AVLdSt_128, - !eq(RC.Size, 160) : AVLdSt_160, - !eq(RC.Size, 1024) : AVLdSt_1024); + !cond(!eq(Size, 32) : RegisterOperand<VGPR_32>, + !eq(Size, 64) : RegisterOperand<VReg_64>, + !eq(Size, 96) : RegisterOperand<VReg_96>, + !eq(Size, 128) : RegisterOperand<VReg_128>, + !eq(Size, 160) : RegisterOperand<VReg_160>, + !eq(Size, 1024) : 
RegisterOperand<VReg_1024>); } class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32, @@ -2643,7 +2685,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field RegisterOperand Src0DPP = getVregSrcForVT<Src0VT>.ret; field RegisterOperand Src1DPP = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src2DPP = getVregSrcForVT<Src2VT>.ret; - field RegisterOperand Src0VOP3DPP = VGPRSrc_32; + field RegisterOperand Src0VOP3DPP = getVGPRSrcForVT<Src0VT>.ret; field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret; field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret; field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; @@ -2859,7 +2901,7 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> { let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret; - let Src0VOP3DPP = !if (!eq(Src0VT.Size, 16), VGPRSrc_16, VGPRSrc_32); + let Src0VOP3DPP = !if (!eq(Src0VT.Size, 16), VGPROp_16, VGPROp_32); let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0 /*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0 /*IsFake16*/>.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e8b450122673..1f7951258c21 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -66,7 +66,7 @@ defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; // Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 let OtherPredicates = [isNotGFX90APlus] in { -let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { +let Constraints = "$src0 = $vdst" in { defm V_INTERP_P2_F32 : VINTRP_m < 0x00000001, @@ -77,7 +77,7 @@ defm V_INTERP_P2_F32 : VINTRP_m < [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc, (i32 timm:$attrchan), (i32 
timm:$attr), M0))]>; -} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst" +} // End Constraints = "$src0 = $vdst" defm V_INTERP_MOV_F32 : VINTRP_m < 0x00000002, @@ -326,28 +326,57 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>; // clang-format off -defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_"; + multiclass - AMDGPUWaveReducePseudoGenerator<string Op, string DataType> { + AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> { let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { def !toupper(Op) #"_PSEUDO_" #DataType - : VPseudoInstSI<(outs SGPR_32 : $sdst), - (ins VSrc_b32 : $src, VSrc_b32 : $strategy), - [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {} + : VPseudoInstSI<(outs RetReg : $sdst), + (ins Reg : $src, VSrc_b32 : $strategy), + [(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {} } } // clang-format on +class WaveReduceOp<string OpName, string TypeStr, ValueType Ty, + RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> { + string Name = OpName; + string TypeString = TypeStr; + ValueType VT = Ty; + RegisterClass RetReg = ReturnRegisterClass; + SrcRegOrImm9 Reg = RC; +} + // Input list : [Operation_name, -// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)] +// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B), +// bit-width +// output register class, +// input register class] defvar Operations = [ - ["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"], - ["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"], - ["xor", "B32"] + WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>, + 
WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>, + WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>, + + WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>, + WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>, ]; foreach Op = Operations in { - defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>; + defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString, + Op.VT, Op.RetReg, Op.Reg>; } let usesCustomInserter = 1, Defs = [VCC] in { @@ -692,6 +721,33 @@ def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI < def : GCNPat< (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>; +// Restores the previous EXEC and otherwise behaves entirely like a SI_TCRETURN. +// This is used for tail calls *from* a whole wave function. Tail calls to +// a whole wave function may use the usual opcodes, depending on the calling +// convention of the caller. +def SI_TCRETURN_GFX_WholeWave : SPseudoInstSI < + (outs), + (ins SReg_1:$orig_exec, Gfx_CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff)> { + let isCall = 1; + let isTerminator = 1; + let isReturn = 1; + let isBarrier = 1; + let UseNamedOperandTable = 1; + let SchedRW = [WriteBranch]; + let isConvergent = 1; + + // We're going to use custom handling to set the $orig_exec to the correct value. 
+ let usesCustomInserter = 1; +} + +// Generate a SI_TCRETURN_GFX_WholeWave pseudo with a placeholder for its +// argument. It will be filled in by the custom inserter. +def : GCNPat< + (AMDGPUtc_return_gfx_ww i64:$src0, tglobaladdr:$callee, i32:$fpdiff), + (SI_TCRETURN_GFX_WholeWave (i1 (IMPLICIT_DEF)), Gfx_CCR_SGPR_64:$src0, + tglobaladdr:$callee, i32:$fpdiff)>; + + // Return for returning shaders to a shader variant epilog. def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -2174,7 +2230,8 @@ def : GCNPat < } foreach fp16vt = [f16, bf16] in { - +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < (fcopysign fp16vt:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) @@ -2205,6 +2262,42 @@ def : GCNPat < (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) >; +} +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat < + (fcopysign fp16vt:$src0, fp16vt:$src1), + (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16), + (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16) +>; + +def : GCNPat < + (fcopysign f32:$src0, fp16vt:$src1), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, + (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)) +>; + +def : GCNPat < + (fcopysign f64:$src0, fp16vt:$src1), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), + (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1) +>; + +def : GCNPat < + (fcopysign fp16vt:$src0, f32:$src1), + (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)), + (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16) +>; + +def : GCNPat < + (fcopysign fp16vt:$src0, 
f64:$src1), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16), + (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) +>; +} } // End foreach fp16vt = [f16, bf16] @@ -2480,6 +2573,38 @@ def : AMDGPUPatIgnoreCopies < (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; +// (z & ~x) +def : AMDGPUPatIgnoreCopies < + (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)), + (V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z) +>; + +// 64-bit version +def : AMDGPUPatIgnoreCopies < + (DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)), + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) +>; + +// (y | ~x) +def : AMDGPUPatIgnoreCopies < + (DivergentBinFrag<or> i32:$y, (not_oneuse i32:$x)), + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, (i32 -1)) +>; + +// 64-bit version +def : AMDGPUPatIgnoreCopies < + (DivergentBinFrag<or> i64:$y, (not_oneuse i64:$x)), + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), (i32 -1)), sub0, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), (i32 -1)), sub1) +>; + // SHA-256 Ch function // z ^ (x & (y ^ z)) def : AMDGPUPatIgnoreCopies < @@ -3096,6 +3221,11 @@ def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), (COPY VSrc_b16:$src) >; + +def : GCNPat < + (i1 (DivergentUnaryFrag<trunc> i16:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) +>; } let True16Predicate = UseRealTrue16Insts in { @@ -3106,15 +3236,18 @@ def : GCNPat< def : GCNPat< (i64 (DivergentUnaryFrag<zext> i16:$src)), - (REG_SEQUENCE VReg_64, - (INSERT_SUBREG (i32 (V_MOV_B32_e32 (i32 0))), VGPR_16:$src, lo16), sub0, - (S_MOV_B32 (i32 0)), sub1) + (REG_SEQUENCE 
VReg_64, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16, (V_MOV_B32_e32 (i32 0)), sub1) >; def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16) >; + +def : GCNPat < + (i1 (DivergentUnaryFrag<trunc> i16:$a)), + (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0)) +>; } def : GCNPat < @@ -3143,11 +3276,6 @@ def : GCNPat < (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; -def : GCNPat < - (i1 (DivergentUnaryFrag<trunc> i16:$a)), - (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) ->; - def IMMBitSelConst : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N), MVT::i32); @@ -3637,13 +3765,24 @@ def : GCNPat < >; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in +let True16Predicate = p in { // Take the lower 16 bits from each VGPR_32 and concat them def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))), (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) >; +// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] +// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) +def : GCNPat < + (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), + (Ty !if(!eq(Ty, i16), + (Ty (trunc (srl VGPR_32:$b, (i32 16)))), + (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) +>; +} + let True16Predicate = UseRealTrue16Insts in { def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))), @@ -3669,18 +3808,6 @@ def : GCNPat < (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b) >; - -// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] -// Special case, can use V_BFI (0xffff literal likely more reusable than 
0x70601000) -def : GCNPat < - (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), - (Ty !if(!eq(Ty, i16), - (Ty (trunc (srl VGPR_32:$b, (i32 16)))), - (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), - (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) ->; - - // Take the upper 16 bits from V[0] and the lower 16 bits from V[1] // Special case, can use V_ALIGNBIT (always uses encoded literal) let True16Predicate = NotHasTrue16BitInsts in { @@ -3752,7 +3879,8 @@ def : GCNPat < (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1)) >; - +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < (v2f16 (scalar_to_vector f16:$src0)), (COPY $src0) @@ -3772,6 +3900,29 @@ def : GCNPat < (v4f16 (scalar_to_vector f16:$src0)), (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) >; +} + +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat < + (v2f16 (scalar_to_vector f16:$src0)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16) +>; + +def : GCNPat < + (v2i16 (scalar_to_vector i16:$src0)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16) +>; + +def : GCNPat < + (v4i16 (scalar_to_vector i16:$src0)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1) +>; + +def : GCNPat < + (v4f16 (scalar_to_vector f16:$src0)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1) +>; +} def : GCNPat < (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 6f2ea8ad1ff0..69d02e7c2934 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -119,7 +119,7 @@ class SILoadStoreOptimizer { unsigned DMask; InstClassEnum InstClass; unsigned CPol = 0; - bool IsAGPR; + const TargetRegisterClass *DataRC; bool UseST64; int 
AddrIdx[MaxAddressRegs]; const MachineOperand *AddrReg[MaxAddressRegs]; @@ -203,6 +203,7 @@ class SILoadStoreOptimizer { using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; private: + MachineFunction *MF = nullptr; const GCNSubtarget *STM = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; @@ -245,6 +246,8 @@ private: unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; + unsigned getWrite2Opcode(const CombineInfo &CI) const; + MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore); @@ -846,7 +849,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if (InstClass == UNKNOWN) return; - IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); + DataRC = LSO.getDataRegClass(*MI); switch (InstClass) { case DS_READ: @@ -1313,6 +1316,50 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // have already been confirmed to be mergeable. if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) offsetsCanBeCombined(CI, *STM, Paired, true); + + if (CI.InstClass == DS_WRITE) { + // Both data operands must be AGPR or VGPR, so the data registers needs to + // be constrained to one or the other. We expect to only emit the VGPR form + // here for now. + // + // FIXME: There is currently a hack in getRegClass to report that the write2 + // operands are VGPRs. In the future we should have separate agpr + // instruction definitions. 
+ const MachineOperand *Data0 = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); + const MachineOperand *Data1 = + TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); + + const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI)); + int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), + AMDGPU::OpName::data0); + int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), + AMDGPU::OpName::data1); + + const TargetRegisterClass *DataRC0 = + TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF); + + const TargetRegisterClass *DataRC1 = + TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF); + + if (unsigned SubReg = Data0->getSubReg()) { + DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), + DataRC0, SubReg); + } + + if (unsigned SubReg = Data1->getSubReg()) { + DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()), + DataRC1, SubReg); + } + + if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) || + !MRI->constrainRegClass(Data1->getReg(), DataRC1)) + return nullptr; + + // TODO: If one register can be constrained, and not the other, insert a + // copy. + } + return Where; } @@ -1462,6 +1509,10 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { : AMDGPU::DS_WRITE2ST64_B64_gfx9; } +unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const { + return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); +} + MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { @@ -1478,8 +1529,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( unsigned NewOffset0 = CI.Offset; unsigned NewOffset1 = Paired.Offset; - unsigned Opc = - CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); + unsigned Opc = getWrite2Opcode(CI); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. 
@@ -2032,6 +2082,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, } } + // FIXME: This should compute the instruction to use, and then use the result + // of TII->getRegClass. unsigned BitWidth = 32 * (CI.Width + Paired.Width); return TRI->isAGPRClass(getDataRegClass(*CI.I)) ? TRI->getAGPRClassForBitWidth(BitWidth) @@ -2400,7 +2452,6 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, std::list<std::list<CombineInfo> > &MergeableInsts) const { for (std::list<CombineInfo> &AddrList : MergeableInsts) { if (AddrList.front().InstClass == CI.InstClass && - AddrList.front().IsAGPR == CI.IsAGPR && AddrList.front().hasSameBaseAddress(CI)) { AddrList.emplace_back(CI); return; @@ -2465,16 +2516,6 @@ SILoadStoreOptimizer::collectMergeableInsts( if (!CI.hasMergeableAddress(*MRI)) continue; - if (CI.InstClass == DS_WRITE && CI.IsAGPR) { - // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data - // operands. However we are reporting that ds_write2 shall have - // only VGPR data so that machine copy propagation does not - // create an illegal instruction with a VGPR and AGPR sources. - // Consequenctially if we create such instruction the verifier - // will complain. - continue; - } - LLVM_DEBUG(dbgs() << "Mergeable: " << MI); addInstToMergeableList(CI, MergeableInsts); @@ -2647,6 +2688,7 @@ bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) { } bool SILoadStoreOptimizer::run(MachineFunction &MF) { + this->MF = &MF; STM = &MF.getSubtarget<GCNSubtarget>(); if (!STM->loadStoreOptEnabled()) return false; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 53f554eccb1f..1637c06936f9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -63,6 +63,7 @@ enum class SIAtomicScope { SINGLETHREAD, WAVEFRONT, WORKGROUP, + CLUSTER, // Promoted to AGENT on targets without workgroup clusters. 
AGENT, SYSTEM }; @@ -103,8 +104,10 @@ private: bool IsVolatile = false; bool IsNonTemporal = false; bool IsLastUse = false; + bool IsCooperative = false; SIMemOpInfo( + const GCNSubtarget &ST, AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, SIAtomicScope Scope = SIAtomicScope::SYSTEM, SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, @@ -112,14 +115,15 @@ private: bool IsCrossAddressSpaceOrdering = true, AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, bool IsVolatile = false, bool IsNonTemporal = false, - bool IsLastUse = false) + bool IsLastUse = false, bool IsCooperative = false) : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal), - IsLastUse(IsLastUse) { + IsLastUse(IsLastUse), IsCooperative(IsCooperative) { if (Ordering == AtomicOrdering::NotAtomic) { + assert(!IsCooperative && "Cannot be cooperative & non-atomic!"); assert(Scope == SIAtomicScope::NONE && OrderingAddrSpace == SIAtomicAddrSpace::NONE && !IsCrossAddressSpaceOrdering && @@ -154,6 +158,11 @@ private: SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { this->Scope = std::min(Scope, SIAtomicScope::AGENT); } + + // On targets that have no concept of a workgroup cluster, use + // AGENT scope as a conservatively correct alternative. + if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters()) + this->Scope = SIAtomicScope::AGENT; } public: @@ -209,6 +218,9 @@ public: /// create this SIMemOpInfo is last use, false otherwise. bool isLastUse() const { return IsLastUse; } + /// \returns True if this is a cooperative load or store atomic. + bool isCooperative() const { return IsCooperative; } + /// \returns True if ordering constraint of the machine instruction used to /// create this SIMemOpInfo is unordered or higher, false otherwise. 
bool isAtomic() const { @@ -220,6 +232,7 @@ public: class SIMemOpAccess final { private: const AMDGPUMachineModuleInfo *MMI = nullptr; + const GCNSubtarget &ST; /// Reports unsupported message \p Msg for \p MI to LLVM context. void reportUnsupported(const MachineBasicBlock::iterator &MI, @@ -243,7 +256,7 @@ private: public: /// Construct class to support accessing the machine memory operands /// of instructions in the machine function \p MF. - SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI); + SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST); /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. std::optional<SIMemOpInfo> @@ -325,6 +338,12 @@ public: return false; }; + /// Handle cooperative load/store atomics. + virtual bool handleCooperativeAtomic(MachineInstr &MI) const { + llvm_unreachable( + "cooperative atomics are not available on this architecture"); + } + /// Inserts any necessary instructions at position \p Pos relative /// to instruction \p MI to ensure memory instructions before \p Pos of kind /// \p Op associated with address spaces \p AddrSpace have completed. Used @@ -359,6 +378,12 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; + /// Inserts any necessary instructions before the barrier start instruction + /// \p MI in order to support pairing of barriers and fences. + virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { + return false; + }; + /// Virtual destructor to allow derivations to be deleted. 
virtual ~SICacheControl() = default; }; @@ -547,6 +572,8 @@ public: SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; }; class SIGfx11CacheControl : public SIGfx10CacheControl { @@ -587,7 +614,11 @@ protected: SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { + // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases + // the behavior is the same if assuming GFX12.0 in CU mode. + assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); + } bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, @@ -604,6 +635,8 @@ public: bool finalizeStore(MachineInstr &MI, bool Atomic) const override; + virtual bool handleCooperativeAtomic(MachineInstr &MI) const override; + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; @@ -748,6 +781,8 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); if (SSID == MMI->getAgentSSID()) return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); + if (SSID == MMI->getClusterSSID()) + return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true); if (SSID == MMI->getWorkgroupSSID()) return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, true); @@ -763,6 +798,9 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, if (SSID == MMI->getAgentOneAddressSpaceSSID()) return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); + if (SSID == MMI->getClusterOneAddressSpaceSSID()) + return std::tuple(SIAtomicScope::CLUSTER, + SIAtomicAddrSpace::ATOMIC & 
InstrAddrSpace, false); if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); @@ -790,8 +828,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { return SIAtomicAddrSpace::OTHER; } -SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_) - : MMI(&MMI_) {} +SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_, + const GCNSubtarget &ST) + : MMI(&MMI_), ST(ST) {} std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( const MachineBasicBlock::iterator &MI) const { @@ -804,6 +843,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( bool IsNonTemporal = true; bool IsVolatile = false; bool IsLastUse = false; + bool IsCooperative = false; // Validator should check whether or not MMOs cover the entire set of // locations accessed by the memory instruction. @@ -811,6 +851,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( IsNonTemporal &= MMO->isNonTemporal(); IsVolatile |= MMO->isVolatile(); IsLastUse |= MMO->getFlags() & MOLastUse; + IsCooperative |= MMO->getFlags() & MOCooperative; InstrAddrSpace |= toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); @@ -850,9 +891,9 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( return std::nullopt; } } - return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, + return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, - IsNonTemporal, IsLastUse); + IsNonTemporal, IsLastUse, IsCooperative); } std::optional<SIMemOpInfo> @@ -864,7 +905,7 @@ SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { // Be conservative if there are no memory operands. 
if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(); + return SIMemOpInfo(ST); return constructFromMIWithMMO(MI); } @@ -878,7 +919,7 @@ SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { // Be conservative if there are no memory operands. if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(); + return SIMemOpInfo(ST); return constructFromMIWithMMO(MI); } @@ -919,8 +960,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { if (SynchronizeAS) OrderingAddrSpace = *SynchronizeAS; - return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, - IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); + return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, + SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, + AtomicOrdering::NotAtomic); } std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( @@ -932,7 +974,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( // Be conservative if there are no memory operands. if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(); + return SIMemOpInfo(ST); return constructFromMIWithMMO(MI); } @@ -2169,6 +2211,22 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx10CacheControl::insertBarrierStart( + MachineBasicBlock::iterator &MI) const { + // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU + // mode. This is because a CU mode release fence does not emit any wait, which + // is fine when only dealing with vmem, but isn't sufficient in the presence + // of barriers which do not go through vmem. + // GFX12.5 does not require this additional wait. 
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + return true; +} + bool SIGfx11CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { @@ -2334,18 +2392,23 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + case SIAtomicScope::CLUSTER: if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) STORECnt |= true; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to wait for operations to complete to ensure - // they are visible to waves in the other CU as the L0 is per CU. - // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU + // of the WGP. Therefore need to wait for operations to complete to + // ensure they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. 
+ // + // GFX12.5: + // TODO DOCS + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2366,6 +2429,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + case SIAtomicScope::CLUSTER: case SIAtomicScope::WORKGROUP: // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is // not needed as LDS operations for all waves are executed in a total @@ -2397,7 +2461,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. - if (Order != AtomicOrdering::Acquire) { + if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2448,11 +2512,18 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, case SIAtomicScope::AGENT: ScopeImm = AMDGPU::CPol::SCOPE_DEV; break; + case SIAtomicScope::CLUSTER: + ScopeImm = AMDGPU::CPol::SCOPE_SE; + break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore we need to invalidate the L0 which is per CU. - // Otherwise in CU mode all waves of a work-group are on the same CU, and so - // the L0 does not need to be invalidated. + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore we need to invalidate the L0 which is per CU. + // Otherwise in CU mode all waves of a work-group are on the same CU, and + // so the L0 does not need to be invalidated. 
+ // + // GFX12.5 + // TODO DOCS if (ST.isCuModeEnabled()) return false; @@ -2497,7 +2568,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - // global_wb is only necessary at system scope for gfx120x targets. + // global_wb is only necessary at system scope for GFX12.0, + // they're also necessary at device scope for GFX12.5. // // Emitting it for lower scopes is a slow no-op, so we omit it // for performance. @@ -2507,6 +2579,13 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, .addImm(AMDGPU::CPol::SCOPE_SYS); break; case SIAtomicScope::AGENT: + // TODO DOCS + if (ST.hasGFX1250Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) + .addImm(AMDGPU::CPol::SCOPE_DEV); + } + break; + case SIAtomicScope::CLUSTER: case SIAtomicScope::WORKGROUP: // No WB necessary, but we still have to wait. break; @@ -2569,26 +2648,44 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( } bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); - if (!CPol) - return false; + assert(MI.mayStore() && "Not a Store inst"); + const bool IsRMW = (MI.mayLoad() && MI.mayStore()); + bool Changed = false; + // GFX12.5 only: xcnt wait is needed before flat and global atomics + // stores/rmw. + if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { + MachineBasicBlock &MBB = *MI.getParent(); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); + Changed = true; + } + + // Remaining fixes do not apply to RMWs. + if (IsRMW) + return Changed; + + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) // Some vmem operations do not have a scope and are not concerned. + return Changed; const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. 
- if (!ST.hasGFX1250Insts()) { - if (!Atomic && Scope == CPol::SCOPE_SYS) - return insertWaitsBeforeSystemScopeStore(MI); - return false; - } + if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS) + Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator()); - // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address - // space. - // We also require SCOPE_SE minimum if we not have the "cu-stores" feature. - if (Scope == CPol::SCOPE_CU && - (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) - return setScope(MI, CPol::SCOPE_SE); + return Changed; +} +bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const { + if (!ST.hasGFX1250Insts()) + return false; + + // Cooperative atomics need to be SCOPE_DEV or higher. + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + assert(CPol && "No CPol operand?"); + const unsigned Scope = CPol->getImm() & CPol::SCOPE; + if (Scope < CPol::SCOPE_DEV) + return setScope(MI, CPol::SCOPE_DEV); return false; } @@ -2605,6 +2702,9 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, case SIAtomicScope::AGENT: Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV); break; + case SIAtomicScope::CLUSTER: + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); + break; case SIAtomicScope::WORKGROUP: // In workgroup mode, SCOPE_SE is needed as waves can executes on // different CUs that access different L0s. @@ -2656,6 +2756,11 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, MOI.getOrderingAddrSpace()); } + // Handle cooperative atomics after cache bypass step, as it may override + // the scope of the instruction to a greater scope. 
+ if (MOI.isCooperative()) + Changed |= CC->handleCooperativeAtomic(*MI); + if (Order == AtomicOrdering::SequentiallyConsistent) Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), SIMemOp::LOAD | SIMemOp::STORE, @@ -2701,6 +2806,11 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getOrderingAddrSpace()); } + // Handle cooperative atomics after cache bypass step, as it may override + // the scope of the instruction to a greater scope. + if (MOI.isCooperative()) + Changed |= CC->handleCooperativeAtomic(*MI); + if (MOI.getOrdering() == AtomicOrdering::Release || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) Changed |= CC->insertRelease(MI, MOI.getScope(), @@ -2778,6 +2888,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; + MachineInstr &RMWMI = *MI; if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); @@ -2812,6 +2923,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Position::AFTER); } + Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true); return Changed; } @@ -2839,8 +2951,9 @@ SIMemoryLegalizerPass::run(MachineFunction &MF, bool SIMemoryLegalizer::run(MachineFunction &MF) { bool Changed = false; - SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>()); - CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST); + CC = SICacheControl::create(ST); for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { @@ -2860,6 +2973,11 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } + if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { + Changed |= CC->insertBarrierStart(MI); + continue; + } + if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; diff --git 
a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index d0cba30a442b..857cb91a977f 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -291,21 +291,7 @@ static MachineOperand *findSingleRegUse(const MachineOperand *Reg, if (!Reg->isReg() || !Reg->isDef()) return nullptr; - MachineOperand *ResMO = nullptr; - for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { - // If there exist use of subreg of Reg then return nullptr - if (!isSameReg(UseMO, *Reg)) - return nullptr; - - // Check that there is only one instruction that uses Reg - if (!ResMO) { - ResMO = &UseMO; - } else if (ResMO->getParent() != UseMO.getParent()) { - return nullptr; - } - } - - return ResMO; + return MRI->getOneNonDBGUse(Reg->getReg()); } static MachineOperand *findSingleRegDef(const MachineOperand *Reg, @@ -313,17 +299,7 @@ static MachineOperand *findSingleRegDef(const MachineOperand *Reg, if (!Reg->isReg()) return nullptr; - MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); - if (!DefInstr) - return nullptr; - - for (auto &DefMO : DefInstr->defs()) { - if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) - return &DefMO; - } - - // Ignore implicit defs. - return nullptr; + return MRI->getOneDef(Reg->getReg()); } /// Combine an SDWA instruction's existing SDWA selection \p Sel with diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp index efdc55b8e68b..5720b978aada 100644 --- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -184,9 +184,11 @@ bool SIPostRABundler::run(MachineFunction &MF) { if (I->getNumExplicitDefs() != 0) Defs.insert(I->defs().begin()->getReg()); ++ClauseLength; - } else if (!I->isMetaInstruction()) { - // Allow meta instructions in between bundle candidates, but do not - // start or end a bundle on one. 
+ } else if (!I->isMetaInstruction() || + I->getOpcode() == AMDGPU::SCHED_BARRIER) { + // SCHED_BARRIER is not bundled to be honored by scheduler later. + // Allow other meta instructions in between bundle candidates, but do + // not start or end a bundle on one. // // TODO: It may be better to move meta instructions like dbg_value // after the bundle. We're relying on the memory legalizer to unbundle diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ae0f304ea304..22488384759b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3273,6 +3273,10 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { return AMDGPUInstPrinter::getRegisterName(Reg); } +unsigned SIRegisterInfo::getHWRegIndex(MCRegister Reg) const { + return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK; +} + unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) { return getRegBitWidth(RC.getID()); } @@ -3353,6 +3357,40 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { : getAnyVGPRClassForBitWidth(BitWidth); } +const TargetRegisterClass * +SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const { + if (BitWidth <= 32) + return &AMDGPU::VGPR_32_Lo256RegClass; + if (BitWidth <= 64) + return &AMDGPU::VReg_64_Lo256_Align2RegClass; + if (BitWidth <= 96) + return &AMDGPU::VReg_96_Lo256_Align2RegClass; + if (BitWidth <= 128) + return &AMDGPU::VReg_128_Lo256_Align2RegClass; + if (BitWidth <= 160) + return &AMDGPU::VReg_160_Lo256_Align2RegClass; + if (BitWidth <= 192) + return &AMDGPU::VReg_192_Lo256_Align2RegClass; + if (BitWidth <= 224) + return &AMDGPU::VReg_224_Lo256_Align2RegClass; + if (BitWidth <= 256) + return &AMDGPU::VReg_256_Lo256_Align2RegClass; + if (BitWidth <= 288) + return &AMDGPU::VReg_288_Lo256_Align2RegClass; + if (BitWidth <= 320) + return &AMDGPU::VReg_320_Lo256_Align2RegClass; + if (BitWidth <= 352) + return 
&AMDGPU::VReg_352_Lo256_Align2RegClass; + if (BitWidth <= 384) + return &AMDGPU::VReg_384_Lo256_Align2RegClass; + if (BitWidth <= 512) + return &AMDGPU::VReg_512_Lo256_Align2RegClass; + if (BitWidth <= 1024) + return &AMDGPU::VReg_1024_Lo256_Align2RegClass; + + return nullptr; +} + static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth == 64) @@ -3547,7 +3585,17 @@ bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, const TargetRegisterClass * SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { unsigned Size = getRegSizeInBits(*SRC); - const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); + + switch (SRC->getID()) { + default: + break; + case AMDGPU::VS_32_Lo256RegClassID: + case AMDGPU::VS_64_Lo256RegClassID: + return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size)); + } + + const TargetRegisterClass *VRC = + getAllocatableClass(getVGPRClassForBitWidth(Size)); assert(VRC && "Invalid register class size"); return VRC; } @@ -3708,14 +3756,15 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || - Idx == AMDGPU::RegisterPressureSets::AGPR_32) + switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) { + case AMDGPU::RegisterPressureSets::VGPR_32: + case AMDGPU::RegisterPressureSets::AGPR_32: return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, const_cast<MachineFunction &>(MF)); - - if (Idx == AMDGPU::RegisterPressureSets::SReg_32) + case AMDGPU::RegisterPressureSets::SReg_32: return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, const_cast<MachineFunction &>(MF)); + } llvm_unreachable("Unexpected register pressure set!"); } @@ -3944,6 +3993,8 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { return RC.hasSuperClassEq( 
getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); + assert(&RC != &AMDGPU::VS_64RegClass); + return true; } @@ -3956,6 +4007,9 @@ SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { if (Size <= 32) return RC; + if (RC == &AMDGPU::VS_64RegClass) + return &AMDGPU::VS_64_Align2RegClass; + if (isVGPRClass(RC)) return getAlignedVGPRClassForBitWidth(Size); if (isAGPRClass(RC)) @@ -4000,7 +4054,12 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC, bool IncludeCalls) const { - for (MCPhysReg Reg : reverse(RC.getRegisters())) + unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256; + ArrayRef<MCPhysReg> Registers = + (RC.getID() == AMDGPU::VGPR_32RegClassID) + ? RC.getRegisters().take_front(NumArchVGPRs) + : RC.getRegisters(); + for (MCPhysReg Reg : reverse(Registers)) if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls)) return getHWRegIndex(Reg) + 1; return 0; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 5508f07b1b5f..eeefef1116aa 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -200,13 +200,14 @@ public: StringRef getRegAsmName(MCRegister Reg) const override; // Pseudo regs are not allowed - unsigned getHWRegIndex(MCRegister Reg) const { - return getEncodingValue(Reg) & 0xff; - } + unsigned getHWRegIndex(MCRegister Reg) const; LLVM_READONLY const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const; + LLVM_READONLY const TargetRegisterClass * + getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const; + LLVM_READONLY const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 0293d4018770..5f5eec49bab0 100644 --- 
a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -76,17 +76,17 @@ class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC, //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// -class SIReg <string n, bits<8> regIdx = 0, bit isVGPR = 0, +class SIReg <string n, bits<10> regIdx = 0, bit isVGPR = 0, bit isAGPR = 0, bit isHi16 = 0> : Register<n> { let Namespace = "AMDGPU"; // These are generic helper values we use to form actual register // codes. They should not be assumed to match any particular register // encodings on any particular subtargets. - let HWEncoding{7-0} = regIdx; - let HWEncoding{8} = isVGPR; - let HWEncoding{9} = isAGPR; - let HWEncoding{10} = isHi16; + let HWEncoding{9-0} = regIdx; + let HWEncoding{10} = isVGPR; + let HWEncoding{11} = isAGPR; + let HWEncoding{12} = isHi16; int Index = !cast<int>(regIdx); } @@ -110,17 +110,17 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> let TSFlags{3} = HasAGPR; let TSFlags{4} = HasSGPR; - // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block) + // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block) // to decide which registers to try to assign first. Usually, this RegisterClass priority is given // very high priority, if not the highest priority, when considering which VirtReg to allocate next. // - // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to - // assign more constrained RegisterClasses first. As a result, we prioritize register classes with - // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32). - // + // We have 5 bits to assign AllocationPriorities to RegisterClasses. 
Generally, it is beneficial to + // assign more constrained RegisterClasses first. As a result, we prioritize register classes with + // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32). + // // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs. // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained - // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the + // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor // is used for scaling of the bit (i.e. 1 << 4). field int BaseClassPriority = 1; @@ -128,7 +128,7 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> } -multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1, +multiclass SIRegLoHi16 <string n, bits<10> regIdx, bit ArtificialHigh = 1, bit isVGPR = 0, bit isAGPR = 0, list<int> DwarfEncodings = [-1, -1]> { def _LO16 : SIReg<n#".l", regIdx, isVGPR, isAGPR>; @@ -142,9 +142,10 @@ multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1, let Namespace = "AMDGPU"; let SubRegIndices = [lo16, hi16]; let CoveredBySubRegs = !not(ArtificialHigh); - let HWEncoding{7-0} = regIdx; - let HWEncoding{8} = isVGPR; - let HWEncoding{9} = isAGPR; + + let HWEncoding{9-0} = regIdx; + let HWEncoding{10} = isVGPR; + let HWEncoding{11} = isAGPR; int Index = !cast<int>(regIdx); } @@ -225,7 +226,7 @@ def SGPR_NULL64 : // the high 32 bits. The lower 32 bits are always zero (for base) or // -1 (for limit). Since we cannot access the high 32 bits, when we // need them, we need to do a 64 bit load and extract the bits manually. 
-multiclass ApertureRegister<string name, bits<8> regIdx> { +multiclass ApertureRegister<string name, bits<10> regIdx> { let isConstant = true in { // FIXME: We shouldn't need to define subregisters for these (nor add them to any 16 bit // register classes), but if we don't it seems to confuse the TableGen @@ -313,7 +314,7 @@ foreach Index = 0...15 in { defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>; } -multiclass FLAT_SCR_LOHI_m <string n, bits<8> ci_e, bits<8> vi_e> { +multiclass FLAT_SCR_LOHI_m <string n, bits<10> ci_e, bits<10> vi_e> { defm _ci : SIRegLoHi16<n, ci_e>; defm _vi : SIRegLoHi16<n, vi_e>; defm "" : SIRegLoHi16<n, 0>; @@ -343,11 +344,12 @@ foreach Index = 0...105 in { } // VGPR registers -foreach Index = 0...255 in { +foreach Index = 0...1023 in { defm VGPR#Index : SIRegLoHi16 <"v"#Index, Index, /*ArtificialHigh=*/ 0, /*isVGPR=*/ 1, /*isAGPR=*/ 0, /*DwarfEncodings=*/ - [!add(Index, 2560), !add(Index, 1536)]>; + [!if(!le(Index, 511), !add(Index, 2560), -1), + !if(!le(Index, 511), !add(Index, 1536), !add(Index, !sub(3584, 512)))]>; } // AccVGPR registers @@ -604,15 +606,15 @@ def Reg512Types : RegisterTypes<[v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v def Reg1024Types : RegisterTypes<[v32i32, v32f32, v16i64, v16f64]>; let HasVGPR = 1 in { -// VOP3 and VINTERP can access 256 lo and 256 hi registers. +// VOP3 and VINTERP can access 1024 lo and 1024 hi registers. def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, - (add (interleave (sequence "VGPR%u_LO16", 0, 255), - (sequence "VGPR%u_HI16", 0, 255)))> { + (add (interleave (sequence "VGPR%u_LO16", 0, 1023), + (sequence "VGPR%u_HI16", 0, 1023)))> { let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; - // This is the base class for VGPR{128..255}_{LO16,HI16}. + // This is the base class for VGPR{128..1023}_{LO16,HI16}. 
let BaseClassOrder = 17; } @@ -633,7 +635,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // VGPR 32-bit registers // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, - (add (sequence "VGPR%u", 0, 255))> { + (add (sequence "VGPR%u", 0, 1023))> { let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; @@ -648,46 +650,55 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1 let Size = 32; let Weight = 1; } + +// Identical to VGPR_32 except it only contains the low 256 (Lo256) registers. +def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, + (add (sequence "VGPR%u", 0, 255))> { + let AllocationPriority = 0; + let GeneratePressureSet = 0; + let Size = 32; + let Weight = 1; +} } // End HasVGPR = 1 // VGPR 64-bit registers -def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 255, 1, 2, "v">; +def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 1023, 1, 2, "v">; // VGPR 96-bit registers -def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 255, 1, 3, "v">; +def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 1023, 1, 3, "v">; // VGPR 128-bit registers -def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">; +def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 1023, 1, 4, "v">; // VGPR 160-bit registers -def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">; +def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 1023, 1, 5, "v">; // VGPR 192-bit registers -def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">; +def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 1023, 1, 6, "v">; // VGPR 224-bit registers -def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">; +def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 
1023, 1, 7, "v">; // VGPR 256-bit registers -def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">; +def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 1023, 1, 8, "v">; // VGPR 288-bit registers -def VGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, VGPR_32, 255, 1, 9, "v">; +def VGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, VGPR_32, 1023, 1, 9, "v">; // VGPR 320-bit registers -def VGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, VGPR_32, 255, 1, 10, "v">; +def VGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, VGPR_32, 1023, 1, 10, "v">; // VGPR 352-bit registers -def VGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, VGPR_32, 255, 1, 11, "v">; +def VGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, VGPR_32, 1023, 1, 11, "v">; // VGPR 384-bit registers -def VGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, VGPR_32, 255, 1, 12, "v">; +def VGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, VGPR_32, 1023, 1, 12, "v">; // VGPR 512-bit registers -def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">; +def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 1023, 1, 16, "v">; // VGPR 1024-bit registers -def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">; +def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 1023, 1, 32, "v">; let HasAGPR = 1 in { def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, @@ -976,14 +987,14 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; - // Since we only have 5 bits for the RegisterClass Allocation Priorty, and since we use the - // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result - // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for - // regsters with numRegs 17+ we give SizePriority of 15. 
In practice, there is only one - // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512}, - // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing. + // Since we only have 5 bits for the RegisterClass Allocation Priorty, and since we use the + // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result + // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for + // regsters with numRegs 17+ we give SizePriority of 15. In practice, there is only one + // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512}, + // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing. defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15)); - + let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor)); let Weight = numRegs; } @@ -1003,6 +1014,10 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { let BaseClassOrder = !sub(!mul(numRegs, 32), 1); let RegTupleAlignUnits = 2; } + + // Aligned register tuples starting with low 256 vgprs + def _Lo256_Align2 : VRegClassBase<numRegs, regTypes, + (trunc (decimate regList, 2), !div(!sub(258, numRegs), 2))>; } } @@ -1100,6 +1115,14 @@ def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2 let Size = 32; } +def VS_32_Lo256 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, + (add VGPR_32_Lo256, SReg_32, LDS_DIRECT_CLASS)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 32; +} + def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_64)> { let isAllocatable = 0; let HasVGPR = 1; @@ -1107,12 +1130,27 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6 let Size = 64; } +def VS_64_Align2 : 
SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, + (add VReg_64_Align2, SReg_64)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 64; +} + def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; let BaseClassPriority = 0; let Size = 32; } + +def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64_Lo256_Align2, SReg_64)> { + let isAllocatable = 0; + let HasVGPR = 1; + let HasSGPR = 1; + let Size = 64; +} } // End GeneratePressureSet = 0 // Define a register tuple class, along with one requiring an even @@ -1249,15 +1287,15 @@ class SrcReg9<RegisterClass regClass> : RegisterOperand<regClass> { let DecoderMethod = "decodeSrcReg9<" # regClass.Size # ">"; } -def VRegSrc_32 : SrcReg9<VGPR_32>; -def VRegSrc_64 : SrcReg9<VReg_64>; -def VRegSrc_96 : SrcReg9<VReg_96>; -def VRegSrc_128: SrcReg9<VReg_128>; -def VRegSrc_192: SrcReg9<VReg_192>; -def VRegSrc_256: SrcReg9<VReg_256>; -def VRegSrc_384: SrcReg9<VReg_384>; -def VRegSrc_512: SrcReg9<VReg_512>; -def VRegSrc_1024: SrcReg9<VReg_1024>; +def VRegSrc_32 : SrcReg9<VGPR_32>; +def VRegSrc_64 : SrcReg9<VReg_64>; +def VRegSrc_96 : SrcReg9<VReg_96>; +def VRegSrc_128 : SrcReg9<VReg_128>; +def VRegSrc_192 : SrcReg9<VReg_192>; +def VRegSrc_256 : SrcReg9<VReg_256>; +def VRegSrc_384 : SrcReg9<VReg_384>; +def VRegSrc_512 : SrcReg9<VReg_512>; +def VRegSrc_1024 : SrcReg9<VReg_1024>; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>; // True 16 Operands @@ -1269,30 +1307,41 @@ def VRegSrc_fake16: SrcReg9<VGPR_32> { let EncoderMethod = "getMachineOpValueT16"; } //===----------------------------------------------------------------------===// -// VGPRSrc_* +// VGPROp_* An 8-bit RegisterOperand wrapper for a VGPR //===----------------------------------------------------------------------===// -// An 8-bit RegisterOperand wrapper for a VGPR -def VGPRSrc_32 : RegisterOperand<VGPR_32> { - let DecoderMethod = "DecodeVGPR_32RegisterClass"; 
+class VGPROp<RegisterClass regClass> : RegisterOperand<regClass> { + let DecoderMethod = "Decode" # regClass # "RegisterClass"; } -def VGPRSrc_32_Lo128 : RegisterOperand<VGPR_32_Lo128> { - let DecoderMethod = "DecodeVGPR_32RegisterClass"; +class VGPROp_Align2<RegisterClass regClass> : RegisterOperand<!cast<RegisterClass>(regClass#_Align2)> { + let DecoderMethod = "Decode" # regClass # "RegisterClass"; +} +multiclass VGPROp_Aligned<RegisterClass regClass> { + def _Align1 : VGPROp<regClass>; + def _Align2 : VGPROp_Align2<regClass>; } -def VGPRSrc_96 : RegisterOperand<VReg_96> { - let DecoderMethod = "DecodeVReg_96RegisterClass"; +// TODO: These cases should use default target alignment +def VGPROp_16 : VGPROp<VGPR_16> { + let EncoderMethod = "getMachineOpValueT16"; } +def VGPROp_32 : VGPROp<VGPR_32>; -def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> { +foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "512", "1024"] in { + def VGPROp_#size : VGPROp<!cast<RegisterClass>("VReg_"#size)>; +} + +foreach size = ["64", "96", "128", "160", "256", "1024"] in { + defm VGPROp_#size : VGPROp_Aligned<!cast<RegisterClass>("VReg_"#size)>; +} + +def VGPROp_16_Lo128 : RegisterOperand<VGPR_16_Lo128> { let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass"; let EncoderMethod = "getMachineOpValueT16Lo128"; } -// True 16 operands. 
-def VGPRSrc_16 : RegisterOperand<VGPR_16> { - let DecoderMethod = "DecodeVGPR_16RegisterClass"; - let EncoderMethod = "getMachineOpValueT16"; +def VGPROp_32_Lo128 : RegisterOperand<VGPR_32_Lo128> { + let DecoderMethod = "DecodeVGPR_32RegisterClass"; } //===----------------------------------------------------------------------===// @@ -1321,7 +1370,9 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">; def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; +def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">; def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">; +def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">; // True 16 Operands def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; @@ -1372,11 +1423,14 @@ class AVLdStOperand<RegisterClass regClass> : AVOperand<regClass, "decodeAVLdSt">; def AVLdSt_32 : AVLdStOperand<AV_32>; -def AVLdSt_64 : AVLdStOperand<AV_64>; -def AVLdSt_96 : AVLdStOperand<AV_96>; -def AVLdSt_128 : AVLdStOperand<AV_128>; -def AVLdSt_160 : AVLdStOperand<AV_160>; -def AVLdSt_1024 : AVLdStOperand<AV_1024>; + +foreach size = ["64", "96", "128", "160", "256", "1024" ] in { + // TODO: These cases should use target align variant + def AVLdSt_#size : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>; + + def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>; + def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClass>("AV_"#size#_Align2)>; +} //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant @@ -1395,3 +1449,59 @@ def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_FP32">; def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_INT32">; def AISrc_1024_f32 : 
SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_FP32">; def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">; + +//===----------------------------------------------------------------------===// +// Tablegen programming utilities +//===----------------------------------------------------------------------===// + +/// Helper function to extract the register class from an +/// instruction's operand list, which may be a RegisterOperand or a +/// direct RegisterClass reference. +class getRegClassFromOp<DAGOperand Op> { + SIRegisterClass ret = !if( + !isa<RegisterOperand>(Op), + !cast<SIRegisterClass>(!cast<RegisterOperand>(Op).RegClass), + !cast<SIRegisterClass>(Op)); +} + +/// Check if the operand will use an AV_* class. +class OperandIsAV<DAGOperand Op> { + defvar reg_class = getRegClassFromOp<Op>.ret; + bit ret = !and(reg_class.HasAGPR, reg_class.HasVGPR); +} + +/// Check if the operand will use an AGPR class. +class OperandIsAGPR<DAGOperand Op> { + defvar reg_class = getRegClassFromOp<Op>.ret; + bit ret = !and(reg_class.HasAGPR, !not(reg_class.HasVGPR)); +} + +/// Check if the operand will use a VGPR class. 
+class OperandIsVGPR<DAGOperand Op> { + defvar reg_class = getRegClassFromOp<Op>.ret; + bit ret = !and(reg_class.HasVGPR, !not(reg_class.HasAGPR)); +} + +class VDstOperandIsAV<dag OperandList> { + bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "vdst")>.ret; +} + +class VDstOperandIsAGPR<dag OperandList> { + bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "vdst")>.ret; +} + +class Data0OperandIsAV<dag OperandList> { + bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "data0")>.ret; +} + +class Data0OperandIsAGPR<dag OperandList> { + bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "data0")>.ret; +} + +class VDataOperandIsAV<dag OperandList> { + bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "vdata")>.ret; +} + +class VDataOperandIsAGPR<dag OperandList> { + bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "vdata")>.ret; +} diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 4bda51d1e959..781c61b073db 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -295,7 +295,6 @@ class SM_Pseudo_Atomic<string opName, let has_soffset = offsets.HasSOffset; let Constraints = !if(isRet, "$sdst = $sdata", ""); - let DisableEncoding = !if(isRet, "$sdata", ""); } multiclass SM_Pseudo_Atomics<RegisterClass baseClass, @@ -678,7 +677,6 @@ class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps> bits<7> sdata; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let cpol{CPolBit.GLC} = ps.glc; let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0}); @@ -1295,7 +1293,6 @@ class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps> bits<7> sdata; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let cpol{CPolBit.GLC} = ps.glc; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index a003a46191a8..12a27db241c4 100644 --- 
a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -583,7 +583,6 @@ class SOP2_Real<SOP_Pseudo ps, string name = ps.Mnemonic> : let mayLoad = ps.mayLoad; let mayStore = ps.mayStore; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let Uses = ps.Uses; let Defs = ps.Defs; let isConvergent = ps.isConvergent; @@ -934,7 +933,7 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1, >; } // End isReMaterializable = 1 - let Constraints = "$sdst = $src2", DisableEncoding="$src2", + let Constraints = "$sdst = $src2", isCommutable = 1, AddedComplexity = 20 in { def S_FMAC_F32 : SOP2_Pseudo< "s_fmac_f32", (outs SReg_32:$sdst), @@ -949,7 +948,7 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1, "$sdst, $src0, $src1", [(set f16:$sdst, (UniformTernaryFrag<any_fma> SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2))] >; - } // End Constraints = "$sdst = $src2", DisableEncoding="$src2", + } // End Constraints = "$sdst = $src2", // isCommutable = 1, AddedComplexity = 20 } // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1, // Uses = [MODE], SchedRW = [WriteSFPU] @@ -994,7 +993,6 @@ class SOPK_Real<SOPK_Pseudo ps, string name = ps.Mnemonic> : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; - let DisableEncoding = ps.DisableEncoding; let Constraints = ps.Constraints; let SchedRW = ps.SchedRW; let mayLoad = ps.mayLoad; @@ -1116,8 +1114,7 @@ def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>; def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>; } // End isCompare = 1 -let isCommutable = 1, DisableEncoding = "$src0", - Constraints = "$sdst = $src0" in { +let isCommutable = 1, Constraints = "$sdst = $src0" in { let Defs = [SCC] in def S_ADDK_I32 : SOPK_32TIE <"s_addk_i32">; def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">; @@ -1656,6 +1653,11 @@ let 
OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } + +let SubtargetPredicate = HasWaitXcnt in { + def S_WAIT_XCNT_soft : SOPP_Pseudo<"", (ins s16imm:$simm16), "$simm16">; +} + // Represents the point at which a wave must wait for all outstanding direct loads to LDS. // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. @@ -1847,6 +1849,13 @@ let SubtargetPredicate = HasWaitXcnt, hasSideEffects = 1 in { SOPP_Pseudo<"s_wait_xcnt", (ins s16imm:$simm16), "$simm16">; } // End SubtargetPredicate = hasWaitXcnt, hasSideEffects = 1 +let SubtargetPredicate = Has1024AddressableVGPRs in { + def S_SET_VGPR_MSB : SOPP_Pseudo<"s_set_vgpr_msb" , (ins i16imm:$simm16), "$simm16"> { + let hasSideEffects = 1; + let Defs = [MODE]; + } +} + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -2694,6 +2703,7 @@ defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>; //===----------------------------------------------------------------------===// // SOPP - GFX1250 only. 
//===----------------------------------------------------------------------===// +defm S_SET_VGPR_MSB : SOPP_Real_32_gfx12<0x006>; defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>; defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>; defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12<0x04a>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index c740b5e0f09d..14ebbf8e9c92 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -116,6 +116,8 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_RTN_GET_TBA"}, ID_RTN_GET_TBA, isGFX11Plus}, {{"MSG_RTN_GET_TBA_TO_PC"}, ID_RTN_GET_TBA_TO_PC, isGFX11Plus}, {{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus}, + {{"MSG_RTN_GET_CLUSTER_BARRIER_STATE"}, ID_RTN_GET_CLUSTER_BARRIER_STATE, + isGFX1250}, }; static constexpr CustomOperand SysMsgOperands[] = { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 18ee9c16b3ff..9f4f42185d9a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -934,6 +934,10 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( if (!OpXRegs[CompOprIdx] || !OpYRegs[CompOprIdx]) continue; + if (getVGPREncodingMSBs(OpXRegs[CompOprIdx], MRI) != + getVGPREncodingMSBs(OpYRegs[CompOprIdx], MRI)) + return CompOprIdx; + if (SkipSrc && CompOprIdx >= Component::DST_NUM) continue; @@ -1376,6 +1380,9 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, ? *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); + if (STI->getFeatureBits().test(Feature1024AddressableVGPRs)) + return IsWave32 ? 16 : 8; + return IsWave32 ? 
8 : 4; } @@ -1396,7 +1403,10 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize) { - if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + const auto &Features = STI->getFeatureBits(); + if (Features.test(FeatureGFX1250Insts)) + return Features.test(FeatureWavefrontSize32) ? 1024 : 512; + if (Features.test(FeatureGFX90AInsts)) return 512; // Temporarily check the subtarget feature, until we fully switch to using @@ -2720,13 +2730,6 @@ bool isInlineValue(unsigned Reg) { #undef CASE_GFXPRE11_GFX11PLUS_TO #undef MAP_REG2REG -bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { - assert(OpNo < Desc.NumOperands); - unsigned OpType = Desc.operands()[OpNo].OperandType; - return OpType >= AMDGPU::OPERAND_SRC_FIRST && - OpType <= AMDGPU::OPERAND_SRC_LAST; -} - bool isKImmOperand(const MCInstrDesc &Desc, unsigned OpNo) { assert(OpNo < Desc.NumOperands); unsigned OpType = Desc.operands()[OpNo].OperandType; @@ -2776,6 +2779,7 @@ unsigned getRegBitWidth(unsigned RCID) { return 16; case AMDGPU::SGPR_32RegClassID: case AMDGPU::VGPR_32RegClassID: + case AMDGPU::VGPR_32_Lo256RegClassID: case AMDGPU::VRegOrLds_32RegClassID: case AMDGPU::AGPR_32RegClassID: case AMDGPU::VS_32RegClassID: @@ -2794,6 +2798,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_64_Align2RegClassID: case AMDGPU::AV_64RegClassID: case AMDGPU::AV_64_Align2RegClassID: + case AMDGPU::VReg_64_Lo256_Align2RegClassID: + case AMDGPU::VS_64_Lo256RegClassID: return 64; case AMDGPU::SGPR_96RegClassID: case AMDGPU::SReg_96RegClassID: @@ -2803,6 +2809,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_96_Align2RegClassID: case AMDGPU::AV_96RegClassID: case AMDGPU::AV_96_Align2RegClassID: + case AMDGPU::VReg_96_Lo256_Align2RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: @@ -2813,6 +2820,7 @@ unsigned getRegBitWidth(unsigned RCID) 
{ case AMDGPU::AV_128RegClassID: case AMDGPU::AV_128_Align2RegClassID: case AMDGPU::SReg_128_XNULLRegClassID: + case AMDGPU::VReg_128_Lo256_Align2RegClassID: return 128; case AMDGPU::SGPR_160RegClassID: case AMDGPU::SReg_160RegClassID: @@ -2822,6 +2830,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_160_Align2RegClassID: case AMDGPU::AV_160RegClassID: case AMDGPU::AV_160_Align2RegClassID: + case AMDGPU::VReg_160_Lo256_Align2RegClassID: return 160; case AMDGPU::SGPR_192RegClassID: case AMDGPU::SReg_192RegClassID: @@ -2831,6 +2840,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_192_Align2RegClassID: case AMDGPU::AV_192RegClassID: case AMDGPU::AV_192_Align2RegClassID: + case AMDGPU::VReg_192_Lo256_Align2RegClassID: return 192; case AMDGPU::SGPR_224RegClassID: case AMDGPU::SReg_224RegClassID: @@ -2840,6 +2850,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_224_Align2RegClassID: case AMDGPU::AV_224RegClassID: case AMDGPU::AV_224_Align2RegClassID: + case AMDGPU::VReg_224_Lo256_Align2RegClassID: return 224; case AMDGPU::SGPR_256RegClassID: case AMDGPU::SReg_256RegClassID: @@ -2850,6 +2861,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AV_256RegClassID: case AMDGPU::AV_256_Align2RegClassID: case AMDGPU::SReg_256_XNULLRegClassID: + case AMDGPU::VReg_256_Lo256_Align2RegClassID: return 256; case AMDGPU::SGPR_288RegClassID: case AMDGPU::SReg_288RegClassID: @@ -2859,6 +2871,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_288_Align2RegClassID: case AMDGPU::AV_288RegClassID: case AMDGPU::AV_288_Align2RegClassID: + case AMDGPU::VReg_288_Lo256_Align2RegClassID: return 288; case AMDGPU::SGPR_320RegClassID: case AMDGPU::SReg_320RegClassID: @@ -2868,6 +2881,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_320_Align2RegClassID: case AMDGPU::AV_320RegClassID: case AMDGPU::AV_320_Align2RegClassID: + case AMDGPU::VReg_320_Lo256_Align2RegClassID: return 320; case AMDGPU::SGPR_352RegClassID: case 
AMDGPU::SReg_352RegClassID: @@ -2877,6 +2891,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_352_Align2RegClassID: case AMDGPU::AV_352RegClassID: case AMDGPU::AV_352_Align2RegClassID: + case AMDGPU::VReg_352_Lo256_Align2RegClassID: return 352; case AMDGPU::SGPR_384RegClassID: case AMDGPU::SReg_384RegClassID: @@ -2886,6 +2901,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_384_Align2RegClassID: case AMDGPU::AV_384RegClassID: case AMDGPU::AV_384_Align2RegClassID: + case AMDGPU::VReg_384_Lo256_Align2RegClassID: return 384; case AMDGPU::SGPR_512RegClassID: case AMDGPU::SReg_512RegClassID: @@ -2895,6 +2911,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_512_Align2RegClassID: case AMDGPU::AV_512RegClassID: case AMDGPU::AV_512_Align2RegClassID: + case AMDGPU::VReg_512_Lo256_Align2RegClassID: return 512; case AMDGPU::SGPR_1024RegClassID: case AMDGPU::SReg_1024RegClassID: @@ -2904,6 +2921,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_1024_Align2RegClassID: case AMDGPU::AV_1024RegClassID: case AMDGPU::AV_1024_Align2RegClassID: + case AMDGPU::VReg_1024_Lo256_Align2RegClassID: return 1024; default: llvm_unreachable("Unexpected register class"); @@ -3206,8 +3224,11 @@ bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset, bool IsBuffer) { - if (isGFX12Plus(ST)) + if (isGFX12Plus(ST)) { + if (IsBuffer && EncodedOffset < 0) + return false; return isInt<24>(EncodedOffset); + } return !IsBuffer && hasSMRDSignedImmOffset(ST) && isInt<21>(EncodedOffset); } @@ -3321,6 +3342,112 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } +const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, + const MCRegisterInfo &MRI) { + const unsigned VGPRClasses[] = { + AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID, + AMDGPU::VReg_64RegClassID, AMDGPU::VReg_96RegClassID, + 
AMDGPU::VReg_128RegClassID, AMDGPU::VReg_160RegClassID, + AMDGPU::VReg_192RegClassID, AMDGPU::VReg_224RegClassID, + AMDGPU::VReg_256RegClassID, AMDGPU::VReg_288RegClassID, + AMDGPU::VReg_320RegClassID, AMDGPU::VReg_352RegClassID, + AMDGPU::VReg_384RegClassID, AMDGPU::VReg_512RegClassID, + AMDGPU::VReg_1024RegClassID}; + + for (unsigned RCID : VGPRClasses) { + const MCRegisterClass &RC = MRI.getRegClass(RCID); + if (RC.contains(Reg)) + return &RC; + } + + return nullptr; +} + +unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) { + unsigned Enc = MRI.getEncodingValue(Reg); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + return Idx >> 8; +} + +MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, + const MCRegisterInfo &MRI) { + unsigned Enc = MRI.getEncodingValue(Reg); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + if (Idx >= 0x100) + return AMDGPU::NoRegister; + + const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); + if (!RC) + return AMDGPU::NoRegister; + return RC->getRegister(Idx | (MSBs << 8)); +} + +std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *> +getVGPRLoweringOperandTables(const MCInstrDesc &Desc) { + static const AMDGPU::OpName VOPOps[4] = { + AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2, + AMDGPU::OpName::vdst}; + static const AMDGPU::OpName VDSOps[4] = { + AMDGPU::OpName::addr, AMDGPU::OpName::data0, AMDGPU::OpName::data1, + AMDGPU::OpName::vdst}; + static const AMDGPU::OpName FLATOps[4] = { + AMDGPU::OpName::vaddr, AMDGPU::OpName::vdata, + AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdst}; + static const AMDGPU::OpName BUFOps[4] = { + AMDGPU::OpName::vaddr, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdata}; + static const AMDGPU::OpName VIMGOps[4] = { + AMDGPU::OpName::vaddr0, AMDGPU::OpName::vaddr1, AMDGPU::OpName::vaddr2, + AMDGPU::OpName::vdata}; + + // For VOPD instructions MSB of a corresponding Y component operand 
VGPR + // address is supposed to match X operand, otherwise VOPD shall not be + // combined. + static const AMDGPU::OpName VOPDOpsX[4] = { + AMDGPU::OpName::src0X, AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vsrc2X, + AMDGPU::OpName::vdstX}; + static const AMDGPU::OpName VOPDOpsY[4] = { + AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y, + AMDGPU::OpName::vdstY}; + + unsigned TSFlags = Desc.TSFlags; + + if (TSFlags & + (SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 | + SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) { + // LD_SCALE operands ignore MSB. + if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 || + Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 || + Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 || + Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250) + return {}; + return {VOPOps, nullptr}; + } + + if (TSFlags & SIInstrFlags::DS) + return {VDSOps, nullptr}; + + if (TSFlags & SIInstrFlags::FLAT) + return {FLATOps, nullptr}; + + if (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF)) + return {BUFOps, nullptr}; + + if (TSFlags & SIInstrFlags::VIMAGE) + return {VIMGOps, nullptr}; + + if (AMDGPU::isVOPD(Desc.getOpcode())) + return {VOPDOpsX, VOPDOpsY}; + + assert(!(TSFlags & SIInstrFlags::MIMG)); + + if (TSFlags & (SIInstrFlags::VSAMPLE | SIInstrFlags::EXP)) + llvm_unreachable("Sample and export VGPR lowering is not implemented and" + " these instructions are not expected on gfx1250"); + + return {}; +} + bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) { uint64_t TSFlags = MII.get(Opcode).TSFlags; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 70dfb63cbe04..3fcd16f9290b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1517,6 +1517,7 @@ constexpr bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { case 
CallingConv::C: case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_Gfx_WholeWave: return true; default: return canGuaranteeTCO(CC); @@ -1590,7 +1591,14 @@ bool isInlineValue(unsigned Reg); /// Is this an AMDGPU specific source operand? These include registers, /// inline constants, literals and mandatory literals (KImm). -bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo); +constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo) { + return OpInfo.OperandType >= AMDGPU::OPERAND_SRC_FIRST && + OpInfo.OperandType <= AMDGPU::OPERAND_SRC_LAST; +} + +inline bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { + return isSISrcOperand(Desc.operands()[OpNo]); +} /// Is this a KImm operand? bool isKImmOperand(const MCInstrDesc &Desc, unsigned OpNo); @@ -1778,6 +1786,25 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID); /// \returns true if the intrinsic is uniform bool isIntrinsicAlwaysUniform(unsigned IntrID); +/// \returns a register class for the physical register \p Reg if it is a VGPR +/// or nullptr otherwise. +const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, + const MCRegisterInfo &MRI); + +/// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the +/// physical register \p Reg. +unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI); + +/// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set. +MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, + const MCRegisterInfo &MRI); + +// Returns a table for the opcode with a given \p Desc to map the VGPR MSB +// set by the S_SET_VGPR_MSB to one of 4 sources. In case of VOPD returns 2 +// maps, one for X and one for Y component. +std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *> +getVGPRLoweringOperandTables(const MCInstrDesc &Desc); + /// \returns true if a memory instruction supports scale_offset modifier. 
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index fd6253daa327..a7a0e33da5e4 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -1061,6 +1061,17 @@ VersionTuple AMDGPUPALMetadata::getPALVersion() { return VersionTuple(getPALVersion(0), getPALVersion(1)); } +// Set the field in a given .hardware_stages entry to a maximum value +void AMDGPUPALMetadata::updateHwStageMaximum(unsigned CC, StringRef field, + unsigned Val) { + msgpack::MapDocNode HwStageFieldMapNode = getHwStage(CC); + auto &Node = HwStageFieldMapNode[field]; + if (Node.isEmpty()) + Node = Val; + else + Node = std::max<unsigned>(Node.getUInt(), Val); +} + // Set the field in a given .hardware_stages entry void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, unsigned Val) { getHwStage(CC)[field] = Val; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index 4830db5fda50..e50150cc8de9 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -156,6 +156,7 @@ public: unsigned getPALMinorVersion(); VersionTuple getPALVersion(); + void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val); void setHwStage(unsigned CC, StringRef field, unsigned Val); void setHwStage(unsigned CC, StringRef field, bool Val); void setHwStage(unsigned CC, StringRef field, msgpack::Type Type, diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 11c72751dde5..f816d7de27ee 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -71,7 +71,6 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo let isCodeGenOnly = 0; let Constraints = 
ps.Constraints; - let DisableEncoding = ps.DisableEncoding; // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; @@ -80,7 +79,6 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; @@ -584,7 +582,6 @@ def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> { let SubtargetPredicate = isGFX9Plus in { def V_SWAP_B32 : VOP1_Pseudo<"v_swap_b32", VOP_SWAP_I32, [], 1> { let Constraints = "$vdst = $src1, $vdst1 = $src0"; - let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; } @@ -802,7 +799,6 @@ let SubtargetPredicate = isGFX10Plus in { def V_SWAPREL_B32 : VOP1_Pseudo<"v_swaprel_b32", VOP_SWAP_I32, [], 1> { let Constraints = "$vdst = $src1, $vdst1 = $src0"; - let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; } } // End Uses = [M0] @@ -831,7 +827,6 @@ def VOP_SWAP_I16 : VOPProfile_True16<VOP_I16_I16> { let SubtargetPredicate = isGFX11Plus in { def V_SWAP_B16 : VOP1_Pseudo<"v_swap_b16", VOP_SWAP_I16, [], /* VOP1Only= */true> { let Constraints = "$vdst = $src1, $vdst1 = $src0"; - let DisableEncoding = "$vdst1, $src1"; let SchedRW = [Write64Bit, Write64Bit]; let True16Predicate = UseRealTrue16Insts; } @@ -849,7 +844,6 @@ let SubtargetPredicate = HasPrngInst in defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>; let Constraints = "$vdst = $vdst_in, $src0_out = $src0", - DisableEncoding="$vdst_in,$src0_out", SchedRW = [Write32Bit, Write32Bit], isConvergent = 1 in { let SubtargetPredicate = HasPermlane16Swap in { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 9de7d6d009fe..cff66aaedb11 100644 --- 
a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -105,7 +105,6 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo let isCodeGenOnly = 0; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; @@ -114,7 +113,6 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; @@ -418,12 +416,12 @@ def VOP_MADAK_F16_t16 : VOP_MADAK <f16> { let IsTrue16 = 1; let IsRealTrue16 = 1; let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret; - let Ins32 = (ins VSrcT_f16_Lo128:$src0, VGPRSrc_16_Lo128:$src1, ImmOpType:$imm); + let Ins32 = (ins VSrcT_f16_Lo128:$src0, VGPROp_16_Lo128:$src1, ImmOpType:$imm); } def VOP_MADAK_F16_fake16 : VOP_MADAK <f16> { let IsTrue16 = 1; let DstRC = getVALUDstForVT_fake16<DstVT>.ret; - let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, VGPRSrc_32_Lo128:$src1, ImmOpType:$imm); + let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, VGPROp_32_Lo128:$src1, ImmOpType:$imm); } def VOP_MADAK_F32 : VOP_MADAK <f32>; def VOP_MADAK_F64 : VOP_MADAK <f64>; @@ -454,12 +452,12 @@ def VOP_MADMK_F16_t16 : VOP_MADMK <f16> { let IsTrue16 = 1; let IsRealTrue16 = 1; let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret; - let Ins32 = (ins VSrcT_f16_Lo128:$src0, ImmOpType:$imm, VGPRSrc_16_Lo128:$src1); + let Ins32 = (ins VSrcT_f16_Lo128:$src0, ImmOpType:$imm, VGPROp_16_Lo128:$src1); } def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> { let IsTrue16 = 1; let DstRC = getVALUDstForVT_fake16<DstVT>.ret; - let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, ImmOpType:$imm, VGPRSrc_32_Lo128:$src1); + let 
Ins32 = (ins VSrcFake16_f16_Lo128:$src0, ImmOpType:$imm, VGPROp_32_Lo128:$src1); } def VOP_MADMK_F32 : VOP_MADMK <f32>; def VOP_MADMK_F64 : VOP_MADMK <f64>; @@ -498,14 +496,14 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v HasClamp, HasModifiers, HasModifiers, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret; // We need a dummy src2 tied to dst to track the use of that register for s_delay_alu - let InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X, VGPRSrc_32:$src2X); - let InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y, VGPRSrc_32:$src2Y); + let InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X, VGPROp_32:$src2X); + let InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y, VGPROp_32:$src2Y); let InsVOPD3X = (ins Src0ModVOPD3:$src0X_modifiers, Src0VOPD3:$src0X, Src1ModVOPD3:$vsrc1X_modifiers, Src1RC32:$vsrc1X, - VGPRSrc_32:$src2X); + VGPROp_32:$src2X); let InsVOPD3Y = (ins Src0ModVOPD3:$src0Y_modifiers, Src0VOPD3:$src0Y, Src1ModVOPD3:$vsrc1Y_modifiers, Src1RC32:$vsrc1Y, - VGPRSrc_32:$src2Y); + VGPROp_32:$src2Y); let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, @@ -567,7 +565,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> { let DstRC64 = getVALUDstForVT<DstVT, 1/*IsTrue*/, 1/*IsVOP3Encoding*/>.ret; let Src0RC64 = getVOP3SrcForVT<Src0VT, 1/*IsTrue16*/>.ret; let Src1RC64 = getVOP3SrcForVT<Src1VT, 1/*IsTrue16*/>.ret; - let Src0VOP3DPP = VGPRSrc_16; + let Src0VOP3DPP = VGPROp_16; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret; @@ -599,7 +597,7 @@ def VOP_MAC_F16_fake16 : VOP_MAC <f16> { getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument dpp8:$dpp8, Dpp8FI:$fi); let DstRC64 = getVALUDstForVT<DstVT>.ret; - let Src0VOP3DPP = VGPRSrc_32; + let Src0VOP3DPP = VGPROp_32; let Src1VOP3DPP = 
getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 1/*IsFake16*/>.ret; @@ -798,7 +796,7 @@ def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> { Src2RC64, NumSrcArgs, HasClamp, 1/*HasModifiers*/, 0/*HasSrc2Mods*/, HasOMod, Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/>.ret; - let Src0VOP3DPP = VGPRSrc_16; + let Src0VOP3DPP = VGPROp_16; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 0/*IsFake16*/>.ret; let Src1ModVOP3DPP = getSrcModVOP3VC<f16, 0/*IsFake16*/>.ret; @@ -810,7 +808,7 @@ def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> { let Src0Mod = getSrc0Mod<f16, DstVT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src1Mod = getSrcMod<f16, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_32; + let Src0VOP3DPP = VGPROp_32; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 1/*IsFake16*/>.ret; let Src1ModVOP3DPP = getSrcModVOP3VC<f16, 1/*IsFake16*/>.ret; @@ -889,13 +887,13 @@ defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>; let mayRaiseFPException = 0 in { let OtherPredicates = [HasMadMacF32Insts] in { -let Constraints = "$vdst = $src2", DisableEncoding="$src2", +let Constraints = "$vdst = $src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; let SubtargetPredicate = isGFX6GFX7GFX10 in defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_MAC_LEGACY_F32>; -} // End Constraints = "$vdst = $src2", DisableEncoding="$src2", +} // End Constraints = "$vdst = $src2", // isConvertibleToThreeAddress = 1 let isReMaterializable = 1 in @@ -941,9 +939,9 @@ defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>; // These are special and do not read the exec mask. 
let isConvergent = 1, Uses = []<Register> in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>; -let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { +let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in" in { def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>; -} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in +} // End IsNeverUniform, $vdst = $vdst_in } // End isConvergent = 1 foreach vt = Reg32Types.types in { @@ -1175,7 +1173,6 @@ let True16Predicate = UseFakeTrue16Insts in { } // End FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 let Constraints = "$vdst = $src2", - DisableEncoding="$src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in { let SubtargetPredicate = isGFX10Plus in { @@ -1209,7 +1206,7 @@ let SubtargetPredicate = isGFX8GFX9 in { } // End isReMaterializable = 1 // FIXME: Missing FPDPRounding -let Constraints = "$vdst = $src2", DisableEncoding="$src2", +let Constraints = "$vdst = $src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in { defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; } @@ -1252,7 +1249,6 @@ def : GCNPat< >; let Constraints = "$vdst = $src2", - DisableEncoding = "$src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">; @@ -1261,7 +1257,6 @@ defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">; let SubtargetPredicate = HasFmaLegacy32 in { let Constraints = "$vdst = $src2", - DisableEncoding = "$src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>; @@ -1270,14 +1265,12 @@ defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>; let SubtargetPredicate = HasFmacF64Inst, Constraints = "$vdst = $src2", - DisableEncoding="$src2", isConvertibleToThreeAddress = 1, isCommutable = 1, SchedRW = [WriteDoubleAdd] 
in defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>; let Constraints = "$vdst = $src2", - DisableEncoding="$src2", isConvertibleToThreeAddress = 1, isCommutable = 1, IsDOT = 1 in { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 329d003cf250..19eabb46752b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -782,7 +782,7 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", V_LSHL_ADD_U64_PROF>; let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { - let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { + let Constraints = "$vdst = $vdst_in" in { let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile<>, VOP3_CVT_PK_F8_F32_Profile_t16<>, @@ -807,7 +807,7 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, // These instructions have non-standard use of op_sel. In particular they are // using op_sel bits 2 and 3 while only having two sources. Therefore dummy // src2 is used to hold the op_sel value. 
- let Constraints = "$vdst = $src2", DisableEncoding = "$src2", SubtargetPredicate = isGFX940Plus in { + let Constraints = "$vdst = $src2", SubtargetPredicate = isGFX940Plus in { defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>; defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>; } @@ -1309,7 +1309,7 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> { } let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in { - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + let Constraints = "$vdst = $vdst_in" in { defm V_CVT_SCALEF32_SR_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_fp8_bf16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_BF16_I32_F32>>; defm V_CVT_SCALEF32_SR_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_fp8_f16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F16_I32_F32>>; defm V_CVT_SCALEF32_SR_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_fp8_f32", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F32_I32_F32>>; @@ -1325,7 +1325,7 @@ let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in } let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in { - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + let Constraints = "$vdst = $vdst_in" in { defm V_CVT_SCALEF32_SR_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_bf8_bf16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_BF16_I32_F32>>; defm V_CVT_SCALEF32_SR_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_bf8_f16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F16_I32_F32>>; defm V_CVT_SCALEF32_SR_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_bf8_f32", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F32_I32_F32>>; @@ -1342,7 +1342,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in let SubtargetPredicate = HasFP4ConversionScaleInsts, 
mayRaiseFPException = 0 in { defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>; - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + let Constraints = "$vdst = $vdst_in" in { defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>; let Constraints = "@earlyclobber $vdst" in { defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>; @@ -1358,7 +1358,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in // These instructions have non-standard use of op_sel. In particular they are // using op_sel bits 2 and 3 while only having two sources. - let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in { + let Constraints = "$vdst = $src2" in { defm V_CVT_SCALEF32_PK_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f16", VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOP_I32_V2F16_F32_F32>>; defm V_CVT_SCALEF32_PK_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk_fp4_bf16", VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOP_I32_V2BF16_F32_F32>>; } @@ -1486,10 +1486,10 @@ let SubtargetPredicate = isGFX10Plus in { } // End isCommutable = 1, isReMaterializable = 1 def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>; - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", isConvergent = 1 in { + let Constraints = "$vdst = $vdst_in", isConvergent = 1 in { defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; - } // End $vdst = $vdst_in, DisableEncoding $vdst_in, isConvergent = 1 + } // End $vdst = $vdst_in, isConvergent = 1 foreach vt = Reg32Types.types in { def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>; @@ -1532,10 +1532,10 @@ let True16Predicate = UseFakeTrue16Insts in { } // End 
True16Predicate = UseFakeTrue16Insts let SubtargetPredicate = isGFX12Plus in { - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + let Constraints = "$vdst = $vdst_in" in { defm V_PERMLANE16_VAR_B32 : VOP3Inst<"v_permlane16_var_b32", VOP3_PERMLANE_VAR_Profile>; defm V_PERMLANEX16_VAR_B32 : VOP3Inst<"v_permlanex16_var_b32", VOP3_PERMLANE_VAR_Profile>; - } // End $vdst = $vdst_in, DisableEncoding $vdst_in + } // End $vdst = $vdst_in def : PermlaneVarPat<int_amdgcn_permlane16_var, V_PERMLANE16_VAR_B32_e64>; def : PermlaneVarPat<int_amdgcn_permlanex16_var, V_PERMLANEX16_VAR_B32_e64>; @@ -1763,7 +1763,7 @@ let SubtargetPredicate = isGFX1250Plus in { // These instructions have non-standard use of op_sel. They are using bits 2 and 3 of opsel // to select a byte in the vdst. Bits 0 and 1 are unused. - let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { + let Constraints = "$vdst = $vdst_in" in { defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile, VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile, @@ -1850,7 +1850,7 @@ class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, Va >; let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in { - let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { + let Constraints = "$vdst = $vdst_in" in { defm V_CVT_SR_F16_F32 : VOP3Inst<"v_cvt_sr_f16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_F16_F32_I32>>; defm V_CVT_SR_BF16_F32 : VOP3Inst<"v_cvt_sr_bf16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_BF16_F32_I32>>; } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index ce280d484da1..6f778a0d262a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -44,7 +44,7 @@ class 
VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, FP16InputMods:$src1_modifiers, Src1RC:$src1, FP16InputMods:$src2_modifiers, Src2RC:$src2); dag dpp_srcs = - (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, + (ins FPVRegInputMods:$src0_modifiers, VGPROp_32:$src0, FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1, FP16InputMods:$src2_modifiers, Src2RC:$src2); @@ -84,7 +84,6 @@ multiclass VOP3PInst<string OpName, VOPProfile P, multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> { def NAME : VOP3P_Pseudo<OpName, P> { let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); - let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); } let SubtargetPredicate = isGFX11Plus in { if P.HasExtVOP3DPP then @@ -92,7 +91,6 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> { let VOP3P = 1; let PseudoInstr = OpName#"_dpp"; let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); - let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); } } // end SubtargetPredicate = isGFX11Plus } @@ -1166,7 +1164,7 @@ let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in { } // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> { - let Constraints = "$vdst = $src2", DisableEncoding = "$src2", + let Constraints = "$vdst = $src2", isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in { def _e64 : MAIInst<OpName, !cast<VOPProfileSMFMAC>("VOPProfileSMFMAC_" # P), node>; } @@ -1520,8 +1518,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt), (ins)); dag MatrixScaleSrc = !if(HasMatrixScale, - !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1), - (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)), + !if(Scale16, (ins VCSrc_b64_Lo256:$scale_src0, VCSrc_b64_Lo256:$scale_src1), + (ins 
VCSrc_b32_Lo256:$scale_src0, VCSrc_b32_Lo256:$scale_src1)), (ins)); dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale, MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt), @@ -1859,8 +1857,8 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64>>; +defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 2c1193509da9..2730ec52294e 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -100,7 +100,7 @@ multiclass VOPC_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, ValueType let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_16; + let Src0VOP3DPP = VGPROp_16; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret; @@ -126,7 +126,7 @@ multiclass VOPC_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, ValueType let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret; let 
Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_32; + let Src0VOP3DPP = VGPROp_32; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret; @@ -173,7 +173,7 @@ multiclass VOPC_NoSdst_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, Va let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_16; + let Src0VOP3DPP = VGPROp_16; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret; @@ -197,7 +197,7 @@ multiclass VOPC_NoSdst_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, Va let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_32; + let Src0VOP3DPP = VGPROp_32; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret; @@ -251,7 +251,6 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo let isCodeGenOnly = 0; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; @@ -259,7 +258,6 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; @@ -894,7 +892,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType // DPP8 forbids modifiers and can 
inherit from VOPC_Profile let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VCSrc_b32:$src1); + dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPROp_32:$src0, VCSrc_b32:$src1); let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel), (ins))); let AsmVOP3Base = "$sdst, $src0_modifiers, $src1"; @@ -917,7 +915,7 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> { let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_16; + let Src0VOP3DPP = VGPROp_16; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret; @@ -943,7 +941,7 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> { let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_32; + let Src0VOP3DPP = VGPROp_32; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret; @@ -987,7 +985,7 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> { let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_16; + let Src0VOP3DPP = VGPROp_16; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret; @@ -1011,7 +1009,7 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> { let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret; let Src1ModDPP = 
getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret; - let Src0VOP3DPP = VGPRSrc_32; + let Src0VOP3DPP = VGPROp_32; let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret; diff --git a/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/llvm/lib/Target/AMDGPU/VOPDInstructions.td index 3e7af12f6b60..f416c0654048 100644 --- a/llvm/lib/Target/AMDGPU/VOPDInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPDInstructions.td @@ -138,10 +138,6 @@ class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY, string ConstraintsY = !if(hasSrc2AccY, "$src2Y = $vdstY", ""); let Constraints = ConstraintsX # !if(!and(hasSrc2AccX, hasSrc2AccY), ", ", "") # ConstraintsY; - string DisableEncodingX = !if(hasSrc2AccX, "$src2X", ""); - string DisableEncodingY = !if(hasSrc2AccY, "$src2Y", ""); - let DisableEncoding = - DisableEncodingX # !if(!and(hasSrc2AccX, hasSrc2AccY), ", ", "") # DisableEncodingY; let Uses = RegListUnion<VDX.Uses, VDY.Uses>.ret; let Defs = RegListUnion<VDX.Defs, VDY.Defs>.ret; @@ -228,7 +224,7 @@ foreach Gen = [GFX11GenD, GFX12GenD, GFX1250GenD] in { defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32")); defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32")); defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2) # Gen.Suffix; - defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY); + defvar outs = (outs VGPROp_32:$vdstX, VOPDDstYOperand:$vdstY); if !or(isOpXMADK, isOpYMADK) then { // If Both X and Y are MADK, the mandatory literal of X additionally must // use an alternate operand format which defers to the 'real' Y literal. 
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 3cad5a1c2c37..5550a0c08b91 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -187,7 +187,6 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; @@ -807,7 +806,6 @@ class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> : let hasSideEffects = ps.hasSideEffects; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; // Copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; @@ -817,7 +815,6 @@ class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> : let UseNamedOperandTable = ps.UseNamedOperandTable; let DecoderNamespace = ps.DecoderNamespace; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let Uses = ps.Uses; let Defs = ps.Defs; @@ -841,7 +838,6 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : let hasSideEffects = ps.hasSideEffects; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let SubtargetPredicate = ps.SubtargetPredicate; let AssemblerPredicate = HasSDWA9; @@ -854,7 +850,6 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : let AsmMatchConverter = ps.AsmMatchConverter; let UseNamedOperandTable = ps.UseNamedOperandTable; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let Uses = ps.Uses; let Defs = ps.Defs; @@ -1037,7 +1032,6 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); - 
let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); let DecoderNamespace = "GFX8"; } @@ -1066,7 +1060,6 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : let hasSideEffects = ps.hasSideEffects; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; // Copy relevant pseudo op flags let isConvergent = ps.isConvergent; @@ -1079,7 +1072,6 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : let UseNamedOperandTable = ps.UseNamedOperandTable; let DecoderNamespace = ps.DecoderNamespace; let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let Uses = ps.Uses; let Defs = ps.Defs; @@ -1109,7 +1101,6 @@ class VOP_DPP_Base <string OpName, VOPProfile P, let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); - let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); let DecoderNamespace = "GFX8"; } @@ -1228,7 +1219,6 @@ class VOP_DPP8_Base<string OpName, VOPProfile P, dag InsDPP8 = P.InsDPP8, string let AssemblerPredicate = HasDPP8; let AsmVariantName = AMDGPUAsmVariants.DPP; let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); - let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); } class VOP_DPP8<string OpName, VOPProfile P> : diff --git a/llvm/lib/Target/ARC/ARCInstrFormats.td b/llvm/lib/Target/ARC/ARCInstrFormats.td index bd2ed0057617..0560bb1dc966 100644 --- a/llvm/lib/Target/ARC/ARCInstrFormats.td +++ b/llvm/lib/Target/ARC/ARCInstrFormats.td @@ -964,12 +964,10 @@ class F16_OP_U7<bit i, string asmstr> : // Special types for different instruction operands. 
def ccond : Operand<i32> { - let MIOperandInfo = (ops i32imm); let PrintMethod = "printPredicateOperand"; } def brccond : Operand<i32> { - let MIOperandInfo = (ops i32imm); let PrintMethod = "printBRCCPredicateOperand"; } diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp index 8a89bdb546f3..05bcb3596ac4 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp +++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp @@ -44,7 +44,7 @@ enum TSFlagsConstants { void ARCInstrInfo::anchor() {} ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST) - : ARCGenInstrInfo(ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {} + : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {} static bool isZeroImm(const MachineOperand &Op) { return Op.isImm() && Op.getImm() == 0; diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.td b/llvm/lib/Target/ARC/ARCInstrInfo.td index f26b49119cab..8ff5f4a39ca7 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.td +++ b/llvm/lib/Target/ARC/ARCInstrInfo.td @@ -18,7 +18,7 @@ include "ARCInstrFormats.td" // Operand for printing out a condition code. 
let PrintMethod = "printCCOperand" in - def CCOp : PredicateOperand<i32, (ops i32imm), (ops)>; + def CCOp : PredicateOperand<i32, (ops), (ops)>; // The "u6" operand of a RRU6-type instruction let PrintMethod = "printU6" in { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 9e4dbecc16a8..5c35b3327c16 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -107,9 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = { { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, }; -ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) - : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), - Subtarget(STI) { +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI) + : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), + Subtarget(STI) { for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) llvm_unreachable("Duplicated entries?"); @@ -6730,7 +6730,7 @@ bool ARMPipelinerLoopInfo::tooMuchRegisterPressure(SwingSchedulerDAG &SSD, Register Reg = S.getReg(); auto CIter = CrossIterationNeeds.find(Reg.id()); if (CIter != CrossIterationNeeds.end()) { - auto Stg2 = SMS.stageScheduled(const_cast<SUnit *>(S.getSUnit())); + auto Stg2 = SMS.stageScheduled(S.getSUnit()); assert(Stg2 <= Stg && "Data dependence upon earlier stage"); if (Stg - Stg2 < MAX_STAGES) CIter->second.set(Stg - Stg2); diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp index ec907995e3ab..3d8ebfeae81d 100644 --- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp +++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp @@ -218,7 +218,7 @@ bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) { return false; LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Running on " << MF.getName() << "\n"); MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); - TII = 
static_cast<const ARMBaseInstrInfo *>(ST.getInstrInfo()); + TII = ST.getInstrInfo(); BBUtils = std::make_unique<ARMBasicBlockUtils>(MF); MF.RenumberBlocks(); BBUtils->computeAllBlockSizes(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 12d2d678ff63..d4d3c7009527 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -529,56 +529,56 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, const RTLIB::LibcallImpl Impl; } LibraryCalls[] = { // Single-precision floating-point arithmetic. - { RTLIB::ADD_F32, RTLIB::__addsf3vfp }, - { RTLIB::SUB_F32, RTLIB::__subsf3vfp }, - { RTLIB::MUL_F32, RTLIB::__mulsf3vfp }, - { RTLIB::DIV_F32, RTLIB::__divsf3vfp }, + { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp }, + { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp }, + { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp }, + { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp }, // Double-precision floating-point arithmetic. - { RTLIB::ADD_F64, RTLIB::__adddf3vfp }, - { RTLIB::SUB_F64, RTLIB::__subdf3vfp }, - { RTLIB::MUL_F64, RTLIB::__muldf3vfp }, - { RTLIB::DIV_F64, RTLIB::__divdf3vfp }, + { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp }, + { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp }, + { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp }, + { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp }, // Single-precision comparisons. 
- { RTLIB::OEQ_F32, RTLIB::__eqsf2vfp }, - { RTLIB::UNE_F32, RTLIB::__nesf2vfp }, - { RTLIB::OLT_F32, RTLIB::__ltsf2vfp }, - { RTLIB::OLE_F32, RTLIB::__lesf2vfp }, - { RTLIB::OGE_F32, RTLIB::__gesf2vfp }, - { RTLIB::OGT_F32, RTLIB::__gtsf2vfp }, - { RTLIB::UO_F32, RTLIB::__unordsf2vfp }, + { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp }, + { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp }, + { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp }, + { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp }, + { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp }, + { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp }, + { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp }, // Double-precision comparisons. - { RTLIB::OEQ_F64, RTLIB::__eqdf2vfp }, - { RTLIB::UNE_F64, RTLIB::__nedf2vfp }, - { RTLIB::OLT_F64, RTLIB::__ltdf2vfp }, - { RTLIB::OLE_F64, RTLIB::__ledf2vfp }, - { RTLIB::OGE_F64, RTLIB::__gedf2vfp }, - { RTLIB::OGT_F64, RTLIB::__gtdf2vfp }, - { RTLIB::UO_F64, RTLIB::__unorddf2vfp }, + { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp }, + { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp }, + { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp }, + { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp }, + { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp }, + { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp }, + { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp }, // Floating-point to integer conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. - { RTLIB::FPTOSINT_F64_I32, RTLIB::__fixdfsivfp }, - { RTLIB::FPTOUINT_F64_I32, RTLIB::__fixunsdfsivfp }, - { RTLIB::FPTOSINT_F32_I32, RTLIB::__fixsfsivfp }, - { RTLIB::FPTOUINT_F32_I32, RTLIB::__fixunssfsivfp }, + { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp }, + { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp }, + { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp }, + { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp }, // Conversions between floating types. 
- { RTLIB::FPROUND_F64_F32, RTLIB::__truncdfsf2vfp }, - { RTLIB::FPEXT_F32_F64, RTLIB::__extendsfdf2vfp }, + { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp }, + { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp }, // Integer to floating-point conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. // FIXME: There appears to be some naming inconsistency in ARM libgcc: // e.g., __floatunsidf vs. __floatunssidfvfp. - { RTLIB::SINTTOFP_I32_F64, RTLIB::__floatsidfvfp }, - { RTLIB::UINTTOFP_I32_F64, RTLIB::__floatunssidfvfp }, - { RTLIB::SINTTOFP_I32_F32, RTLIB::__floatsisfvfp }, - { RTLIB::UINTTOFP_I32_F32, RTLIB::__floatunssisfvfp }, + { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp }, + { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp }, + { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp }, + { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp }, }; // clang-format on @@ -3403,7 +3403,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, // position-independent addressing modes. if (Subtarget->genExecuteOnly()) { auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); - auto T = const_cast<Type*>(CP->getType()); + auto *T = CP->getType(); auto C = const_cast<Constant*>(CP->getConstVal()); auto M = DAG.getMachineFunction().getFunction().getParent(); auto GV = new GlobalVariable( @@ -5570,7 +5570,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, llvm_unreachable("Unknown VFP cmp argument!"); } -/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some +/// OptimizeVFPBrcond - With nnan, it's legal to optimize some /// f32 and even f64 comparisons to integer ones. 
SDValue ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { @@ -5712,9 +5712,12 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp); } - if (getTargetMachine().Options.UnsafeFPMath && - (CC == ISD::SETEQ || CC == ISD::SETOEQ || - CC == ISD::SETNE || CC == ISD::SETUNE)) { + SDNodeFlags Flags = Op->getFlags(); + if ((getTargetMachine().Options.UnsafeFPMath || Flags.hasNoNaNs()) && + (DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() && + DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE()) && + (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE || + CC == ISD::SETUNE)) { if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) return Result; } @@ -10539,19 +10542,11 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const { } // Generate the operation with flags - SDValue OpWithFlags; - if (Opcode == ARMISD::ADDC) { - // Use ADDC: LHS + RHS (where RHS was 0 - X, now X) - OpWithFlags = DAG.getNode(ARMISD::ADDC, dl, - DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); - } else { - // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags) - OpWithFlags = DAG.getNode(ARMISD::SUBC, dl, - DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); - } + SDValue OpWithFlags = + DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS); - SDValue OpResult = OpWithFlags.getValue(0); // The operation result - SDValue Flags = OpWithFlags.getValue(1); // The flags + SDValue OpResult = OpWithFlags.getValue(0); + SDValue Flags = OpWithFlags.getValue(1); // Constants for conditional moves SDValue One = DAG.getConstant(1, dl, MVT::i32); @@ -20073,6 +20068,29 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = KnownOp0.intersectWith(KnownOp1); break; } + case ARMISD::VORRIMM: + case ARMISD::VBICIMM: { + unsigned Encoded = Op.getConstantOperandVal(1); + unsigned DecEltBits = 0; + uint64_t 
DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits); + + unsigned EltBits = Op.getScalarValueSizeInBits(); + if (EltBits != DecEltBits) { + // Be conservative: only update Known when EltBits == DecEltBits. + // This is believed to always be true for VORRIMM/VBICIMM today, but if + // that changes in the future, doing nothing here is safer than risking + // subtle bugs. + break; + } + + KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); + bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM; + APInt Imm(DecEltBits, DecodedVal); + + Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm); + Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm); + break; + } } } @@ -20200,37 +20218,6 @@ bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( // ARM Inline Assembly Support //===----------------------------------------------------------------------===// -bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { - // Looking for "rev" which is V6+. - if (!Subtarget->hasV6Ops()) - return false; - - InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); - StringRef AsmStr = IA->getAsmString(); - SmallVector<StringRef, 4> AsmPieces; - SplitString(AsmStr, AsmPieces, ";\n"); - - switch (AsmPieces.size()) { - default: return false; - case 1: - AsmStr = AsmPieces[0]; - AsmPieces.clear(); - SplitString(AsmStr, AsmPieces, " \t,"); - - // rev $0, $1 - if (AsmPieces.size() == 3 && AsmPieces[0] == "rev" && - AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && - IA->getConstraintString().starts_with("=l,l")) { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (Ty && Ty->getBitWidth() == 32) - return IntrinsicLowering::LowerToByteSwap(CI); - } - break; - } - - return false; -} - const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { // At this point, we have to lower this constraint to something else, so we // lower it to an "r" or "w". 
However, by doing this we will force the result @@ -21379,12 +21366,25 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; } +bool ARMTargetLowering::canCreateUndefOrPoisonForTargetNode( + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { + unsigned Opcode = Op.getOpcode(); + switch (Opcode) { + case ARMISD::VORRIMM: + case ARMISD::VBICIMM: + return false; + } + return TargetLowering::canCreateUndefOrPoisonForTargetNode( + Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth); +} + bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { - return Subtarget->hasV6T2Ops(); + return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only(); } bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { - return Subtarget->hasV6T2Ops(); + return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only(); } bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial( @@ -21706,13 +21706,15 @@ bool ARMTargetLowering::lowerInterleavedLoad( bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store, Value *LaneMask, ShuffleVectorInst *SVI, - unsigned Factor) const { + unsigned Factor, + const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); auto *SI = dyn_cast<StoreInst>(Store); if (!SI) return false; - assert(!LaneMask && "Unexpected mask on store"); + assert(!LaneMask && GapMask.popcount() == Factor && + "Unexpected mask on store"); auto *VecTy = cast<FixedVectorType>(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 0185c8ddd492..ccf6d509313b 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -534,8 +534,6 @@ class VectorType; const APInt &DemandedElts, TargetLoweringOpt &TLO) const 
override; - bool ExpandInlineAsm(CallInst *CI) const override; - ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. @@ -688,8 +686,8 @@ class VectorType; ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, - ShuffleVectorInst *SVI, - unsigned Factor) const override; + ShuffleVectorInst *SVI, unsigned Factor, + const APInt &GapMask) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override; TargetLoweringBase::AtomicExpansionKind @@ -709,6 +707,10 @@ class VectorType; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; + bool canCreateUndefOrPoisonForTargetNode( + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override; + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override { // Do not merge to larger than i32. 
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index 9eb911406914..e50740f7d57c 100644 --- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -160,7 +160,7 @@ def CondCodeOperand : AsmOperandClass { let DefaultMethod = "defaultCondCodeOp"; let IsOptional = true; } -def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm), +def pred : PredicateOperand<OtherVT, (ops i32imm, CCR), (ops (i32 14), (i32 zero_reg))> { let PrintMethod = "printPredicateOperand"; let ParserMatchClass = CondCodeOperand; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 934ec52c6f1e..bdb16d7d3926 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -164,10 +164,9 @@ def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntUnaryOp>; def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, - [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; + [SDNPHasChain, SDNPOutGlue]>; def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, - [SDNPHasChain, SDNPSideEffect, - SDNPOptInGlue, SDNPOutGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" , SDT_ARMStructByVal, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, @@ -766,7 +765,6 @@ class MVEVectorIndexOperand<int NumLanes> : AsmOperandClass { class MVEVectorIndex<int NumLanes> : Operand<i32> { let PrintMethod = "printVectorIndex"; let ParserMatchClass = MVEVectorIndexOperand<NumLanes>; - let MIOperandInfo = (ops i32imm); } // shift_imm: An integer that encodes a shift amount and the type of shift @@ -1182,7 +1180,6 @@ def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; } def postidx_imm8 : MemOperand { let PrintMethod = "printPostIdxImm8Operand"; let ParserMatchClass = PostIdxImm8AsmOperand; - let MIOperandInfo = (ops i32imm); } // 
postidx_imm8s4 := +/- [0,1020] @@ -6448,7 +6445,7 @@ def : ARMInstAlias<"neg${s}${p} $Rd, $Rm", (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>; // Pre-v6, 'mov r0, r0' was used as a NOP encoding. -def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg), 0>, +def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, (cc_out zero_reg)), 0>, Requires<[IsARM, NoV6]>; // MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index 7485ef569445..37f0103363b9 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -95,28 +95,24 @@ def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{ }]> { let ParserMatchClass = VectorIndex8Operand; let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i32imm); } def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{ return ((uint64_t)Imm) < 4; }]> { let ParserMatchClass = VectorIndex16Operand; let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i32imm); } def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{ return ((uint64_t)Imm) < 2; }]> { let ParserMatchClass = VectorIndex32Operand; let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i32imm); } def VectorIndex64 : Operand<i32>, ImmLeaf<i32, [{ return ((uint64_t)Imm) < 1; }]> { let ParserMatchClass = VectorIndex64Operand; let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i32imm); } // Register list of one D register. diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td index e38cafdf55c4..0c5ea3e0fa8d 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1209,8 +1209,9 @@ def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255_expr:$imm8), IIC_iMOVi, } // Because we have an explicit tMOVSr below, we need an alias to handle // the immediate "movs" form here. Blech. 
-def : tInstAlias <"movs $Rdn, $imm8", - (tMOVi8 tGPR:$Rdn, CPSR, imm0_255_expr:$imm8, 14, zero_reg)>; +def : tInstAlias<"movs $Rdn, $imm8", + (tMOVi8 tGPR:$Rdn, (s_cc_out CPSR), + imm0_255_expr:$imm8, (pred 14, zero_reg))>; // A7-73: MOV(2) - mov setting flag. @@ -1764,7 +1765,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00 // encoding is available on ARMv6K, but we don't differentiate that finely. -def : InstAlias<"nop", (tMOVr R8, R8, 14, zero_reg), 0>, Requires<[IsThumb, IsThumb1Only]>; +def : InstAlias<"nop", (tMOVr R8, R8, (pred 14, zero_reg)), 0>, + Requires<[IsThumb, IsThumb1Only]>; // "neg" is and alias for "rsb rd, rn, #0" diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 8f56fb0938dd..c00d616670b5 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2222,11 +2222,11 @@ def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rm), IIC_iMOVr, let Inst{7-4} = 0b0000; } def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, - pred:$p, zero_reg)>; + pred:$p, (cc_out zero_reg))>; def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, - pred:$p, CPSR)>; + pred:$p, (cc_out CPSR))>; def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, - pred:$p, CPSR)>; + pred:$p, (cc_out CPSR))>; // AddedComplexity to ensure isel tries t2MOVi before t2MOVi16. let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, @@ -2244,14 +2244,14 @@ def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi, // cc_out is handled as part of the explicit mnemonic in the parser for 'mov'. // Use aliases to get that to play nice here. 
def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, - pred:$p, CPSR)>; + pred:$p, (cc_out CPSR))>; def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, - pred:$p, CPSR)>; + pred:$p, (cc_out CPSR))>; def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, - pred:$p, zero_reg)>; + pred:$p, (cc_out zero_reg))>; def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, - pred:$p, zero_reg)>; + pred:$p, (cc_out zero_reg))>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi, @@ -5122,8 +5122,10 @@ def : InstAlias<"isb${p}.w", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>; // Non-predicable aliases of a predicable DSB: the predicate is (14, zero_reg) where // 14 = AL (always execute) and zero_reg = "instruction doesn't read the CPSR". -def : InstAlias<"ssbb", (t2DSB 0x0, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>; -def : InstAlias<"pssbb", (t2DSB 0x4, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>; +def : InstAlias<"ssbb", (t2DSB 0x0, (pred 14, zero_reg)), 1>, + Requires<[HasDB, IsThumb2]>; +def : InstAlias<"pssbb", (t2DSB 0x4, (pred 14, zero_reg)), 1>, + Requires<[HasDB, IsThumb2]>; // Armv8-R 'Data Full Barrier' def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>; @@ -5340,7 +5342,8 @@ def : t2InstAlias<"sxth${p} $Rd, $Rm$rot", // "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like // for isel. 
def : t2InstSubst<"mov${p} $Rd, $imm", - (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>; + (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, + (cc_out zero_reg))>; def : t2InstSubst<"mvn${s}${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>; // Same for AND <--> BIC diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index fc12f050fa5a..cdff649ecfa5 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -206,7 +206,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) : ST(ST) { getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64}); - if (ST.hasV5TOps()) { + if (ST.hasV5TOps() && !ST.isThumb1Only()) { getActionDefinitionsBuilder(G_CTLZ) .legalFor({s32, s32}) .clampScalar(1, s32, s32) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 6b2854171c81..9b250e6cac3a 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1916,9 +1916,11 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost( } InstructionCost -ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, - VectorType *ValTy, +ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, + Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind) const { + if (RedOpcode != Instruction::Add) + return InstructionCost::getInvalid(CostKind); EVT ValVT = TLI->getValueType(DL, ValTy); EVT ResVT = TLI->getValueType(DL, ResTy); @@ -1939,7 +1941,8 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, return ST->getMVEVectorCostFactor(CostKind) * LT.first; } - return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind); + return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy, + CostKind); } InstructionCost diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h 
b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index cdd8bcb9f741..0810c5532ed9 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -299,7 +299,8 @@ public: VectorType *ValTy, std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const override; InstructionCost - getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy, + getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, + VectorType *ValTy, TTI::TargetCostKind CostKind) const override; InstructionCost diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt index a39629bd8aeb..fa778cad4af8 100644 --- a/llvm/lib/Target/ARM/CMakeLists.txt +++ b/llvm/lib/Target/ARM/CMakeLists.txt @@ -6,7 +6,8 @@ tablegen(LLVM ARMGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM ARMGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM ARMGenCallingConv.inc -gen-callingconv) tablegen(LLVM ARMGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler + -ignore-non-decodable-operands) tablegen(LLVM ARMGenFastISel.inc -gen-fast-isel) tablegen(LLVM ARMGenGlobalISel.inc -gen-global-isel) tablegen(LLVM ARMGenInstrInfo.inc -gen-instr-info) diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 19fa03cdc668..1d19bc89ccf9 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -161,676 +161,13 @@ private: // Forward declare these because the autogenerated code will reference them. // Definitions are further down. 
-static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); 
-static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeRegListOperand(MCInst &Inst, 
unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeTSBInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus -DecodeMemMultipleWritebackInstruction(MCInst &Inst, unsigned Insn, - uint64_t Adddress, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler 
*Decoder); -static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val, - 
uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePostIdxReg(MCInst 
&Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -template <int shift> -static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, uint64_t 
Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeNEONComplexLane64Instruction(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus 
DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -template <int shift> -static DecodeStatus DecodeT2Imm7(MCInst 
&Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder); -template <int shift> -static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -template <int shift, int WriteBack> -static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static 
DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); - -template <bool isSigned, bool isNeg, bool zeroPermitted, int size> -static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned Val, - uint64_t Address, - 
const MCDisassembler *Decoder); -static DecodeStatus DecodeVpredNOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeRestrictedSPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeRestrictedUPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -template <bool Writeback> -static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -template <int shift> -static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -template <int shift> -static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -template <int shift> -static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -template <unsigned MinLog, unsigned MaxLog> -static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); -template <unsigned start> -static DecodeStatus -DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); typedef 
DecodeStatus OperandDecoder(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder); -template <bool scalar, OperandDecoder predicate_decoder> -static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -#include "ARMGenDisassemblerTables.inc" - -static MCDisassembler *createARMDisassembler(const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ARMDisassembler(STI, Ctx, T.createMCInstrInfo()); -} - -// Post-decoding checks -static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size, - uint64_t Address, raw_ostream &CS, - uint32_t Insn, - DecodeStatus Result) { - switch (MI.getOpcode()) { - case ARM::HVC: { - // HVC is undefined if condition = 0xf otherwise upredictable - // if condition != 0xe - uint32_t Cond = (Insn >> 28) & 0xF; - if (Cond == 0xF) - return MCDisassembler::Fail; - if (Cond != 0xE) - return MCDisassembler::SoftFail; - return Result; - } - case ARM::t2ADDri: - case ARM::t2ADDri12: - case ARM::t2ADDrr: - case ARM::t2ADDrs: - case ARM::t2SUBri: - case ARM::t2SUBri12: - case ARM::t2SUBrr: - case ARM::t2SUBrs: - if (MI.getOperand(0).getReg() == ARM::SP && - MI.getOperand(1).getReg() != ARM::SP) - return MCDisassembler::SoftFail; - return Result; - default: return Result; - } -} - -uint64_t 
ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes, - uint64_t Address) const { - // In Arm state, instructions are always 4 bytes wide, so there's no - // point in skipping any smaller number of bytes if an instruction - // can't be decoded. - if (!STI.hasFeature(ARM::ModeThumb)) - return 4; - - // In a Thumb instruction stream, a halfword is a standalone 2-byte - // instruction if and only if its value is less than 0xE800. - // Otherwise, it's the first halfword of a 4-byte instruction. - // - // So, if we can see the upcoming halfword, we can judge on that - // basis, and maybe skip a whole 4-byte instruction that we don't - // know how to decode, without accidentally trying to interpret its - // second half as something else. - // - // If we don't have the instruction data available, we just have to - // recommend skipping the minimum sensible distance, which is 2 - // bytes. - if (Bytes.size() < 2) - return 2; - - uint16_t Insn16 = llvm::support::endian::read<uint16_t>( - Bytes.data(), InstructionEndianness); - return Insn16 < 0xE800 ? 2 : 4; -} - -DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, - ArrayRef<uint8_t> Bytes, - uint64_t Address, - raw_ostream &CS) const { - if (STI.hasFeature(ARM::ModeThumb)) - return getThumbInstruction(MI, Size, Bytes, Address, CS); - return getARMInstruction(MI, Size, Bytes, Address, CS); -} - -DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size, - ArrayRef<uint8_t> Bytes, - uint64_t Address, - raw_ostream &CS) const { - CommentStream = &CS; - - assert(!STI.hasFeature(ARM::ModeThumb) && - "Asked to disassemble an ARM instruction but Subtarget is in Thumb " - "mode!"); - - // We want to read exactly 4 bytes of data. - if (Bytes.size() < 4) { - Size = 0; - return MCDisassembler::Fail; - } - - // Encoded as a 32-bit word in the stream. 
- uint32_t Insn = llvm::support::endian::read<uint32_t>(Bytes.data(), - InstructionEndianness); - - // Calling the auto-generated decoder function. - DecodeStatus Result = - decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result); - } - - struct DecodeTable { - const uint8_t *P; - bool DecodePred; - }; - - const DecodeTable Tables[] = { - {DecoderTableVFP32, false}, {DecoderTableVFPV832, false}, - {DecoderTableNEONData32, true}, {DecoderTableNEONLoadStore32, true}, - {DecoderTableNEONDup32, true}, {DecoderTablev8NEON32, false}, - {DecoderTablev8Crypto32, false}, - }; - - for (auto Table : Tables) { - Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - // Add a fake predicate operand, because we share these instruction - // definitions with Thumb2 where these instructions are predicable. - if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this)) - return MCDisassembler::Fail; - return Result; - } - } - - Result = - decodeInstruction(DecoderTableCoProc32, MI, Insn, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result); - } - - Size = 4; - return MCDisassembler::Fail; -} /// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the /// immediate Value in the MCInst. The immediate Value has had any PC @@ -868,409 +205,7 @@ static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value, Decoder->tryAddingPcLoadReferenceComment(Value, Address); } -// Thumb1 instructions don't have explicit S bits. Rather, they -// implicitly set CPSR. Since it's not represented in the encoding, the -// auto-generated decoder won't inject the CPSR operand. We need to fix -// that as a post-pass. 
-void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const { - const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); - MCInst::iterator I = MI.begin(); - for (unsigned i = 0; i < MCID.NumOperands; ++i, ++I) { - if (I == MI.end()) break; - if (MCID.operands()[i].isOptionalDef() && - MCID.operands()[i].RegClass == ARM::CCRRegClassID) { - if (i > 0 && MCID.operands()[i - 1].isPredicate()) - continue; - MI.insert(I, - MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR)); - return; - } - } - - MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR)); -} - -bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const { - const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); - for (unsigned i = 0; i < MCID.NumOperands; ++i) { - if (ARM::isVpred(MCID.operands()[i].OperandType)) - return true; - } - return false; -} - -// Most Thumb instructions don't have explicit predicates in the -// encoding, but rather get their predicates from IT context. We need -// to fix up the predicate operands using this context information as a -// post-pass. -MCDisassembler::DecodeStatus -ARMDisassembler::AddThumbPredicate(MCInst &MI) const { - MCDisassembler::DecodeStatus S = Success; - - const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits(); - - // A few instructions actually have predicates encoded in them. Don't - // try to overwrite it if we're seeing one of those. - switch (MI.getOpcode()) { - case ARM::tBcc: - case ARM::t2Bcc: - case ARM::tCBZ: - case ARM::tCBNZ: - case ARM::tCPS: - case ARM::t2CPS3p: - case ARM::t2CPS2p: - case ARM::t2CPS1p: - case ARM::t2CSEL: - case ARM::t2CSINC: - case ARM::t2CSINV: - case ARM::t2CSNEG: - case ARM::tMOVSr: - case ARM::tSETEND: - // Some instructions (mostly conditional branches) are not - // allowed in IT blocks. 
- if (ITBlock.instrInITBlock()) - S = SoftFail; - else - return Success; - break; - case ARM::t2HINT: - if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0) - S = SoftFail; - break; - case ARM::tB: - case ARM::t2B: - case ARM::t2TBB: - case ARM::t2TBH: - // Some instructions (mostly unconditional branches) can - // only appears at the end of, or outside of, an IT. - if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock()) - S = SoftFail; - break; - default: - break; - } - - // Warn on non-VPT predicable instruction in a VPT block and a VPT - // predicable instruction in an IT block - if ((!isVectorPredicable(MI) && VPTBlock.instrInVPTBlock()) || - (isVectorPredicable(MI) && ITBlock.instrInITBlock())) - S = SoftFail; - - // If we're in an IT/VPT block, base the predicate on that. Otherwise, - // assume a predicate of AL. - unsigned CC = ARMCC::AL; - unsigned VCC = ARMVCC::None; - if (ITBlock.instrInITBlock()) { - CC = ITBlock.getITCC(); - ITBlock.advanceITState(); - } else if (VPTBlock.instrInVPTBlock()) { - VCC = VPTBlock.getVPTPred(); - VPTBlock.advanceVPTState(); - } - - const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); - - MCInst::iterator CCI = MI.begin(); - for (unsigned i = 0; i < MCID.NumOperands; ++i, ++CCI) { - if (MCID.operands()[i].isPredicate() || CCI == MI.end()) - break; - } - - if (MCID.isPredicable()) { - CCI = MI.insert(CCI, MCOperand::createImm(CC)); - ++CCI; - if (CC == ARMCC::AL) - MI.insert(CCI, MCOperand::createReg(ARM::NoRegister)); - else - MI.insert(CCI, MCOperand::createReg(ARM::CPSR)); - } else if (CC != ARMCC::AL) { - Check(S, SoftFail); - } - - MCInst::iterator VCCI = MI.begin(); - unsigned VCCPos; - for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) { - if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end()) - break; - } - - if (isVectorPredicable(MI)) { - VCCI = MI.insert(VCCI, MCOperand::createImm(VCC)); - ++VCCI; - if (VCC == ARMVCC::None) - VCCI = MI.insert(VCCI, 
MCOperand::createReg(0)); - else - VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0)); - ++VCCI; - VCCI = MI.insert(VCCI, MCOperand::createReg(0)); - ++VCCI; - if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) { - int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO); - assert(TiedOp >= 0 && - "Inactive register in vpred_r is not tied to an output!"); - // Copy the operand to ensure it's not invalidated when MI grows. - MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp))); - } - } else if (VCC != ARMVCC::None) { - Check(S, SoftFail); - } - - return S; -} - -// Thumb VFP instructions are a special case. Because we share their -// encodings between ARM and Thumb modes, and they are predicable in ARM -// mode, the auto-generated decoder will give them an (incorrect) -// predicate operand. We need to rewrite these operands based on the IT -// context as a post-pass. -void ARMDisassembler::UpdateThumbVFPPredicate( - DecodeStatus &S, MCInst &MI) const { - unsigned CC; - CC = ITBlock.getITCC(); - if (CC == 0xF) - CC = ARMCC::AL; - if (ITBlock.instrInITBlock()) - ITBlock.advanceITState(); - else if (VPTBlock.instrInVPTBlock()) { - CC = VPTBlock.getVPTPred(); - VPTBlock.advanceVPTState(); - } - - const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); - ArrayRef<MCOperandInfo> OpInfo = MCID.operands(); - MCInst::iterator I = MI.begin(); - unsigned short NumOps = MCID.NumOperands; - for (unsigned i = 0; i < NumOps; ++i, ++I) { - if (OpInfo[i].isPredicate() ) { - if (CC != ARMCC::AL && !MCID.isPredicable()) - Check(S, SoftFail); - I->setImm(CC); - ++I; - if (CC == ARMCC::AL) - I->setReg(ARM::NoRegister); - else - I->setReg(ARM::CPSR); - return; - } - } -} - -DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, - ArrayRef<uint8_t> Bytes, - uint64_t Address, - raw_ostream &CS) const { - CommentStream = &CS; - - assert(STI.hasFeature(ARM::ModeThumb) && - "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!"); - - // 
We want to read exactly 2 bytes of data. - if (Bytes.size() < 2) { - Size = 0; - return MCDisassembler::Fail; - } - - uint16_t Insn16 = llvm::support::endian::read<uint16_t>( - Bytes.data(), InstructionEndianness); - DecodeStatus Result = - decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 2; - Check(Result, AddThumbPredicate(MI)); - return Result; - } - - Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this, - STI); - if (Result) { - Size = 2; - bool InITBlock = ITBlock.instrInITBlock(); - Check(Result, AddThumbPredicate(MI)); - AddThumb1SBit(MI, InITBlock); - return Result; - } - - Result = - decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 2; - - // Nested IT blocks are UNPREDICTABLE. Must be checked before we add - // the Thumb predicate. - if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock()) - Result = MCDisassembler::SoftFail; - - Check(Result, AddThumbPredicate(MI)); - - // If we find an IT instruction, we need to parse its condition - // code and mask operands so that we can apply them correctly - // to the subsequent instructions. - if (MI.getOpcode() == ARM::t2IT) { - unsigned Firstcond = MI.getOperand(0).getImm(); - unsigned Mask = MI.getOperand(1).getImm(); - ITBlock.setITState(Firstcond, Mask); - - // An IT instruction that would give a 'NV' predicate is unpredictable. - if (Firstcond == ARMCC::AL && !isPowerOf2_32(Mask)) - CS << "unpredictable IT predicate sequence"; - } - - return Result; - } - - // We want to read exactly 4 bytes of data. 
- if (Bytes.size() < 4) { - Size = 0; - return MCDisassembler::Fail; - } - - uint32_t Insn32 = - (uint32_t(Insn16) << 16) | llvm::support::endian::read<uint16_t>( - Bytes.data() + 2, InstructionEndianness); - - Result = - decodeInstruction(DecoderTableMVE32, MI, Insn32, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - - // Nested VPT blocks are UNPREDICTABLE. Must be checked before we add - // the VPT predicate. - if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock()) - Result = MCDisassembler::SoftFail; - - Check(Result, AddThumbPredicate(MI)); - - if (isVPTOpcode(MI.getOpcode())) { - unsigned Mask = MI.getOperand(0).getImm(); - VPTBlock.setVPTState(Mask); - } - - return Result; - } - - Result = - decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - bool InITBlock = ITBlock.instrInITBlock(); - Check(Result, AddThumbPredicate(MI)); - AddThumb1SBit(MI, InITBlock); - return Result; - } - - Result = - decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - Check(Result, AddThumbPredicate(MI)); - return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result); - } - - if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { - Result = - decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - UpdateThumbVFPPredicate(Result, MI); - return Result; - } - } - - Result = - decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - return Result; - } - - if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { - Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this, - STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - Check(Result, AddThumbPredicate(MI)); - return Result; - } - } - - if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) { - uint32_t 
NEONLdStInsn = Insn32; - NEONLdStInsn &= 0xF0FFFFFF; - NEONLdStInsn |= 0x04000000; - Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn, - Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - Check(Result, AddThumbPredicate(MI)); - return Result; - } - } - - if (fieldFromInstruction(Insn32, 24, 4) == 0xF) { - uint32_t NEONDataInsn = Insn32; - NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24 - NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 - NEONDataInsn |= 0x12000000; // Set bits 28 and 25 - Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn, - Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - Check(Result, AddThumbPredicate(MI)); - return Result; - } - - uint32_t NEONCryptoInsn = Insn32; - NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24 - NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 - NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25 - Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn, - Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - return Result; - } - - uint32_t NEONv8Insn = Insn32; - NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26 - Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address, - this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - return Result; - } - } - - uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4); - const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI) - ? DecoderTableThumb2CDE32 - : DecoderTableThumb2CoProc32; - Result = - decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - Check(Result, AddThumbPredicate(MI)); - return Result; - } - - // Advance IT state to prevent next instruction inheriting - // the wrong IT state. 
- if (ITBlock.instrInITBlock()) - ITBlock.advanceITState(); - Size = 0; - return MCDisassembler::Fail; -} - -extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void -LLVMInitializeARMDisassembler() { - TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(), - createARMDisassembler); - TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(), - createARMDisassembler); - TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(), - createARMDisassembler); - TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(), - createARMDisassembler); -} +// Register class decoding functions. static const uint16_t GPRDecoderTable[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3, @@ -1626,6 +561,51 @@ DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, return MCDisassembler::Success; } +static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo > 7) + return MCDisassembler::Fail; + + unsigned Register = QPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static const MCPhysReg QQPRDecoderTable[] = { + ARM::Q0_Q1, ARM::Q1_Q2, ARM::Q2_Q3, ARM::Q3_Q4, + ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7 +}; + +static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo > 6) + return MCDisassembler::Fail; + + unsigned Register = QQPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static const MCPhysReg QQQQPRDecoderTable[] = { + ARM::Q0_Q1_Q2_Q3, ARM::Q1_Q2_Q3_Q4, ARM::Q2_Q3_Q4_Q5, + ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7 +}; + +static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo > 4) + return MCDisassembler::Fail; + + unsigned Register = QQQQPRDecoderTable[RegNo]; + 
Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +// Operand decoding functions. + static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -2422,6 +1402,54 @@ static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned imod = fieldFromInstruction(Insn, 18, 2); + unsigned M = fieldFromInstruction(Insn, 17, 1); + unsigned iflags = fieldFromInstruction(Insn, 6, 3); + unsigned mode = fieldFromInstruction(Insn, 0, 5); + + DecodeStatus S = MCDisassembler::Success; + + // This decoder is called from multiple location that do not check + // the full encoding is valid before they do. + if (fieldFromInstruction(Insn, 5, 1) != 0 || + fieldFromInstruction(Insn, 16, 1) != 0 || + fieldFromInstruction(Insn, 20, 8) != 0x10) + return MCDisassembler::Fail; + + // imod == '01' --> UNPREDICTABLE + // NOTE: Even though this is technically UNPREDICTABLE, we choose to + // return failure here. The '01' imod value is unprintable, so there's + // nothing useful we could do even if we returned UNPREDICTABLE. 
+ + if (imod == 1) return MCDisassembler::Fail; + + if (imod && M) { + Inst.setOpcode(ARM::CPS3p); + Inst.addOperand(MCOperand::createImm(imod)); + Inst.addOperand(MCOperand::createImm(iflags)); + Inst.addOperand(MCOperand::createImm(mode)); + } else if (imod && !M) { + Inst.setOpcode(ARM::CPS2p); + Inst.addOperand(MCOperand::createImm(imod)); + Inst.addOperand(MCOperand::createImm(iflags)); + if (mode) S = MCDisassembler::SoftFail; + } else if (!imod && M) { + Inst.setOpcode(ARM::CPS1p); + Inst.addOperand(MCOperand::createImm(mode)); + if (iflags) S = MCDisassembler::SoftFail; + } else { + // imod == '00' && M == '0' --> UNPREDICTABLE + Inst.setOpcode(ARM::CPS1p); + Inst.addOperand(MCOperand::createImm(mode)); + S = MCDisassembler::SoftFail; + } + + return S; +} + static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -2562,54 +1590,6 @@ static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned imod = fieldFromInstruction(Insn, 18, 2); - unsigned M = fieldFromInstruction(Insn, 17, 1); - unsigned iflags = fieldFromInstruction(Insn, 6, 3); - unsigned mode = fieldFromInstruction(Insn, 0, 5); - - DecodeStatus S = MCDisassembler::Success; - - // This decoder is called from multiple location that do not check - // the full encoding is valid before they do. - if (fieldFromInstruction(Insn, 5, 1) != 0 || - fieldFromInstruction(Insn, 16, 1) != 0 || - fieldFromInstruction(Insn, 20, 8) != 0x10) - return MCDisassembler::Fail; - - // imod == '01' --> UNPREDICTABLE - // NOTE: Even though this is technically UNPREDICTABLE, we choose to - // return failure here. The '01' imod value is unprintable, so there's - // nothing useful we could do even if we returned UNPREDICTABLE. 
- - if (imod == 1) return MCDisassembler::Fail; - - if (imod && M) { - Inst.setOpcode(ARM::CPS3p); - Inst.addOperand(MCOperand::createImm(imod)); - Inst.addOperand(MCOperand::createImm(iflags)); - Inst.addOperand(MCOperand::createImm(mode)); - } else if (imod && !M) { - Inst.setOpcode(ARM::CPS2p); - Inst.addOperand(MCOperand::createImm(imod)); - Inst.addOperand(MCOperand::createImm(iflags)); - if (mode) S = MCDisassembler::SoftFail; - } else if (!imod && M) { - Inst.setOpcode(ARM::CPS1p); - Inst.addOperand(MCOperand::createImm(mode)); - if (iflags) S = MCDisassembler::SoftFail; - } else { - // imod == '00' && M == '0' --> UNPREDICTABLE - Inst.setOpcode(ARM::CPS1p); - Inst.addOperand(MCOperand::createImm(mode)); - S = MCDisassembler::SoftFail; - } - - return S; -} - static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -2760,28 +1740,6 @@ static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - - unsigned Pred = fieldFromInstruction(Insn, 28, 4); - unsigned Rn = fieldFromInstruction(Insn, 16, 4); - unsigned Rm = fieldFromInstruction(Insn, 0, 4); - - if (Pred == 0xF) - return DecodeSETPANInstruction(Inst, Insn, Address, Decoder); - - if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder))) - return MCDisassembler::Fail; - - return S; -} - static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -2811,6 +1769,28 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus 
DecodeTSTInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Pred = fieldFromInstruction(Insn, 28, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + + if (Pred == 0xF) + return DecodeSETPANInstruction(Inst, Insn, Address, Decoder); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -3232,61 +2212,6 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned type = fieldFromInstruction(Insn, 8, 4); - unsigned align = fieldFromInstruction(Insn, 4, 2); - if (type == 6 && (align & 2)) return MCDisassembler::Fail; - if (type == 7 && (align & 2)) return MCDisassembler::Fail; - if (type == 10 && align == 3) return MCDisassembler::Fail; - - unsigned load = fieldFromInstruction(Insn, 21, 1); - return load ? 
DecodeVLDInstruction(Inst, Insn, Address, Decoder) - : DecodeVSTInstruction(Inst, Insn, Address, Decoder); -} - -static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned size = fieldFromInstruction(Insn, 6, 2); - if (size == 3) return MCDisassembler::Fail; - - unsigned type = fieldFromInstruction(Insn, 8, 4); - unsigned align = fieldFromInstruction(Insn, 4, 2); - if (type == 8 && align == 3) return MCDisassembler::Fail; - if (type == 9 && align == 3) return MCDisassembler::Fail; - - unsigned load = fieldFromInstruction(Insn, 21, 1); - return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder) - : DecodeVSTInstruction(Inst, Insn, Address, Decoder); -} - -static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned size = fieldFromInstruction(Insn, 6, 2); - if (size == 3) return MCDisassembler::Fail; - - unsigned align = fieldFromInstruction(Insn, 4, 2); - if (align & 2) return MCDisassembler::Fail; - - unsigned load = fieldFromInstruction(Insn, 21, 1); - return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder) - : DecodeVSTInstruction(Inst, Insn, Address, Decoder); -} - -static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned size = fieldFromInstruction(Insn, 6, 2); - if (size == 3) return MCDisassembler::Fail; - - unsigned load = fieldFromInstruction(Insn, 21, 1); - return load ? 
DecodeVLDInstruction(Inst, Insn, Address, Decoder) - : DecodeVSTInstruction(Inst, Insn, Address, Decoder); -} - static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -3558,6 +2483,61 @@ static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned type = fieldFromInstruction(Insn, 8, 4); + unsigned align = fieldFromInstruction(Insn, 4, 2); + if (type == 6 && (align & 2)) return MCDisassembler::Fail; + if (type == 7 && (align & 2)) return MCDisassembler::Fail; + if (type == 10 && align == 3) return MCDisassembler::Fail; + + unsigned load = fieldFromInstruction(Insn, 21, 1); + return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder) + : DecodeVSTInstruction(Inst, Insn, Address, Decoder); +} + +static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned size = fieldFromInstruction(Insn, 6, 2); + if (size == 3) return MCDisassembler::Fail; + + unsigned type = fieldFromInstruction(Insn, 8, 4); + unsigned align = fieldFromInstruction(Insn, 4, 2); + if (type == 8 && align == 3) return MCDisassembler::Fail; + if (type == 9 && align == 3) return MCDisassembler::Fail; + + unsigned load = fieldFromInstruction(Insn, 21, 1); + return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder) + : DecodeVSTInstruction(Inst, Insn, Address, Decoder); +} + +static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned size = fieldFromInstruction(Insn, 6, 2); + if (size == 3) return MCDisassembler::Fail; + + unsigned align = fieldFromInstruction(Insn, 4, 2); + if (align & 2) return MCDisassembler::Fail; + + unsigned load = fieldFromInstruction(Insn, 21, 1); + return load ? 
DecodeVLDInstruction(Inst, Insn, Address, Decoder) + : DecodeVSTInstruction(Inst, Insn, Address, Decoder); +} + +static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned size = fieldFromInstruction(Insn, 6, 2); + if (size == 3) return MCDisassembler::Fail; + + unsigned load = fieldFromInstruction(Insn, 21, 1); + return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder) + : DecodeVSTInstruction(Inst, Insn, Address, Decoder); +} + static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -4063,6 +3043,60 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, return S; } +static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned U = fieldFromInstruction(Insn, 23, 1); + int imm = fieldFromInstruction(Insn, 0, 12); + + const FeatureBitset &featureBits = + Decoder->getSubtargetInfo().getFeatureBits(); + + bool hasV7Ops = featureBits[ARM::HasV7Ops]; + + if (Rt == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRBpci: + case ARM::t2LDRHpci: + Inst.setOpcode(ARM::t2PLDpci); + break; + case ARM::t2LDRSBpci: + Inst.setOpcode(ARM::t2PLIpci); + break; + case ARM::t2LDRSHpci: + return MCDisassembler::Fail; + default: + break; + } + } + + switch(Inst.getOpcode()) { + case ARM::t2PLDpci: + break; + case ARM::t2PLIpci: + if (!hasV7Ops) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!U) { + // Special case for #-0. 
+ if (imm == 0) + imm = INT32_MIN; + else + imm = -imm; + } + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -4232,6 +3266,33 @@ static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 13, 4); + unsigned imm = fieldFromInstruction(Val, 0, 12); + + // Thumb stores cannot use PC as dest register. + switch (Inst.getOpcode()) { + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + if (Rn == 15) + return MCDisassembler::Fail; + break; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -4352,60 +3413,6 @@ static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - - unsigned Rt = fieldFromInstruction(Insn, 12, 4); - unsigned U = fieldFromInstruction(Insn, 23, 1); - int imm = fieldFromInstruction(Insn, 0, 12); - - const FeatureBitset &featureBits = - Decoder->getSubtargetInfo().getFeatureBits(); - - bool hasV7Ops = featureBits[ARM::HasV7Ops]; - - if (Rt == 15) { - switch (Inst.getOpcode()) { - case ARM::t2LDRBpci: - case ARM::t2LDRHpci: - Inst.setOpcode(ARM::t2PLDpci); - break; - case ARM::t2LDRSBpci: - Inst.setOpcode(ARM::t2PLIpci); - break; - case ARM::t2LDRSHpci: - return MCDisassembler::Fail; - default: - break; - } - } - - 
switch(Inst.getOpcode()) { - case ARM::t2PLDpci: - break; - case ARM::t2PLIpci: - if (!hasV7Ops) - return MCDisassembler::Fail; - break; - default: - if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) - return MCDisassembler::Fail; - } - - if (!U) { - // Special case for #-0. - if (imm == 0) - imm = INT32_MIN; - else - imm = -imm; - } - Inst.addOperand(MCOperand::createImm(imm)); - - return S; -} - static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { if (Val == 0) @@ -4655,33 +3662,6 @@ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - - unsigned Rn = fieldFromInstruction(Val, 13, 4); - unsigned imm = fieldFromInstruction(Val, 0, 12); - - // Thumb stores cannot use PC as dest register. - switch (Inst.getOpcode()) { - case ARM::t2STRi12: - case ARM::t2STRBi12: - case ARM::t2STRHi12: - if (Rn == 15) - return MCDisassembler::Fail; - break; - default: - break; - } - - if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(imm)); - - return S; -} - static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -4844,6 +3824,16 @@ static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder) { + if (Val & ~0xf) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createImm(Val)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -4951,16 +3941,6 @@ static DecodeStatus 
DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder) { - if (Val & ~0xf) - return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::createImm(Val)); - return MCDisassembler::Success; -} - static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -6475,49 +5455,6 @@ static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } -static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 7) - return MCDisassembler::Fail; - - unsigned Register = QPRDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static const MCPhysReg QQPRDecoderTable[] = { - ARM::Q0_Q1, ARM::Q1_Q2, ARM::Q2_Q3, ARM::Q3_Q4, - ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7 -}; - -static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 6) - return MCDisassembler::Fail; - - unsigned Register = QQPRDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static const MCPhysReg QQQQPRDecoderTable[] = { - ARM::Q0_Q1_Q2_Q3, ARM::Q1_Q2_Q3_Q4, ARM::Q2_Q3_Q4_Q5, - ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7 -}; - -static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 4) - return MCDisassembler::Fail; - - unsigned Register = QQQQPRDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -7069,3 +6006,547 @@ static 
DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn, return S; } + +#include "ARMGenDisassemblerTables.inc" + +// Post-decoding checks +static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size, + uint64_t Address, raw_ostream &CS, + uint32_t Insn, + DecodeStatus Result) { + switch (MI.getOpcode()) { + case ARM::HVC: { + // HVC is undefined if condition = 0xf otherwise upredictable + // if condition != 0xe + uint32_t Cond = (Insn >> 28) & 0xF; + if (Cond == 0xF) + return MCDisassembler::Fail; + if (Cond != 0xE) + return MCDisassembler::SoftFail; + return Result; + } + case ARM::t2ADDri: + case ARM::t2ADDri12: + case ARM::t2ADDrr: + case ARM::t2ADDrs: + case ARM::t2SUBri: + case ARM::t2SUBri12: + case ARM::t2SUBrr: + case ARM::t2SUBrs: + if (MI.getOperand(0).getReg() == ARM::SP && + MI.getOperand(1).getReg() != ARM::SP) + return MCDisassembler::SoftFail; + return Result; + default: return Result; + } +} + +uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes, + uint64_t Address) const { + // In Arm state, instructions are always 4 bytes wide, so there's no + // point in skipping any smaller number of bytes if an instruction + // can't be decoded. + if (!STI.hasFeature(ARM::ModeThumb)) + return 4; + + // In a Thumb instruction stream, a halfword is a standalone 2-byte + // instruction if and only if its value is less than 0xE800. + // Otherwise, it's the first halfword of a 4-byte instruction. + // + // So, if we can see the upcoming halfword, we can judge on that + // basis, and maybe skip a whole 4-byte instruction that we don't + // know how to decode, without accidentally trying to interpret its + // second half as something else. + // + // If we don't have the instruction data available, we just have to + // recommend skipping the minimum sensible distance, which is 2 + // bytes. 
+ if (Bytes.size() < 2) + return 2; + + uint16_t Insn16 = llvm::support::endian::read<uint16_t>( + Bytes.data(), InstructionEndianness); + return Insn16 < 0xE800 ? 2 : 4; +} + +DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &CS) const { + if (STI.hasFeature(ARM::ModeThumb)) + return getThumbInstruction(MI, Size, Bytes, Address, CS); + return getARMInstruction(MI, Size, Bytes, Address, CS); +} + +DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &CS) const { + CommentStream = &CS; + + assert(!STI.hasFeature(ARM::ModeThumb) && + "Asked to disassemble an ARM instruction but Subtarget is in Thumb " + "mode!"); + + // We want to read exactly 4 bytes of data. + if (Bytes.size() < 4) { + Size = 0; + return MCDisassembler::Fail; + } + + // Encoded as a 32-bit word in the stream. + uint32_t Insn = llvm::support::endian::read<uint32_t>(Bytes.data(), + InstructionEndianness); + + // Calling the auto-generated decoder function. + DecodeStatus Result = + decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result); + } + + struct DecodeTable { + const uint8_t *P; + bool DecodePred; + }; + + const DecodeTable Tables[] = { + {DecoderTableVFP32, false}, {DecoderTableVFPV832, false}, + {DecoderTableNEONData32, true}, {DecoderTableNEONLoadStore32, true}, + {DecoderTableNEONDup32, true}, {DecoderTablev8NEON32, false}, + {DecoderTablev8Crypto32, false}, + }; + + for (auto Table : Tables) { + Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + // Add a fake predicate operand, because we share these instruction + // definitions with Thumb2 where these instructions are predicable. 
+ if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this)) + return MCDisassembler::Fail; + return Result; + } + } + + Result = + decodeInstruction(DecoderTableCoProc32, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result); + } + + Size = 4; + return MCDisassembler::Fail; +} + +// Thumb1 instructions don't have explicit S bits. Rather, they +// implicitly set CPSR. Since it's not represented in the encoding, the +// auto-generated decoder won't inject the CPSR operand. We need to fix +// that as a post-pass. +void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const { + const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); + MCInst::iterator I = MI.begin(); + for (unsigned i = 0; i < MCID.NumOperands; ++i, ++I) { + if (I == MI.end()) break; + if (MCID.operands()[i].isOptionalDef() && + MCID.operands()[i].RegClass == ARM::CCRRegClassID) { + if (i > 0 && MCID.operands()[i - 1].isPredicate()) + continue; + MI.insert(I, + MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR)); + return; + } + } + + MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR)); +} + +bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const { + const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); + for (unsigned i = 0; i < MCID.NumOperands; ++i) { + if (ARM::isVpred(MCID.operands()[i].OperandType)) + return true; + } + return false; +} + +// Most Thumb instructions don't have explicit predicates in the +// encoding, but rather get their predicates from IT context. We need +// to fix up the predicate operands using this context information as a +// post-pass. +MCDisassembler::DecodeStatus +ARMDisassembler::AddThumbPredicate(MCInst &MI) const { + MCDisassembler::DecodeStatus S = Success; + + const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits(); + + // A few instructions actually have predicates encoded in them. 
Don't + // try to overwrite it if we're seeing one of those. + switch (MI.getOpcode()) { + case ARM::tBcc: + case ARM::t2Bcc: + case ARM::tCBZ: + case ARM::tCBNZ: + case ARM::tCPS: + case ARM::t2CPS3p: + case ARM::t2CPS2p: + case ARM::t2CPS1p: + case ARM::t2CSEL: + case ARM::t2CSINC: + case ARM::t2CSINV: + case ARM::t2CSNEG: + case ARM::tMOVSr: + case ARM::tSETEND: + // Some instructions (mostly conditional branches) are not + // allowed in IT blocks. + if (ITBlock.instrInITBlock()) + S = SoftFail; + else + return Success; + break; + case ARM::t2HINT: + if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0) + S = SoftFail; + break; + case ARM::tB: + case ARM::t2B: + case ARM::t2TBB: + case ARM::t2TBH: + // Some instructions (mostly unconditional branches) can + // only appears at the end of, or outside of, an IT. + if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock()) + S = SoftFail; + break; + default: + break; + } + + // Warn on non-VPT predicable instruction in a VPT block and a VPT + // predicable instruction in an IT block + if ((!isVectorPredicable(MI) && VPTBlock.instrInVPTBlock()) || + (isVectorPredicable(MI) && ITBlock.instrInITBlock())) + S = SoftFail; + + // If we're in an IT/VPT block, base the predicate on that. Otherwise, + // assume a predicate of AL. 
+ unsigned CC = ARMCC::AL; + unsigned VCC = ARMVCC::None; + if (ITBlock.instrInITBlock()) { + CC = ITBlock.getITCC(); + ITBlock.advanceITState(); + } else if (VPTBlock.instrInVPTBlock()) { + VCC = VPTBlock.getVPTPred(); + VPTBlock.advanceVPTState(); + } + + const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); + + MCInst::iterator CCI = MI.begin(); + for (unsigned i = 0; i < MCID.NumOperands; ++i, ++CCI) { + if (MCID.operands()[i].isPredicate() || CCI == MI.end()) + break; + } + + if (MCID.isPredicable()) { + CCI = MI.insert(CCI, MCOperand::createImm(CC)); + ++CCI; + if (CC == ARMCC::AL) + MI.insert(CCI, MCOperand::createReg(ARM::NoRegister)); + else + MI.insert(CCI, MCOperand::createReg(ARM::CPSR)); + } else if (CC != ARMCC::AL) { + Check(S, SoftFail); + } + + MCInst::iterator VCCI = MI.begin(); + unsigned VCCPos; + for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) { + if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end()) + break; + } + + if (isVectorPredicable(MI)) { + VCCI = MI.insert(VCCI, MCOperand::createImm(VCC)); + ++VCCI; + if (VCC == ARMVCC::None) + VCCI = MI.insert(VCCI, MCOperand::createReg(0)); + else + VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0)); + ++VCCI; + VCCI = MI.insert(VCCI, MCOperand::createReg(0)); + ++VCCI; + if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) { + int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO); + assert(TiedOp >= 0 && + "Inactive register in vpred_r is not tied to an output!"); + // Copy the operand to ensure it's not invalidated when MI grows. + MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp))); + } + } else if (VCC != ARMVCC::None) { + Check(S, SoftFail); + } + + return S; +} + +// Thumb VFP instructions are a special case. Because we share their +// encodings between ARM and Thumb modes, and they are predicable in ARM +// mode, the auto-generated decoder will give them an (incorrect) +// predicate operand. 
We need to rewrite these operands based on the IT +// context as a post-pass. +void ARMDisassembler::UpdateThumbVFPPredicate( + DecodeStatus &S, MCInst &MI) const { + unsigned CC; + CC = ITBlock.getITCC(); + if (CC == 0xF) + CC = ARMCC::AL; + if (ITBlock.instrInITBlock()) + ITBlock.advanceITState(); + else if (VPTBlock.instrInVPTBlock()) { + CC = VPTBlock.getVPTPred(); + VPTBlock.advanceVPTState(); + } + + const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); + ArrayRef<MCOperandInfo> OpInfo = MCID.operands(); + MCInst::iterator I = MI.begin(); + unsigned short NumOps = MCID.NumOperands; + for (unsigned i = 0; i < NumOps; ++i, ++I) { + if (OpInfo[i].isPredicate() ) { + if (CC != ARMCC::AL && !MCID.isPredicable()) + Check(S, SoftFail); + I->setImm(CC); + ++I; + if (CC == ARMCC::AL) + I->setReg(ARM::NoRegister); + else + I->setReg(ARM::CPSR); + return; + } + } +} + +DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &CS) const { + CommentStream = &CS; + + assert(STI.hasFeature(ARM::ModeThumb) && + "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!"); + + // We want to read exactly 2 bytes of data. 
+ if (Bytes.size() < 2) { + Size = 0; + return MCDisassembler::Fail; + } + + uint16_t Insn16 = llvm::support::endian::read<uint16_t>( + Bytes.data(), InstructionEndianness); + DecodeStatus Result = + decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + + Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this, + STI); + if (Result) { + Size = 2; + bool InITBlock = ITBlock.instrInITBlock(); + Check(Result, AddThumbPredicate(MI)); + AddThumb1SBit(MI, InITBlock); + return Result; + } + + Result = + decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + + // Nested IT blocks are UNPREDICTABLE. Must be checked before we add + // the Thumb predicate. + if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock()) + Result = MCDisassembler::SoftFail; + + Check(Result, AddThumbPredicate(MI)); + + // If we find an IT instruction, we need to parse its condition + // code and mask operands so that we can apply them correctly + // to the subsequent instructions. + if (MI.getOpcode() == ARM::t2IT) { + unsigned Firstcond = MI.getOperand(0).getImm(); + unsigned Mask = MI.getOperand(1).getImm(); + ITBlock.setITState(Firstcond, Mask); + + // An IT instruction that would give a 'NV' predicate is unpredictable. + if (Firstcond == ARMCC::AL && !isPowerOf2_32(Mask)) + CS << "unpredictable IT predicate sequence"; + } + + return Result; + } + + // We want to read exactly 4 bytes of data. 
+ if (Bytes.size() < 4) { + Size = 0; + return MCDisassembler::Fail; + } + + uint32_t Insn32 = + (uint32_t(Insn16) << 16) | llvm::support::endian::read<uint16_t>( + Bytes.data() + 2, InstructionEndianness); + + Result = + decodeInstruction(DecoderTableMVE32, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + + // Nested VPT blocks are UNPREDICTABLE. Must be checked before we add + // the VPT predicate. + if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock()) + Result = MCDisassembler::SoftFail; + + Check(Result, AddThumbPredicate(MI)); + + if (isVPTOpcode(MI.getOpcode())) { + unsigned Mask = MI.getOperand(0).getImm(); + VPTBlock.setVPTState(Mask); + } + + return Result; + } + + Result = + decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + bool InITBlock = ITBlock.instrInITBlock(); + Check(Result, AddThumbPredicate(MI)); + AddThumb1SBit(MI, InITBlock); + return Result; + } + + Result = + decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result); + } + + if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { + Result = + decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + UpdateThumbVFPPredicate(Result, MI); + return Result; + } + } + + Result = + decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { + Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this, + STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + } + + if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) { + uint32_t 
NEONLdStInsn = Insn32; + NEONLdStInsn &= 0xF0FFFFFF; + NEONLdStInsn |= 0x04000000; + Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + } + + if (fieldFromInstruction(Insn32, 24, 4) == 0xF) { + uint32_t NEONDataInsn = Insn32; + NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24 + NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 + NEONDataInsn |= 0x12000000; // Set bits 28 and 25 + Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + + uint32_t NEONCryptoInsn = Insn32; + NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24 + NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 + NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25 + Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + uint32_t NEONv8Insn = Insn32; + NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26 + Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4); + const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI) + ? DecoderTableThumb2CDE32 + : DecoderTableThumb2CoProc32; + Result = + decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + + // Advance IT state to prevent next instruction inheriting + // the wrong IT state. 
+ if (ITBlock.instrInITBlock()) + ITBlock.advanceITState(); + Size = 0; + return MCDisassembler::Fail; +} + +static MCDisassembler *createARMDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new ARMDisassembler(STI, Ctx, T.createMCInstrInfo()); +} + +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeARMDisassembler() { + TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(), + createARMDisassembler); + TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(), + createARMDisassembler); + TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(), + createARMDisassembler); + TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(), + createARMDisassembler); +} diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index bb07d79c9374..50f4042102bf 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -457,5 +457,4 @@ Pass *llvm::createMVETailPredicationPass() { char MVETailPredication::ID = 0; -INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false) -INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false) +INITIALIZE_PASS(MVETailPredication, DEBUG_TYPE, DESC, false, false) diff --git a/llvm/lib/Target/ARM/README.txt b/llvm/lib/Target/ARM/README.txt index def67cfae727..ff84e07fa084 100644 --- a/llvm/lib/Target/ARM/README.txt +++ b/llvm/lib/Target/ARM/README.txt @@ -697,22 +697,6 @@ target-neutral one. //===---------------------------------------------------------------------===// -Optimize unnecessary checks for zero with __builtin_clz/ctz. Those builtins -are specified to be undefined at zero, so portable code must check for zero -and handle it as a special case. That is unnecessary on ARM where those -operations are implemented in a way that is well-defined for zero. For -example: - -int f(int x) { return x ? 
__builtin_clz(x) : sizeof(int)*8; } - -should just be implemented with a CLZ instruction. Since there are other -targets, e.g., PPC, that share this behavior, it would be best to implement -this in a target-independent way: we should probably fold that (when using -"undefined at zero" semantics) to set the "defined at zero" bit and have -the code generator expand out the right code. - -//===---------------------------------------------------------------------===// - Clean up the test/MC/ARM files to have more robust register choices. R0 should not be used as a register operand in the assembler tests as it's then diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 8b254fafc438..e91441b12fe6 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -263,11 +263,14 @@ void Thumb2InstrInfo::expandLoadStackGuard( const auto *GV = cast<GlobalValue>((*MI->memoperands_begin())->getValue()); const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>(); + bool IsPIC = MF.getTarget().isPositionIndependent(); if (Subtarget.isTargetELF() && !GV->isDSOLocal()) expandLoadStackGuardBase(MI, ARM::t2LDRLIT_ga_pcrel, ARM::t2LDRi12); else if (!Subtarget.useMovt()) - expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::t2LDRi12); - else if (MF.getTarget().isPositionIndependent()) + expandLoadStackGuardBase( + MI, IsPIC ? 
ARM::t2LDRLIT_ga_pcrel : ARM::tLDRLIT_ga_abs, + ARM::t2LDRi12); + else if (IsPIC) expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12); else expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12); diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp index 0fb33cdcb62d..ad8f7d801843 100644 --- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp @@ -245,7 +245,7 @@ void AVRAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) { bool AVRAsmPrinter::doFinalization(Module &M) { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const AVRTargetMachine &TM = (const AVRTargetMachine &)MMI->getTarget(); - const AVRSubtarget *SubTM = (const AVRSubtarget *)TM.getSubtargetImpl(); + const AVRSubtarget *SubTM = TM.getSubtargetImpl(); bool NeedsCopyData = false; bool NeedsClearBSS = false; @@ -294,7 +294,7 @@ bool AVRAsmPrinter::doFinalization(Module &M) { void AVRAsmPrinter::emitStartOfAsmFile(Module &M) { const AVRTargetMachine &TM = (const AVRTargetMachine &)MMI->getTarget(); - const AVRSubtarget *SubTM = (const AVRSubtarget *)TM.getSubtargetImpl(); + const AVRSubtarget *SubTM = TM.getSubtargetImpl(); if (!SubTM) return; diff --git a/llvm/lib/Target/AVR/AVRInstrFormats.td b/llvm/lib/Target/AVR/AVRInstrFormats.td index e1e65b56370c..72ea3bc1f460 100644 --- a/llvm/lib/Target/AVR/AVRInstrFormats.td +++ b/llvm/lib/Target/AVR/AVRInstrFormats.td @@ -79,6 +79,7 @@ class FRdRr<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr, //===----------------------------------------------------------------------===// class FZRd<bits<3> t, dag outs, dag ins, string asmstr, list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> { + bits<0> z; bits<5> rd; let Inst{15 - 12} = 0b1001; @@ -127,8 +128,6 @@ class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr, let Inst{11 - 9} = f{6 - 4}; let Inst{8 - 4} = rd; let Inst{3 - 0} = f{3 - 0}; - - let DecoderMethod = 
"decodeFRd"; } //===----------------------------------------------------------------------===// @@ -200,57 +199,64 @@ class FSTLD<bit type, bits<2> mode, dag outs, dag ins, string asmstr, //===---------------------------------------------------------------------===// class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> { + bits<0> z; bits<5> rd; - let Inst{15 - 12} = 0b1001; - - let Inst{11 - 9} = 0b000; - let Inst{8} = rd{4}; - - let Inst{7 - 4} = rd{3 - 0}; - + let Inst{15 - 9} = 0b1001000; + let Inst{8 - 4} = rd; let Inst{3 - 2} = 0b01; let Inst{1} = e; let Inst{0} = p; - - let DecoderMethod = "decodeFLPMX"; } //===----------------------------------------------------------------------===// // MOVWRdRr special encoding: <|0000|0001|dddd|rrrr|> // d = destination = 4 bits // r = source = 4 bits -// (Only accepts even registers) +// (Only accepts register pairs) //===----------------------------------------------------------------------===// class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> { - bits<5> rd; - bits<5> rr; + bits<4> rd; + bits<4> rr; let Inst{15 - 8} = 0b00000001; - let Inst{7 - 4} = rd{4 - 1}; - let Inst{3 - 0} = rr{4 - 1}; - - let DecoderMethod = "decodeFMOVWRdRr"; + let Inst{7 - 4} = rd; + let Inst{3 - 0} = rr; } //===----------------------------------------------------------------------===// -// MULSrr special encoding: <|0000|0010|dddd|rrrr|> +// MULS special encoding: <|0000|0010|dddd|rrrr|> // d = multiplicand = 4 bits // r = multiplier = 4 bits // (Only accepts r16-r31) //===----------------------------------------------------------------------===// -class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern> +class FMULSRdRr<dag outs, dag ins, string asmstr, list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> { - bits<5> rd; // accept 5 bits but only encode the lower 4 - bits<5> rr; // accept 5 
bits but only encode the lower 4 + bits<4> rd; + bits<4> rr; - let Inst{15 - 9} = 0b0000001; - let Inst{8} = f; - let Inst{7 - 4} = rd{3 - 0}; - let Inst{3 - 0} = rr{3 - 0}; + let Inst{15 - 8} = 0b00000010; + let Inst{7 - 4} = rd; + let Inst{3 - 0} = rr; +} - let DecoderMethod = "decodeFMUL2RdRr"; +//===----------------------------------------------------------------------===// +// MULSU special encoding: <|0000|0011|0ddd|0rrr|> +// d = multiplicand = 3 bits +// r = multiplier = 3 bits +// (Only accepts r16-r23) +//===----------------------------------------------------------------------===// +class FMULSURdRr<dag outs, dag ins, string asmstr, list<dag> pattern> + : AVRInst16<outs, ins, asmstr, pattern> { + bits<3> rd; + bits<3> rr; + + let Inst{15 - 8} = 0b00000011; + let Inst{7} = 0; + let Inst{6 - 4} = rd; + let Inst{3} = 0; + let Inst{2 - 0} = rr; } // Special encoding for the FMUL family of instructions. @@ -273,8 +279,6 @@ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{6 - 4} = rd; let Inst{3} = f{0}; let Inst{2 - 0} = rr; - - let DecoderMethod = "decodeFFMULRdRr"; } //===----------------------------------------------------------------------===// @@ -286,16 +290,14 @@ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern> //===----------------------------------------------------------------------===// class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> { - bits<5> rd; // accept 5 bits but only encode bits 1 and 2 + bits<2> rd; bits<6> k; let Inst{15 - 9} = 0b1001011; let Inst{8} = f; let Inst{7 - 6} = k{5 - 4}; - let Inst{5 - 4} = rd{2 - 1}; + let Inst{5 - 4} = rd; let Inst{3 - 0} = k{3 - 0}; - - let DecoderMethod = "decodeFWRdK"; } //===----------------------------------------------------------------------===// @@ -313,8 +315,6 @@ class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{10 - 9} = A{5 - 4}; let 
Inst{8 - 4} = rd; let Inst{3 - 0} = A{3 - 0}; - - let DecoderMethod = "decodeFIORdA"; } //===----------------------------------------------------------------------===// @@ -332,8 +332,6 @@ class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{10 - 9} = A{5 - 4}; let Inst{8 - 4} = rr; let Inst{3 - 0} = A{3 - 0}; - - let DecoderMethod = "decodeFIOARr"; } //===----------------------------------------------------------------------===// @@ -348,17 +346,10 @@ class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern> bits<5> addr; bits<3> b; - let Inst{15 - 12} = 0b1001; - - let Inst{11 - 10} = 0b10; + let Inst{15 - 10} = 0b100110; let Inst{9 - 8} = t; - - let Inst{7 - 4} = addr{4 - 1}; - - let Inst{3} = addr{0}; + let Inst{7 - 3} = addr; let Inst{2 - 0} = b{2 - 0}; - - let DecoderMethod = "decodeFIOBIT"; } //===----------------------------------------------------------------------===// @@ -417,8 +408,6 @@ class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr, let Inst{10} = f; let Inst{9 - 3} = k; let Inst{2 - 0} = s; - - let DecoderMethod = "decodeCondBranch"; } //===----------------------------------------------------------------------===// @@ -442,8 +431,6 @@ class FBRk<bit f, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{15 - 13} = 0b110; let Inst{12} = f; let Inst{11 - 0} = k; - - let DecoderMethod = "decodeFBRk"; } //===----------------------------------------------------------------------===// @@ -537,14 +524,8 @@ class FSK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{11} = 0; let Inst{10} = f; - let Inst{9 - 8} = k{6 - 5}; - - let Inst{7 - 4} = k{4 - 1}; - - let Inst{3} = k{0}; + let Inst{9 - 3} = k; let Inst{2 - 0} = s; - - let DecoderMethod = "decodeCondBranch"; } class ExtensionPseudo<dag outs, dag ins, string asmstr, list<dag> pattern> diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index 601068bf1793..ce9908597dca 100644 --- 
a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -29,8 +29,8 @@ namespace llvm { -AVRInstrInfo::AVRInstrInfo(AVRSubtarget &STI) - : AVRGenInstrInfo(AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(), +AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI) + : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(), STI(STI) {} void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h index 1c92f173d254..759aea201096 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.h +++ b/llvm/lib/Target/AVR/AVRInstrInfo.h @@ -65,7 +65,7 @@ enum TOF { /// Utilities related to the AVR instruction set. class AVRInstrInfo : public AVRGenInstrInfo { public: - explicit AVRInstrInfo(AVRSubtarget &STI); + explicit AVRInstrInfo(const AVRSubtarget &STI); const AVRRegisterInfo &getRegisterInfo() const { return RI; } const MCInstrDesc &getBrCond(AVRCC::CondCodes CC) const; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index 958e1383acef..02fb905f5fb6 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -204,16 +204,19 @@ def memspi : Operand<iPTR> { def relbrtarget_7 : Operand<OtherVT> { let PrintMethod = "printPCRelImm"; let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_7_pcrel>"; + let DecoderMethod = "decodeRelCondBrTarget7"; } def brtarget_13 : Operand<OtherVT> { let PrintMethod = "printPCRelImm"; let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>"; + let DecoderMethod = "decodeRelCondBrTarget13"; } def rcalltarget_13 : Operand<i16> { let PrintMethod = "printPCRelImm"; let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>"; + let DecoderMethod = "decodeRelCondBrTarget13"; } // The target of a 22 or 16-bit call/jmp instruction. 
@@ -492,13 +495,13 @@ let isCommutable = 1, Defs = [R1, R0, SREG] in { "mul\t$rd, $rr", []>, Requires<[SupportsMultiplication]>; - def MULSRdRr : FMUL2RdRr<0, (outs), (ins LD8:$rd, LD8:$rr), + def MULSRdRr : FMULSRdRr<(outs), (ins LD8:$rd, LD8:$rr), "muls\t$rd, $rr", []>, Requires<[SupportsMultiplication]>; } - def MULSURdRr : FMUL2RdRr<1, (outs), (ins LD8lo:$rd, LD8lo:$rr), - "mulsu\t$rd, $rr", []>, + def MULSURdRr : FMULSURdRr<(outs), (ins LD8lo:$rd, LD8lo:$rr), + "mulsu\t$rd, $rr", []>, Requires<[SupportsMultiplication]>; def FMUL : FFMULRdRr<0b01, (outs), (ins LD8lo:$rd, LD8lo:$rr), @@ -1230,7 +1233,9 @@ let Uses = [R1, R0] in { let Defs = [R31R30] in def SPMZPi : F16<0b1001010111111000, (outs), (ins ZREG:$z), "spm $z+", []>, - Requires<[HasSPMX]>; + Requires<[HasSPMX]> { + bits<0> z; + } } // Read data from IO location operations. diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td index 21b4aedea44c..182f92c684dc 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.td +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td @@ -68,33 +68,37 @@ def R31 : AVRReg<31, "r31", [], ["zh"]>, DwarfRegNum<[31]>; def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>; def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>; +// 16 bit GPR pairs. let SubRegIndices = [sub_lo, sub_hi], CoveredBySubRegs = 1 in { - // 16 bit GPR pairs. - def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>; + // The value 16 for the encoding is arbitrary. SP register is not encoded + // into instructions, they use it implicitly depending on the opcode. + def SP : AVRReg<16, "SP", [SPL, SPH]>, DwarfRegNum<[32]>; // The pointer registers (X,Y,Z) are a special case because they // are printed as a `high:low` pair when a DREG is expected, // but printed using `X`, `Y`, `Z` when a pointer register is expected. + // DREG registers are only used in ADIW, SBIW and MOVW instructions. 
let RegAltNameIndices = [ptr] in { - def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>; - def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>; - def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>; + def R31R30 : AVRReg<15, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>; + def R29R28 : AVRReg<14, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>; + def R27R26 : AVRReg<13, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>; } - def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>; - def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>; - def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>; - def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>; - def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>; - def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>; - def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>; - def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>; - def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>; - def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>; - def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>; - def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>; + def R25R24 : AVRReg<12, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>; + def R23R22 : AVRReg<11, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>; + def R21R20 : AVRReg<10, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>; + def R19R18 : AVRReg<9, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>; + def R17R16 : AVRReg<8, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>; + def R15R14 : AVRReg<7, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>; + def R13R12 : AVRReg<6, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>; + def R11R10 : AVRReg<5, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>; + def R9R8 : AVRReg<4, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>; + def R7R6 : AVRReg<3, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>; + def R5R4 : AVRReg<2, "r5:r4", [R4, 
R5]>, DwarfRegNum<[4]>; + def R3R2 : AVRReg<1, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>; def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>; - // Pseudo registers for unaligned i16 + // Pseudo registers for unaligned i16. These are only used in pseudo + // instructions, so encoding values are arbitrary. def R26R25 : AVRReg<25, "r26:r25", [R25, R26]>, DwarfRegNum<[25]>; def R24R23 : AVRReg<23, "r24:r23", [R23, R24]>, DwarfRegNum<[23]>; def R22R21 : AVRReg<21, "r22:r21", [R21, R22]>, DwarfRegNum<[21]>; diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index 4e00b192b875..a8650146e988 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -38,7 +38,6 @@ using namespace llvm; namespace { /// Parses AVR assembly from a stream. class AVRAsmParser : public MCTargetAsmParser { - const MCSubtargetInfo &STI; MCAsmParser &Parser; const MCRegisterInfo *MRI; const std::string GENERATE_STUBS = "gs"; @@ -93,7 +92,7 @@ class AVRAsmParser : public MCTargetAsmParser { public: AVRAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) { + : MCTargetAsmParser(Options, STI, MII), Parser(Parser) { MCAsmParserExtension::Initialize(Parser); MRI = getContext().getRegisterInfo(); @@ -318,7 +317,7 @@ bool AVRAsmParser::missingFeature(llvm::SMLoc const &Loc, bool AVRAsmParser::emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const { Inst.setLoc(Loc); - Out.emitInstruction(Inst, STI); + Out.emitInstruction(Inst, *STI); return false; } @@ -411,7 +410,7 @@ bool AVRAsmParser::tryParseRegisterOperand(OperandVector &Operands) { // Reject R0~R15 on avrtiny. 
if (AVR::R0 <= Reg && Reg <= AVR::R15 && - STI.hasFeature(AVR::FeatureTinyEncoding)) + STI->hasFeature(AVR::FeatureTinyEncoding)) return Error(Parser.getTok().getLoc(), "invalid register on avrtiny"); AsmToken const &T = Parser.getTok(); @@ -758,7 +757,7 @@ unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // Reject R0~R15 on avrtiny. if (0 <= RegNum && RegNum <= 15 && - STI.hasFeature(AVR::FeatureTinyEncoding)) + STI->hasFeature(AVR::FeatureTinyEncoding)) return Match_InvalidRegisterOnTiny; std::ostringstream RegName; diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 948588cb9a75..3a840a371497 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -61,7 +61,7 @@ LLVMInitializeAVRDisassembler() { createAVRDisassembler); } -static const uint16_t GPRDecoderTable[] = { +static constexpr MCRegister GPRDecoderTable[] = { AVR::R0, AVR::R1, AVR::R2, AVR::R3, AVR::R4, AVR::R5, AVR::R6, AVR::R7, AVR::R8, AVR::R9, AVR::R10, AVR::R11, AVR::R12, AVR::R13, AVR::R14, AVR::R15, AVR::R16, AVR::R17, AVR::R18, AVR::R19, AVR::R20, @@ -69,6 +69,13 @@ static const uint16_t GPRDecoderTable[] = { AVR::R28, AVR::R29, AVR::R30, AVR::R31, }; +static constexpr MCRegister GPRPairDecoderTable[] = { + AVR::R1R0, AVR::R3R2, AVR::R5R4, AVR::R7R6, + AVR::R9R8, AVR::R11R10, AVR::R13R12, AVR::R15R14, + AVR::R17R16, AVR::R19R18, AVR::R21R20, AVR::R23R22, + AVR::R25R24, AVR::R27R26, AVR::R29R28, AVR::R31R30, +}; + static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder) { @@ -83,96 +90,41 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder) { - if (RegNo > 15) - return MCDisassembler::Fail; - - unsigned Register = 
GPRDecoderTable[RegNo + 16]; - Inst.addOperand(MCOperand::createReg(Register)); + assert(isUInt<4>(RegNo)); + // Only r16...r31 are legal. + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[16 + RegNo])); return MCDisassembler::Success; } -static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeMemri(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeFBRk(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeCondBranch(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -#include "AVRGenDisassemblerTables.inc" +static DecodeStatus DecodeLD8loRegisterClass(MCInst 
&Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + assert(isUInt<3>(RegNo)); + // Only r16...r23 are legal. + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[16 + RegNo])); + return MCDisassembler::Success; +} -static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder) { - unsigned addr = 0; - addr |= fieldFromInstruction(Insn, 0, 4); - addr |= fieldFromInstruction(Insn, 9, 2) << 4; - unsigned reg = fieldFromInstruction(Insn, 4, 5); - Inst.addOperand(MCOperand::createImm(addr)); - if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; +static DecodeStatus DecodeDREGSRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + assert(isUInt<4>(RegNo)); + Inst.addOperand(MCOperand::createReg(GPRPairDecoderTable[RegNo])); return MCDisassembler::Success; } -static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder) { - unsigned addr = 0; - addr |= fieldFromInstruction(Insn, 0, 4); - addr |= fieldFromInstruction(Insn, 9, 2) << 4; - unsigned reg = fieldFromInstruction(Insn, 4, 5); - if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(addr)); +static DecodeStatus DecodeIWREGSRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + assert(isUInt<2>(RegNo)); + // Only AVR::R25R24, AVR::R27R26, AVR::R29R28, AVR::R31R30 are legal. 
+ Inst.addOperand(MCOperand::createReg(GPRPairDecoderTable[12 + RegNo])); return MCDisassembler::Success; } -static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder) { - unsigned addr = fieldFromInstruction(Insn, 3, 5); - unsigned b = fieldFromInstruction(Insn, 0, 3); - Inst.addOperand(MCOperand::createImm(addr)); - Inst.addOperand(MCOperand::createImm(b)); +static DecodeStatus DecodeZREGRegisterClass(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createReg(AVR::R31R30)); return MCDisassembler::Success; } @@ -185,78 +137,19 @@ static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Field, return MCDisassembler::Success; } -static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder) { - unsigned d = fieldFromInstruction(Insn, 4, 5); - if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - return MCDisassembler::Success; -} - -static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder) { - if (decodeFRd(Inst, Insn, Address, Decoder) == MCDisassembler::Fail) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createReg(AVR::R31R30)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned d = fieldFromInstruction(Insn, 4, 3) + 16; - unsigned r = fieldFromInstruction(Insn, 0, 3) + 16; - if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - return MCDisassembler::Success; -} - -static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned r = 
fieldFromInstruction(Insn, 4, 4) * 2; - unsigned d = fieldFromInstruction(Insn, 0, 4) * 2; - if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - return MCDisassembler::Success; -} - -static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder) { - unsigned d = fieldFromInstruction(Insn, 4, 2) * 2 + 24; // starts at r24:r25 - unsigned k = 0; - k |= fieldFromInstruction(Insn, 0, 4); - k |= fieldFromInstruction(Insn, 6, 2) << 4; - if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(k)); +static DecodeStatus decodeRelCondBrTarget7(MCInst &Inst, unsigned Field, + uint64_t Address, + const MCDisassembler *Decoder) { + // The legal range is [-128, 126] (in bytes). + Inst.addOperand(MCOperand::createImm(SignExtend32(Field, 7) * 2)); return MCDisassembler::Success; } -static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned rd = fieldFromInstruction(Insn, 4, 4) + 16; - unsigned rr = fieldFromInstruction(Insn, 0, 4) + 16; - if (DecodeGPR8RegisterClass(Inst, rd, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - if (DecodeGPR8RegisterClass(Inst, rr, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; +static DecodeStatus decodeRelCondBrTarget13(MCInst &Inst, unsigned Field, + uint64_t Address, + const MCDisassembler *Decoder) { + // The legal range is [-4096, 4094] (in bytes). 
+ Inst.addOperand(MCOperand::createImm(SignExtend32(Field, 12) * 2)); return MCDisassembler::Success; } @@ -277,59 +170,6 @@ static DecodeStatus decodeMemri(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Success; } -static DecodeStatus decodeFBRk(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder) { - // Decode the opcode. - switch (Insn & 0xf000) { - case 0xc000: - Inst.setOpcode(AVR::RJMPk); - break; - case 0xd000: - Inst.setOpcode(AVR::RCALLk); - break; - default: // Unknown relative branch instruction. - return MCDisassembler::Fail; - } - // Decode the relative offset. - int16_t Offset = ((int16_t)((Insn & 0xfff) << 4)) >> 3; - Inst.addOperand(MCOperand::createImm(Offset)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeCondBranch(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - // These 8 instructions are not defined as aliases of BRBS/BRBC. - DenseMap<unsigned, unsigned> brInsts = { - {0x000, AVR::BRLOk}, {0x400, AVR::BRSHk}, {0x001, AVR::BREQk}, - {0x401, AVR::BRNEk}, {0x002, AVR::BRMIk}, {0x402, AVR::BRPLk}, - {0x004, AVR::BRLTk}, {0x404, AVR::BRGEk}}; - - // Get the relative offset. - int16_t Offset = ((int16_t)((Insn & 0x3f8) << 6)) >> 8; - - // Search the instruction pattern. - auto NotAlias = [&Insn](const std::pair<unsigned, unsigned> &I) { - return (Insn & 0x407) != I.first; - }; - llvm::partition(brInsts, NotAlias); - auto It = llvm::partition_point(brInsts, NotAlias); - - // Decode the instruction. - if (It != brInsts.end()) { - // This instruction is not an alias of BRBC/BRBS. - Inst.setOpcode(It->second); - Inst.addOperand(MCOperand::createImm(Offset)); - } else { - // Fall back to an ordinary BRBS/BRBC. - Inst.setOpcode(Insn & 0x400 ? 
AVR::BRBCsk : AVR::BRBSsk); - Inst.addOperand(MCOperand::createImm(Insn & 7)); - Inst.addOperand(MCOperand::createImm(Offset)); - } - - return MCDisassembler::Success; -} - static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -435,6 +275,8 @@ static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, return MCDisassembler::Success; } +#include "AVRGenDisassemblerTables.inc" + static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address, uint64_t &Size, uint32_t &Insn) { if (Bytes.size() < 2) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp index 481219164a0f..5adffeed04bd 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp @@ -101,23 +101,6 @@ const char *AVRInstPrinter::getPrettyRegisterName(MCRegister Reg, void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperandInfo &MOI = this->MII.get(MI->getOpcode()).operands()[OpNo]; - if (MOI.RegClass == AVR::ZREGRegClassID) { - // Special case for the Z register, which sometimes doesn't have an operand - // in the MCInst. - O << "Z"; - return; - } - - if (OpNo >= MI->size()) { - // Not all operands are correctly disassembled at the moment. This means - // that some machine instructions won't have all the necessary operands - // set. - // To avoid asserting, print <unknown> instead until the necessary support - // has been implemented. 
- O << "<unknown>"; - return; - } - const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp index 352017e9b929..dadba52de462 100644 --- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -193,27 +193,6 @@ void BPFDAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; - case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = Node->getConstantOperandVal(1); - switch (IntNo) { - case Intrinsic::bpf_load_byte: - case Intrinsic::bpf_load_half: - case Intrinsic::bpf_load_word: { - SDLoc DL(Node); - SDValue Chain = Node->getOperand(0); - SDValue N1 = Node->getOperand(1); - SDValue Skb = Node->getOperand(2); - SDValue N3 = Node->getOperand(3); - - SDValue R6Reg = CurDAG->getRegister(BPF::R6, MVT::i64); - Chain = CurDAG->getCopyToReg(Chain, DL, R6Reg, Skb, SDValue()); - Node = CurDAG->UpdateNodeOperands(Node, Chain, N1, R6Reg, N3); - break; - } - } - break; - } - case ISD::FrameIndex: { int FI = cast<FrameIndexSDNode>(Node)->getIndex(); EVT VT = Node->getValueType(0); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 70bc163615f6..fb4efcfe8614 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -12,6 +12,7 @@ #include "BPFInstrInfo.h" #include "BPF.h" +#include "BPFSubtarget.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -25,8 +26,8 @@ using namespace llvm; -BPFInstrInfo::BPFInstrInfo() - : BPFGenInstrInfo(BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} +BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI) + : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h index 
d8bbad44e314..2359e43e483f 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.h +++ b/llvm/lib/Target/BPF/BPFInstrInfo.h @@ -20,12 +20,13 @@ #include "BPFGenInstrInfo.inc" namespace llvm { +class BPFSubtarget; class BPFInstrInfo : public BPFGenInstrInfo { const BPFRegisterInfo RI; public: - BPFInstrInfo(); + explicit BPFInstrInfo(const BPFSubtarget &STI); const BPFRegisterInfo &getRegisterInfo() const { return RI; } diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index b21f1a0eee3b..de7dae2c8ca6 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -1189,10 +1189,9 @@ let Defs = [R0, R1, R2, R3, R4, R5], Uses = [R6], hasSideEffects = 1, hasExtraDefRegAllocReq = 1, hasExtraSrcRegAllocReq = 1, mayLoad = 1 in { class LOAD_ABS<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode> : TYPE_LD_ST<BPF_ABS.Value, SizeOp.Value, - (outs), - (ins GPR:$skb, i64imm:$imm), + (outs), (ins i64imm:$imm), "r0 = *("#OpcodeStr#" *)skb[$imm]", - [(set R0, (OpNode GPR:$skb, i64immSExt32:$imm))]> { + [(set R0, (OpNode R6, i64immSExt32:$imm))]> { bits<32> imm; let Inst{31-0} = imm; @@ -1201,10 +1200,9 @@ class LOAD_ABS<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode> class LOAD_IND<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode> : TYPE_LD_ST<BPF_IND.Value, SizeOp.Value, - (outs), - (ins GPR:$skb, GPR:$val), + (outs), (ins GPR:$val), "r0 = *("#OpcodeStr#" *)skb[$val]", - [(set R0, (OpNode GPR:$skb, GPR:$val))]> { + [(set R0, (OpNode R6, GPR:$val))]> { bits<4> val; let Inst{55-52} = val; diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp index 4167547680b1..a7ecc39fad7b 100644 --- a/llvm/lib/Target/BPF/BPFSubtarget.cpp +++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp @@ -103,7 +103,7 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const 
TargetMachine &TM) : BPFGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), - FrameLowering(initializeSubtargetDependencies(CPU, FS)), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), FrameLowering(*this), TLInfo(TM, *this) { IsLittleEndian = TT.isLittleEndian(); diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index bed6bc98b167..ba4b48990c64 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -235,7 +235,7 @@ void BTFTypeEnum64::completeType(BTFDebug &BDebug) { BTFEnum.NameOff = BDebug.addString(Enum->getName()); uint64_t Value; if (Enum->isUnsigned()) - Value = static_cast<uint64_t>(Enum->getValue().getZExtValue()); + Value = Enum->getValue().getZExtValue(); else Value = static_cast<uint64_t>(Enum->getValue().getSExtValue()); BTFEnum.Val_Lo32 = Value; diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp index b5bb1c08c564..230cf3b0ddbe 100644 --- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp +++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp @@ -205,18 +205,6 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, Op.setImm(Make_64(Hi, Op.getImm())); break; } - case BPF::LD_ABS_B: - case BPF::LD_ABS_H: - case BPF::LD_ABS_W: - case BPF::LD_IND_B: - case BPF::LD_IND_H: - case BPF::LD_IND_W: { - auto Op = Instr.getOperand(0); - Instr.clear(); - Instr.addOperand(MCOperand::createReg(BPF::R6)); - Instr.addOperand(Op); - break; - } } return Result; diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats.td b/llvm/lib/Target/CSKY/CSKYInstrFormats.td index 5296d282c689..abf5cac0013d 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrFormats.td +++ b/llvm/lib/Target/CSKY/CSKYInstrFormats.td @@ -168,7 +168,9 @@ class I_16_RET<bits<5> sop, bits<5> pcode, string op, list<dag> pattern> // Instructions(3): cmpnei32, cmphsi32, cmplti32 class I_16_X<bits<5> sop, string op, Operand operand> : CSKY32Inst<AddrModeNone, 
0x3a, (outs CARRY:$ca), - (ins GPR:$rx, operand:$imm16), !strconcat(op, "\t$rx, $imm16"), []> { + (ins GPR:$rx, operand:$imm16), + !strconcat(op, "\t$rx, $imm16"), []> { + bits<0> ca; bits<16> imm16; bits<5> rx; let Inst{25 - 21} = sop; @@ -263,8 +265,9 @@ class I_12_PP<bits<5> sop, bits<5> pcode, dag outs, dag ins, string op> class I_5_ZX<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType, list<dag> pattern> : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), - (ins CARRY:$cond, GPR:$false, GPR:$rx, ImmType:$imm5), - !strconcat(op, "\t$rz, $rx, $imm5"), pattern> { + (ins CARRY:$cond, GPR:$false, GPR:$rx, ImmType:$imm5), + !strconcat(op, "\t$rz, $rx, $imm5"), pattern> { + bits<0> cond; bits<5> rz; bits<5> rx; bits<5> imm5; @@ -469,9 +472,10 @@ class I_5_XZ_UZ<bits<6> sop, bits<5> lsb, bits<5> msb, string op, int v> // Instructions(1): btsti32 class I_5_X<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType, list<dag> pattern> - : CSKY32Inst<AddrModeNone, 0x31, - (outs CARRY:$ca), (ins GPR:$rx, ImmType:$imm5), - !strconcat(op, "\t$rx, $imm5"), pattern> { + : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), + (ins GPR:$rx, ImmType:$imm5), + !strconcat(op, "\t$rx, $imm5"), pattern> { + bits<0> ca; bits<5> imm5; bits<5> rx; let Inst{25 - 21} = imm5; @@ -581,9 +585,9 @@ class R_XXZ<bits<6> sop, bits<5> pcode, dag outs, dag ins, string op, // Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | 00000[5] > // Instructions:(4) cmpne32, cmphs32, cmplt32, tst32 class R_YX<bits<6> sop, bits<5> pcode, string op> - : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), - (ins GPR:$rx, GPR:$ry), + : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins GPR:$rx, GPR:$ry), !strconcat(op, "\t$rx, $ry"), []> { + bits<0> ca; bits<5> ry; bits<5> rx; let Inst{25 - 21} = ry; @@ -642,8 +646,9 @@ class R_X<bits<6> sop, bits<5> pcode, dag outs, dag ins, string op, list<dag> pa // Format< OP[6] | 00000[5] | 00000[5] | SOP[6] | PCODE[5] | RZ[5] > // Instructions:(2) mvc32, mvcv32 class 
R_Z_1<bits<6> sop, bits<5> pcode, string op> - : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), - (ins CARRY:$ca), !strconcat(op, "\t$rz"), []> { + : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), (ins CARRY:$ca), + !strconcat(op, "\t$rz"), []> { + bits<0> ca; bits<5> rz; let Inst{25 - 21} = 0; let Inst{20 - 16} = 0; @@ -656,7 +661,8 @@ class R_Z_1<bits<6> sop, bits<5> pcode, string op> // Instructions:(2) clrf32, clrt32 class R_Z_2<bits<6> sop, bits<5> pcode, string op> : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), - (ins CARRY:$ca, GPR:$false), !strconcat(op, "\t$rz"), []> { + (ins CARRY:$ca, GPR:$false), !strconcat(op, "\t$rz"), []> { + bits<0> ca; bits<5> rz; let Inst{25 - 21} = rz; let Inst{20 - 16} = 0; diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td index ea0761d97545..5cd970d27d47 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td @@ -16,8 +16,9 @@ class J16<bits<5> sop, string opstr, dag ins> } class J16_B<bits<5> sop, string opstr> - : CSKY16Inst<AddrModeNone, (outs), (ins CARRY:$ca, br_symbol_16bit:$offset), - !strconcat(opstr, "\t$offset"), []> { + : CSKY16Inst<AddrModeNone, (outs), (ins CARRY:$ca, br_symbol_16bit:$offset), + !strconcat(opstr, "\t$offset"), []> { + bits<0> ca; bits<10> offset; let Inst{15} = 0; let Inst{14 - 10} = sop; @@ -66,6 +67,8 @@ class R16_XZ_BINOP_NOPat<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst< class R16_XZ_BINOP_C<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst< AddrModeNone, (outs sGPR:$rz, CARRY:$cout), (ins sGPR:$rZ, sGPR:$rx, CARRY:$cin), !strconcat(opstr, "\t$rz, $rx"), []> { + bits<0> cout; + bits<0> cin; bits<4> rz; bits<4> rx; let Inst{15, 14} = 0b01; @@ -101,9 +104,10 @@ class R16_Z_UNOP<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst< let Constraints = "$rz = $rx"; } -class R16_XY_CMP<bits<2> sop, string opstr> : CSKY16Inst< - AddrModeNone, (outs CARRY:$ca), (ins 
sGPR:$rx, sGPR:$ry), !strconcat(opstr, "\t$rx, $ry"), - []> { +class R16_XY_CMP<bits<2> sop, string opstr> + : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry), + !strconcat(opstr, "\t$rx, $ry"), []> { + bits<0> ca; bits<4> ry; bits<4> rx; let Inst{15, 14} = 0b01; @@ -145,9 +149,11 @@ class I16_Z_5<bits<3> sop, dag outs, dag ins,string opstr> let Inst{4 - 0} = imm5; } -class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst< - AddrModeNone, (outs CARRY:$ca), (ins mGPR:$rx, Immoperand:$imm5), - !strconcat(opstr, "\t$rx, $imm5"), []> { +class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> + : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), + (ins mGPR:$rx, Immoperand:$imm5), + !strconcat(opstr, "\t$rx, $imm5"), []> { + bits<0> ca; bits<3> rx; bits<5> imm5; let Inst{15, 14} = 0b00; @@ -158,9 +164,12 @@ class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst< let isCompare = 1; } -class I16_SP_IMM7<bits<3> sop, string opstr> : CSKY16Inst< - AddrModeNone, (outs GPRSP:$sp2), (ins GPRSP:$sp1, uimm7_2:$imm7), - !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> { +class I16_SP_IMM7<bits<3> sop, string opstr> + : CSKY16Inst<AddrModeNone, (outs GPRSP:$sp2), + (ins GPRSP:$sp1, uimm7_2:$imm7), + !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> { + bits<0> sp2; + bits<0> sp1; bits<7> imm7; let Inst{15, 14} = 0b00; let Inst{13 - 10} = 0b0101; diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td index 446670a4d0a9..a40874b054d8 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td +++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td @@ -91,15 +91,21 @@ multiclass FT_XZ<bits<6> sop, string op, PatFrag opnode> { } let vrz = 0, isCompare = 1 in { -class F_CMPXY<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype> - : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), !strconcat(op#op_su, "\t$vrx, $vry"), - []>; - -let vry = 0 
in{ -class F_CMPZX<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype> - : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrx"), - []>; -} + class F_CMPXY<bits<5> datatype, bits<6> sop, string op, string op_su, + RegisterOperand regtype> + : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), + (ins regtype:$vrx, regtype:$vry), + !strconcat(op#op_su, "\t$vrx, $vry"), []> { + bits<0> ca; + } + + let vry = 0 in + class F_CMPZX<bits<5> datatype, bits<6> sop, string op, string op_su, + RegisterOperand regtype> + : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx), + !strconcat(op#op_su, "\t$vrx"), []> { + bits<0> ca; + } } class F_XYZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype> diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td index 641ad623f140..bd7c554565cd 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td +++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td @@ -91,8 +91,9 @@ multiclass F2_XZ_SET_T<bits<6> sop, string op, string suffix = ""> { let vrz = 0, isCompare = 1 in class F2_CXY<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op> : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx, $vry"), - (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), - []>; + (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), []> { + bits<0> ca; +} multiclass F2_CXY_T<bits<6> sop, string op> { def _S : F2_CXY<0b00000, FPR32Op, sop, op#".32">; @@ -103,9 +104,10 @@ multiclass F2_CXY_T<bits<6> sop, string op> { let vrz = 0, vry = 0, isCompare = 1 in class F2_CX<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op> - : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx"), - (outs CARRY:$ca), (ins regtype:$vrx), - []>; + : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx"), (outs CARRY:$ca), + (ins regtype:$vrx), []> { + bits<0> ca; +} multiclass F2_CX_T<bits<6> sop, string op> { def _S : 
F2_CX<0b00000, FPR32Op, sop, op#".32">; @@ -183,7 +185,10 @@ class F2_LDSTR_D<bits<1> sop, string op, dag outs, dag ins> class F2_CXYZ<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op> : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx, $vry"), (outs regtype:$vrz), (ins CARRY:$ca, regtype:$vrx, regtype:$vry), - []>; + []> { + bits<0> ca; +} + multiclass F2_CXYZ_T<bits<6> sop, string op> { def _S : F2_CXYZ<0b00000, FPR32Op, sop, op#".32">; let Predicates = [HasFPUv3_DF] in diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index ccb3f16394d4..619a797be6dc 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -24,8 +24,9 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "CSKYGenInstrInfo.inc" -CSKYInstrInfo::CSKYInstrInfo(CSKYSubtarget &STI) - : CSKYGenInstrInfo(CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { +CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI) + : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), + STI(STI) { v2sf = STI.hasFPUv2SingleFloat(); v2df = STI.hasFPUv2DoubleFloat(); v3sf = STI.hasFPUv3SingleFloat(); diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 98f583e8b405..6451c0af14fc 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -33,7 +33,7 @@ protected: const CSKYSubtarget &STI; public: - explicit CSKYInstrInfo(CSKYSubtarget &STI); + explicit CSKYInstrInfo(const CSKYSubtarget &STI); Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td index c6bfc2495ae2..82e271e5b556 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td @@ -586,14 +586,23 @@ let Predicates = [iHasE2] in { BinOpFrag<(rotl node:$LHS, (and node:$RHS, 0x1f))>, "rotl32">; def 
BMASKI32 : I_5_Z<0b010100, 0x1, "bmaski32", oimm5, []>; - def LSLC32 : I_5_XZ<0x13, 0x1, "lslc32", - (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>; - def LSRC32 : I_5_XZ<0x13, 0x2, "lsrc32", - (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>; - def ASRC32 : I_5_XZ<0x13, 0x4, "asrc32", - (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>; - def XSR32 : I_5_XZ<0x13, 0x8, "xsr32", - (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5, CARRY:$cin), []>; + def LSLC32 : I_5_XZ<0x13, 0x1, "lslc32", (outs GPR:$rz, CARRY:$cout), + (ins GPR:$rx, oimm5:$imm5), []> { + bits<0> cout; + } + def LSRC32 : I_5_XZ<0x13, 0x2, "lsrc32", (outs GPR:$rz, CARRY:$cout), + (ins GPR:$rx, oimm5:$imm5), []> { + bits<0> cout; + } + def ASRC32 : I_5_XZ<0x13, 0x4, "asrc32", (outs GPR:$rz, CARRY:$cout), + (ins GPR:$rx, oimm5:$imm5), []> { + bits<0> cout; + } + def XSR32 : I_5_XZ<0x13, 0x8, "xsr32", (outs GPR:$rz, CARRY:$cout), + (ins GPR:$rx, oimm5:$imm5, CARRY:$cin), []> { + bits<0> cout; + bits<0> cin; + } def IXH32 : R_YXZ_SP_F1<0x2, 0x1, BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 1)))>, "ixh32">; @@ -605,9 +614,15 @@ let Predicates = [iHasE2] in { let isCommutable = 1, isAdd = 1 in def ADDC32 : R_YXZ<0x31, 0x0, 0x2, (outs GPR:$rz, CARRY:$cout), - (ins GPR:$rx, GPR:$ry, CARRY:$cin), "addc32", []>; + (ins GPR:$rx, GPR:$ry, CARRY:$cin), "addc32", []> { + bits<0> cout; + bits<0> cin; + } def SUBC32 : R_YXZ<0x31, 0x0, 0x8, (outs GPR:$rz, CARRY:$cout), - (ins GPR:$rx, GPR:$ry, CARRY:$cin), "subc32", []>; + (ins GPR:$rx, GPR:$ry, CARRY:$cin), "subc32", []> { + bits<0> cout; + bits<0> cin; + } def INCF32 : I_5_ZX<0x3, 0x1, "incf32", uimm5, []>; def INCT32 : I_5_ZX<0x3, 0x2, "inct32", uimm5, []>; @@ -621,12 +636,18 @@ let Predicates = [iHas2E3] in { def DIVU32 : R_YXZ_SP_F1<0x20, 0x1, BinOpFrag<(udiv node:$LHS, node:$RHS)>, "divu32">; - def DECGT32 : I_5_XZ<0x4, 0x1, "decgt32", - (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>; - def DECLT32 : 
I_5_XZ<0x4, 0x2, "declt32", - (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>; - def DECNE32 : I_5_XZ<0x4, 0x4, "decne32", - (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>; + def DECGT32 : I_5_XZ<0x4, 0x1, "decgt32", (outs GPR:$rz, CARRY:$cout), + (ins GPR:$rx, uimm5:$imm5), []> { + bits<0> cout; + } + def DECLT32 : I_5_XZ<0x4, 0x2, "declt32", (outs GPR:$rz, CARRY:$cout), + (ins GPR:$rx, uimm5:$imm5), []> { + bits<0> cout; + } + def DECNE32 : I_5_XZ<0x4, 0x4, "decne32", (outs GPR:$rz, CARRY:$cout), + (ins GPR:$rx, uimm5:$imm5), []> { + bits<0> cout; + } def SEXT32 : I_5_XZ_U<0x16, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "sext32", []>; let isCodeGenOnly = 1 in { @@ -744,8 +765,9 @@ let Predicates = [iHas2E3] in { def CMPHS32 : R_YX<0x1, 0x1, "cmphs32">; def CMPLT32 : R_YX<0x1, 0x2, "cmplt32">; - def SETC32 : CSKY32Inst<AddrModeNone, 0x31, - (outs CARRY:$ca), (ins), "setc32", []> { + def SETC32 : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins), "setc32", + []> { + bits<0> ca; let Inst{25 - 21} = 0; //rx let Inst{20 - 16} = 0; //ry let Inst{15 - 10} = 0x1; @@ -753,8 +775,9 @@ let Predicates = [iHas2E3] in { let Inst{4 - 0} = 0; let isCompare = 1; } - def CLRC32 : CSKY32Inst<AddrModeNone, 0x31, - (outs CARRY:$ca), (ins), "clrc32", []> { + def CLRC32 : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins), "clrc32", + []> { + bits<0> ca; let Inst{25 - 21} = 0; //rx let Inst{20 - 16} = 0; //ry let Inst{15 - 10} = 0x1; @@ -764,8 +787,10 @@ let Predicates = [iHas2E3] in { } def TST32 : R_YX<0x8, 0x4, "tst32">; - def TSTNBZ32 : R_X<0x8, 0x8, - (outs CARRY:$ca), (ins GPR:$rx), "tstnbz32", []>; + def TSTNBZ32 : R_X<0x8, 0x8, (outs CARRY:$ca), (ins GPR:$rx), "tstnbz32", + []> { + bits<0> ca; + } } //===----------------------------------------------------------------------===// @@ -806,9 +831,14 @@ let isBranch = 1, isTerminator = 1 in { [(br bb:$imm16)]>; def BT32 : I_16_L<0x3, (outs), (ins CARRY:$ca, br_symbol:$imm16), - 
"bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>, Requires<[iHasE2]>; + "bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>, + Requires<[iHasE2]> { + bits<0> ca; + } def BF32 : I_16_L<0x2, (outs), (ins CARRY:$ca, br_symbol:$imm16), - "bf32\t$imm16", []>, Requires<[iHasE2]>; + "bf32\t$imm16", []>, Requires<[iHasE2]> { + bits<0> ca; + } } let Predicates = [iHas2E3] in { @@ -1030,7 +1060,10 @@ def SE32 : I_5_XZ_PRIVI<0b010110, 0x1, "se32">; def WSC32 : I_5_XZ_PRIVI<0b001111, 0x1, "wsc32">; def CPOP32 : I_CPOP<(outs), (ins uimm5:$cpid, uimm20:$usdef), "cpop32 <$cpid, ${usdef}>">; -def CPRC32 : I_CP<0b0100, (outs CARRY:$ca), (ins uimm5:$cpid, uimm12:$usdef), "cprc32 <$cpid, ${usdef}>">; +def CPRC32 : I_CP<0b0100, (outs CARRY:$ca), (ins uimm5:$cpid, uimm12:$usdef), + "cprc32 <$cpid, ${usdef}>"> { + bits<0> ca; +} def CPRCR32 : I_CP_Z<0b0010, (outs GPR:$rz), (ins uimm5:$cpid, uimm12:$usdef), "cprcr32 $rz, <$cpid, ${usdef}>">; def CPRGR32 : I_CP_Z<0b0000, (outs GPR:$rz), (ins uimm5:$cpid, uimm12:$usdef), "cprgr32 $rz, <$cpid, ${usdef}>">; def CPWCR32 : I_CP_Z<0b0011, (outs), (ins GPR:$rz, uimm5:$cpid, uimm12:$usdef), "cpwcr32 $rz, <$cpid, ${usdef}>">; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td index 3e248019d73f..51645215f32a 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td @@ -102,7 +102,9 @@ def : Pat<(add GPR:$rs1, (oimm8_neg:$im)), let isAdd = 1 in def ADDI16ZSP : I16_Z_8<0b011, (ins GPRSP:$sp, uimm8_2:$imm8), - "addi16\t$rz, $sp, $imm8">; + "addi16\t$rz, $sp, $imm8"> { + bits<0> sp; +} let isAdd = 1 in def ADDI16SPSP : I16_SP_IMM7<0b000,"addi16">; @@ -142,10 +144,14 @@ def ST16H : I16_XZ_LDST<AddrMode16H, 0b101, "st16.h", def ST16W : I16_XZ_LDST<AddrMode16W, 0b110, "st16.w", (outs), (ins mGPR:$rz, mGPR:$rx, uimm5_2:$imm)>; -def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w", - (outs mGPR:$rz), (ins GPRSP:$sp, uimm8_2:$addr)>; -def ST16WSP : 
I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w", - (outs), (ins mGPR:$rz, GPRSP:$sp, uimm8_2:$addr)>; +def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w", (outs mGPR:$rz), + (ins GPRSP:$sp, uimm8_2:$addr)> { + bits<0> sp; +} +def ST16WSP : I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w", (outs), + (ins mGPR:$rz, GPRSP:$sp, uimm8_2:$addr)> { + bits<0> sp; +} //===----------------------------------------------------------------------===// // Compare instructions. @@ -187,8 +193,9 @@ def MOV16 : CSKY16Inst<AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rx), } // MVC16 is not in "cskyv2 instructions reference manul" -def MVCV16 : CSKY16Inst<AddrModeNone, - (outs sGPR:$rz), (ins CARRY:$ca), "mvcv16\t$rz", []> { +def MVCV16 : CSKY16Inst<AddrModeNone, (outs sGPR:$rz), (ins CARRY:$ca), + "mvcv16\t$rz", []> { + bits<0> ca; bits<4> rz; let Inst{15,14} = 0b01; let Inst{13 - 10} = 0b1001; @@ -317,11 +324,14 @@ let Constraints = "$rZ = $rz" in { } let Predicates = [HasBTST16] in - def BTSTI16 : I16_Z_5<0b110, (outs CARRY:$ca), (ins mGPR:$rz, uimm5:$imm5), - "btsti16">; +def BTSTI16 : I16_Z_5<0b110, (outs CARRY:$ca), (ins mGPR:$rz, uimm5:$imm5), + "btsti16"> { + bits<0> ca; +} def TST16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry), - "tst16\t$rx, $ry", []> { + "tst16\t$rx, $ry", []> { + bits<0> ca; bits<4> ry; bits<4> rx; let Inst{15,14} = 0b01; @@ -334,6 +344,7 @@ def TST16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry), def TSTNBZ16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx), "tstnbz16\t$rx", []> { + bits<0> ca; bits<4> rx; let Inst{15,14} = 0b01; let Inst{13 - 10} = 0b1010; diff --git a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp index 749127f4ddc8..887e28127953 100644 --- a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp +++ b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp @@ -36,8 +36,6 @@ class CSKYDisassembler : public MCDisassembler { 
std::unique_ptr<MCInstrInfo const> const MCII; mutable StringRef symbolName; - DecodeStatus handleCROperand(MCInst &Instr) const; - public: CSKYDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, MCInstrInfo const *MCII); @@ -198,15 +196,9 @@ static DecodeStatus DecodemGPRRegisterClass(MCInst &Inst, uint64_t RegNo, return MCDisassembler::Success; } -// TODO -LLVM_ATTRIBUTE_UNUSED -static DecodeStatus DecodeGPRSPRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, +static DecodeStatus DecodeGPRSPRegisterClass(MCInst &Inst, const MCDisassembler *Decoder) { - if (RegNo != 14) - return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo])); + Inst.addOperand(MCOperand::createReg(CSKY::R14)); return MCDisassembler::Success; } @@ -224,6 +216,12 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, uint64_t RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodeCARRYRegisterClass(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createReg(CSKY::C)); + return MCDisassembler::Success; +} + template <unsigned N, unsigned S> static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm, int64_t Address, @@ -378,121 +376,6 @@ static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm, #include "CSKYGenDisassemblerTables.inc" -DecodeStatus CSKYDisassembler::handleCROperand(MCInst &MI) const { - - // FIXME: To query instruction info from td file or a table inc file - switch (MI.getOpcode()) { - default: - return MCDisassembler::Success; - case CSKY::LD16WSP: - case CSKY::ST16WSP: - case CSKY::ADDI16ZSP: - MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::R14)); - return MCDisassembler::Success; - case CSKY::ADDI16SPSP: - case CSKY::SUBI16SPSP: - MI.insert(MI.begin(), MCOperand::createReg(CSKY::R14)); - MI.insert(MI.begin(), MCOperand::createReg(CSKY::R14)); - return MCDisassembler::Success; - case CSKY::FCMPHS_S: - case CSKY::FCMPHS_D: - case CSKY::FCMPLT_S: - 
case CSKY::FCMPLT_D: - case CSKY::FCMPNE_S: - case CSKY::FCMPNE_D: - case CSKY::FCMPUO_S: - case CSKY::FCMPUO_D: - case CSKY::FCMPZHS_S: - case CSKY::FCMPZHS_D: - case CSKY::FCMPZLS_S: - case CSKY::FCMPZLS_D: - case CSKY::FCMPZNE_S: - case CSKY::FCMPZNE_D: - case CSKY::FCMPZUO_S: - case CSKY::FCMPZUO_D: - case CSKY::f2FCMPHS_S: - case CSKY::f2FCMPHS_D: - case CSKY::f2FCMPLT_S: - case CSKY::f2FCMPLT_D: - case CSKY::f2FCMPNE_S: - case CSKY::f2FCMPNE_D: - case CSKY::f2FCMPUO_S: - case CSKY::f2FCMPUO_D: - case CSKY::f2FCMPHSZ_S: - case CSKY::f2FCMPHSZ_D: - case CSKY::f2FCMPHZ_S: - case CSKY::f2FCMPHZ_D: - case CSKY::f2FCMPLSZ_S: - case CSKY::f2FCMPLSZ_D: - case CSKY::f2FCMPLTZ_S: - case CSKY::f2FCMPLTZ_D: - case CSKY::f2FCMPNEZ_S: - case CSKY::f2FCMPNEZ_D: - case CSKY::f2FCMPUOZ_S: - case CSKY::f2FCMPUOZ_D: - - case CSKY::BT32: - case CSKY::BF32: - case CSKY::BT16: - case CSKY::BF16: - case CSKY::CMPNEI32: - case CSKY::CMPNEI16: - case CSKY::CMPNE32: - case CSKY::CMPNE16: - case CSKY::CMPHSI32: - case CSKY::CMPHSI16: - case CSKY::CMPHS32: - case CSKY::CMPHS16: - case CSKY::CMPLTI32: - case CSKY::CMPLTI16: - case CSKY::CMPLT32: - case CSKY::CMPLT16: - case CSKY::BTSTI32: - case CSKY::BTSTI16: - case CSKY::TSTNBZ32: - case CSKY::TSTNBZ16: - case CSKY::TST32: - case CSKY::TST16: - MI.insert(MI.begin(), MCOperand::createReg(CSKY::C)); - return MCDisassembler::Success; - case CSKY::LSLC32: - case CSKY::LSRC32: - case CSKY::ASRC32: - MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C)); - return MCDisassembler::Success; - case CSKY::MOVF32: - case CSKY::MOVT32: - case CSKY::MVC32: - case CSKY::MVCV32: - case CSKY::MVCV16: - case CSKY::INCT32: - case CSKY::INCF32: - case CSKY::DECT32: - case CSKY::DECF32: - case CSKY::DECGT32: - case CSKY::DECLT32: - case CSKY::DECNE32: - case CSKY::CLRF32: - case CSKY::CLRT32: - case CSKY::f2FSEL_S: - case CSKY::f2FSEL_D: - MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C)); - return MCDisassembler::Success; - case 
CSKY::ADDC32: - case CSKY::ADDC16: - case CSKY::SUBC32: - case CSKY::SUBC16: - case CSKY::XSR32: - MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C)); - MI.insert(MI.end(), MCOperand::createReg(CSKY::C)); - return MCDisassembler::Success; - case CSKY::INS32: - MI.getOperand(3).setImm(MI.getOperand(3).getImm() + - MI.getOperand(4).getImm()); - return MCDisassembler::Success; - } -} - static bool decodeFPUV3Instruction(MCInst &MI, uint32_t insn, uint64_t Address, const MCDisassembler *DisAsm, const MCSubtargetInfo &STI) { @@ -548,7 +431,10 @@ DecodeStatus CSKYDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Size = 2; } - handleCROperand(MI); + if (MI.getOpcode() == CSKY::INS32) { + MI.getOperand(3).setImm(MI.getOperand(3).getImm() + + MI.getOperand(4).getImm()); + } return Result; } diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt index 8100f941c8d9..6c079517e22d 100644 --- a/llvm/lib/Target/DirectX/CMakeLists.txt +++ b/llvm/lib/Target/DirectX/CMakeLists.txt @@ -41,6 +41,7 @@ add_llvm_target(DirectXCodeGen LINK_COMPONENTS Analysis AsmPrinter + BinaryFormat CodeGen CodeGenTypes Core diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index a1ef2578f00a..ca81d30473c0 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -158,12 +158,15 @@ void DXContainerGlobals::addRootSignature(Module &M, if (MMI.ShaderProfile == llvm::Triple::Library) return; - assert(MMI.EntryPropertyVec.size() == 1); - auto &RSA = getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo(); - const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; - const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction); + const Function *EntryFunction = nullptr; + if (MMI.ShaderProfile != llvm::Triple::RootSignature) { + assert(MMI.EntryPropertyVec.size() == 1); + EntryFunction = 
MMI.EntryPropertyVec[0].Entry; + } + + const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction); if (!RS) return; @@ -258,7 +261,8 @@ void DXContainerGlobals::addPipelineStateValidationInfo( dxil::ModuleMetadataInfo &MMI = getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); assert(MMI.EntryPropertyVec.size() == 1 || - MMI.ShaderProfile == Triple::Library); + MMI.ShaderProfile == Triple::Library || + MMI.ShaderProfile == Triple::RootSignature); PSV.BaseData.ShaderStage = static_cast<uint8_t>(MMI.ShaderProfile - Triple::Pixel); @@ -279,7 +283,8 @@ void DXContainerGlobals::addPipelineStateValidationInfo( break; } - if (MMI.ShaderProfile != Triple::Library) + if (MMI.ShaderProfile != Triple::Library && + MMI.ShaderProfile != Triple::RootSignature) PSV.EntryName = MMI.EntryPropertyVec[0].Entry->getName(); PSV.finalize(MMI.ShaderProfile); diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index feecfc0880e2..d507d71b99fc 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -343,9 +343,7 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { GOp->replaceAllUsesWith(NewGEP); - if (auto *CE = dyn_cast<ConstantExpr>(GOp)) - CE->destroyConstant(); - else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp)) + if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp)) OldGEPI->eraseFromParent(); return true; diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp index 13e3408815bb..aa16e795dc76 100644 --- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp +++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp @@ -22,11 +22,13 @@ static bool finalizeLinkage(Module &M) { // Convert private globals and external globals with no usage to internal // linkage. 
- for (GlobalVariable &GV : M.globals()) + for (GlobalVariable &GV : M.globals()) { + GV.removeDeadConstantUsers(); if (GV.hasPrivateLinkage() || (GV.hasExternalLinkage() && GV.use_empty())) { GV.setLinkage(GlobalValue::InternalLinkage); MadeChange = true; } + } SmallVector<Function *> Funcs; diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index ee1db54446cb..e2469d8df957 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -51,6 +51,150 @@ static bool resourceAccessNeeds64BitExpansion(Module *M, Type *OverloadTy, return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64); } +static Value *expand16BitIsInf(CallInst *Orig) { + Module *M = Orig->getModule(); + if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9)) + return nullptr; + + Value *Val = Orig->getOperand(0); + Type *ValTy = Val->getType(); + if (!ValTy->getScalarType()->isHalfTy()) + return nullptr; + + IRBuilder<> Builder(Orig); + Type *IType = Type::getInt16Ty(M->getContext()); + Constant *PosInf = + ValTy->isVectorTy() + ? ConstantVector::getSplat( + ElementCount::getFixed( + cast<FixedVectorType>(ValTy)->getNumElements()), + ConstantInt::get(IType, 0x7c00)) + : ConstantInt::get(IType, 0x7c00); + + Constant *NegInf = + ValTy->isVectorTy() + ? 
ConstantVector::getSplat( + ElementCount::getFixed( + cast<FixedVectorType>(ValTy)->getNumElements()), + ConstantInt::get(IType, 0xfc00)) + : ConstantInt::get(IType, 0xfc00); + + Value *IVal = Builder.CreateBitCast(Val, PosInf->getType()); + Value *B1 = Builder.CreateICmpEQ(IVal, PosInf); + Value *B2 = Builder.CreateICmpEQ(IVal, NegInf); + Value *B3 = Builder.CreateOr(B1, B2); + return B3; +} + +static Value *expand16BitIsNaN(CallInst *Orig) { + Module *M = Orig->getModule(); + if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9)) + return nullptr; + + Value *Val = Orig->getOperand(0); + Type *ValTy = Val->getType(); + if (!ValTy->getScalarType()->isHalfTy()) + return nullptr; + + IRBuilder<> Builder(Orig); + Type *IType = Type::getInt16Ty(M->getContext()); + + Constant *ExpBitMask = + ValTy->isVectorTy() + ? ConstantVector::getSplat( + ElementCount::getFixed( + cast<FixedVectorType>(ValTy)->getNumElements()), + ConstantInt::get(IType, 0x7c00)) + : ConstantInt::get(IType, 0x7c00); + Constant *SigBitMask = + ValTy->isVectorTy() + ? ConstantVector::getSplat( + ElementCount::getFixed( + cast<FixedVectorType>(ValTy)->getNumElements()), + ConstantInt::get(IType, 0x3ff)) + : ConstantInt::get(IType, 0x3ff); + + Constant *Zero = + ValTy->isVectorTy() + ? 
ConstantVector::getSplat( + ElementCount::getFixed( + cast<FixedVectorType>(ValTy)->getNumElements()), + ConstantInt::get(IType, 0)) + : ConstantInt::get(IType, 0); + + Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType()); + Value *Exp = Builder.CreateAnd(IVal, ExpBitMask); + Value *B1 = Builder.CreateICmpEQ(Exp, ExpBitMask); + + Value *Sig = Builder.CreateAnd(IVal, SigBitMask); + Value *B2 = Builder.CreateICmpNE(Sig, Zero); + Value *B3 = Builder.CreateAnd(B1, B2); + return B3; +} + +static Value *expand16BitIsFinite(CallInst *Orig) { + Module *M = Orig->getModule(); + if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9)) + return nullptr; + + Value *Val = Orig->getOperand(0); + Type *ValTy = Val->getType(); + if (!ValTy->getScalarType()->isHalfTy()) + return nullptr; + + IRBuilder<> Builder(Orig); + Type *IType = Type::getInt16Ty(M->getContext()); + + Constant *ExpBitMask = + ValTy->isVectorTy() + ? ConstantVector::getSplat( + ElementCount::getFixed( + cast<FixedVectorType>(ValTy)->getNumElements()), + ConstantInt::get(IType, 0x7c00)) + : ConstantInt::get(IType, 0x7c00); + + Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType()); + Value *Exp = Builder.CreateAnd(IVal, ExpBitMask); + Value *B1 = Builder.CreateICmpNE(Exp, ExpBitMask); + return B1; +} + +static Value *expand16BitIsNormal(CallInst *Orig) { + Module *M = Orig->getModule(); + if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9)) + return nullptr; + + Value *Val = Orig->getOperand(0); + Type *ValTy = Val->getType(); + if (!ValTy->getScalarType()->isHalfTy()) + return nullptr; + + IRBuilder<> Builder(Orig); + Type *IType = Type::getInt16Ty(M->getContext()); + + Constant *ExpBitMask = + ValTy->isVectorTy() + ? ConstantVector::getSplat( + ElementCount::getFixed( + cast<FixedVectorType>(ValTy)->getNumElements()), + ConstantInt::get(IType, 0x7c00)) + : ConstantInt::get(IType, 0x7c00); + Constant *Zero = + ValTy->isVectorTy() + ? 
ConstantVector::getSplat( + ElementCount::getFixed( + cast<FixedVectorType>(ValTy)->getNumElements()), + ConstantInt::get(IType, 0)) + : ConstantInt::get(IType, 0); + + Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType()); + Value *Exp = Builder.CreateAnd(IVal, ExpBitMask); + Value *NotAllZeroes = Builder.CreateICmpNE(Exp, Zero); + Value *NotAllOnes = Builder.CreateICmpNE(Exp, ExpBitMask); + Value *B1 = Builder.CreateAnd(NotAllZeroes, NotAllOnes); + return B1; +} + static bool isIntrinsicExpansion(Function &F) { switch (F.getIntrinsicID()) { case Intrinsic::abs: @@ -68,6 +212,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::dx_sclamp: case Intrinsic::dx_nclamp: case Intrinsic::dx_degrees: + case Intrinsic::dx_isinf: case Intrinsic::dx_lerp: case Intrinsic::dx_normalize: case Intrinsic::dx_fdot: @@ -301,13 +446,16 @@ static Value *expandIsFPClass(CallInst *Orig) { auto *TCI = dyn_cast<ConstantInt>(T); // These FPClassTest cases have DXIL opcodes, so they will be handled in - // DXIL Op Lowering instead. + // DXIL Op Lowering instead for all non f16 cases. 
switch (TCI->getZExtValue()) { case FPClassTest::fcInf: + return expand16BitIsInf(Orig); case FPClassTest::fcNan: + return expand16BitIsNaN(Orig); case FPClassTest::fcNormal: + return expand16BitIsNormal(Orig); case FPClassTest::fcFinite: - return nullptr; + return expand16BitIsFinite(Orig); } IRBuilder<> Builder(Orig); @@ -873,6 +1021,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::dx_degrees: Result = expandDegreesIntrinsic(Orig); break; + case Intrinsic::dx_isinf: + Result = expand16BitIsInf(Orig); + break; case Intrinsic::dx_lerp: Result = expandLerpIntrinsic(Orig); break; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index bd421771e8ed..577b4624458b 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -220,7 +220,7 @@ public: removeResourceGlobals(CI); - auto *NameGlobal = dyn_cast<llvm::GlobalVariable>(CI->getArgOperand(5)); + auto *NameGlobal = dyn_cast<llvm::GlobalVariable>(CI->getArgOperand(4)); CI->replaceAllUsesWith(Replacement); CI->eraseFromParent(); @@ -233,6 +233,7 @@ public: IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int8Ty = IRB.getInt8Ty(); Type *Int32Ty = IRB.getInt32Ty(); + Type *Int1Ty = IRB.getInt1Ty(); return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); @@ -249,10 +250,13 @@ public: IndexOp = IRB.CreateAdd(IndexOp, ConstantInt::get(Int32Ty, Binding.LowerBound)); + // FIXME: The last argument is a NonUniform flag which needs to be set + // based on resource analysis. 
+ // https://github.com/llvm/llvm-project/issues/155701 std::array<Value *, 4> Args{ ConstantInt::get(Int8Ty, llvm::to_underlying(RC)), ConstantInt::get(Int32Ty, Binding.RecordID), IndexOp, - CI->getArgOperand(4)}; + ConstantInt::get(Int1Ty, false)}; Expected<CallInst *> OpCall = OpBuilder.tryCreateOp(OpCode::CreateHandle, Args, CI->getName()); if (Error E = OpCall.takeError()) @@ -267,6 +271,7 @@ public: [[nodiscard]] bool lowerToBindAndAnnotateHandle(Function &F) { IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int32Ty = IRB.getInt32Ty(); + Type *Int1Ty = IRB.getInt1Ty(); return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); @@ -295,7 +300,11 @@ public: : Binding.LowerBound + Binding.Size - 1; Constant *ResBind = OpBuilder.getResBind(Binding.LowerBound, UpperBound, Binding.Space, RC); - std::array<Value *, 3> BindArgs{ResBind, IndexOp, CI->getArgOperand(4)}; + // FIXME: The last argument is a NonUniform flag which needs to be set + // based on resource analysis. 
+ // https://github.com/llvm/llvm-project/issues/155701 + Constant *NonUniform = ConstantInt::get(Int1Ty, false); + std::array<Value *, 3> BindArgs{ResBind, IndexOp, NonUniform}; Expected<CallInst *> OpBind = OpBuilder.tryCreateOp( OpCode::CreateHandleFromBinding, BindArgs, CI->getName()); if (Error E = OpBind.takeError()) diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp index be2c7d1ddff3..d02f4b9f7ebc 100644 --- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp +++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp @@ -25,21 +25,6 @@ using namespace llvm; using namespace llvm::dxil; -static ResourceClass toResourceClass(dxbc::DescriptorRangeType RangeType) { - using namespace dxbc; - switch (RangeType) { - case DescriptorRangeType::SRV: - return ResourceClass::SRV; - case DescriptorRangeType::UAV: - return ResourceClass::UAV; - case DescriptorRangeType::CBV: - return ResourceClass::CBuffer; - case DescriptorRangeType::Sampler: - return ResourceClass::Sampler; - } - llvm_unreachable("Unknown DescriptorRangeType"); -} - static ResourceClass toResourceClass(dxbc::RootParameterType Type) { using namespace dxbc; switch (Type) { @@ -95,7 +80,7 @@ static void reportOverlappingError(Module &M, ResourceInfo R1, } static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) { - bool ErrorFound = false; + [[maybe_unused]] bool ErrorFound = false; for (const auto &ResList : {DRM.srvs(), DRM.uavs(), DRM.cbuffers(), DRM.samplers()}) { if (ResList.empty()) @@ -118,10 +103,8 @@ static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) { "true, yet no overlapping binding was found"); } -static void -reportOverlappingRegisters(Module &M, - const llvm::hlsl::BindingInfoBuilder::Binding &R1, - const llvm::hlsl::BindingInfoBuilder::Binding &R2) { +static void reportOverlappingRegisters(Module &M, const llvm::hlsl::Binding &R1, + const llvm::hlsl::Binding &R2) 
{ SmallString<128> Message; raw_svector_ostream OS(Message); @@ -133,6 +116,17 @@ reportOverlappingRegisters(Module &M, M.getContext().diagnose(DiagnosticInfoGeneric(Message)); } +static void +reportRegNotBound(Module &M, ResourceClass Class, + const llvm::dxil::ResourceInfo::ResourceBinding &Unbound) { + SmallString<128> Message; + raw_svector_ostream OS(Message); + OS << getResourceClassName(Class) << " register " << Unbound.LowerBound + << " in space " << Unbound.Space + << " does not have a binding in the Root Signature"; + M.getContext().diagnose(DiagnosticInfoGeneric(Message)); +} + static dxbc::ShaderVisibility tripleToVisibility(llvm::Triple::EnvironmentType ET) { switch (ET) { @@ -157,22 +151,23 @@ tripleToVisibility(llvm::Triple::EnvironmentType ET) { static void validateRootSignature(Module &M, const mcdxbc::RootSignatureDesc &RSD, - dxil::ModuleMetadataInfo &MMI) { + dxil::ModuleMetadataInfo &MMI, + DXILResourceMap &DRM, + DXILResourceTypeMap &DRTM) { hlsl::BindingInfoBuilder Builder; dxbc::ShaderVisibility Visibility = tripleToVisibility(MMI.ShaderProfile); for (const mcdxbc::RootParameterInfo &ParamInfo : RSD.ParametersContainer) { dxbc::ShaderVisibility ParamVisibility = - static_cast<dxbc::ShaderVisibility>(ParamInfo.Header.ShaderVisibility); + dxbc::ShaderVisibility(ParamInfo.Visibility); if (ParamVisibility != dxbc::ShaderVisibility::All && ParamVisibility != Visibility) continue; - dxbc::RootParameterType ParamType = - static_cast<dxbc::RootParameterType>(ParamInfo.Header.ParameterType); + dxbc::RootParameterType ParamType = dxbc::RootParameterType(ParamInfo.Type); switch (ParamType) { case dxbc::RootParameterType::Constants32Bit: { - dxbc::RTS0::v1::RootConstants Const = + mcdxbc::RootConstants Const = RSD.ParametersContainer.getConstant(ParamInfo.Location); Builder.trackBinding(dxil::ResourceClass::CBuffer, Const.RegisterSpace, Const.ShaderRegister, Const.ShaderRegister, @@ -183,12 +178,11 @@ static void validateRootSignature(Module &M, case 
dxbc::RootParameterType::SRV: case dxbc::RootParameterType::UAV: case dxbc::RootParameterType::CBV: { - dxbc::RTS0::v2::RootDescriptor Desc = + mcdxbc::RootDescriptor Desc = RSD.ParametersContainer.getRootDescriptor(ParamInfo.Location); - Builder.trackBinding(toResourceClass(static_cast<dxbc::RootParameterType>( - ParamInfo.Header.ParameterType)), - Desc.RegisterSpace, Desc.ShaderRegister, - Desc.ShaderRegister, &ParamInfo); + Builder.trackBinding(toResourceClass(ParamInfo.Type), Desc.RegisterSpace, + Desc.ShaderRegister, Desc.ShaderRegister, + &ParamInfo); break; } @@ -196,16 +190,13 @@ static void validateRootSignature(Module &M, const mcdxbc::DescriptorTable &Table = RSD.ParametersContainer.getDescriptorTable(ParamInfo.Location); - for (const dxbc::RTS0::v2::DescriptorRange &Range : Table.Ranges) { + for (const mcdxbc::DescriptorRange &Range : Table.Ranges) { uint32_t UpperBound = Range.NumDescriptors == ~0U ? Range.BaseShaderRegister : Range.BaseShaderRegister + Range.NumDescriptors - 1; - Builder.trackBinding( - toResourceClass( - static_cast<dxbc::DescriptorRangeType>(Range.RangeType)), - Range.RegisterSpace, Range.BaseShaderRegister, UpperBound, - &ParamInfo); + Builder.trackBinding(Range.RangeType, Range.RegisterSpace, + Range.BaseShaderRegister, UpperBound, &ParamInfo); } break; } @@ -218,11 +209,19 @@ static void validateRootSignature(Module &M, Builder.calculateBindingInfo( [&M](const llvm::hlsl::BindingInfoBuilder &Builder, - const llvm::hlsl::BindingInfoBuilder::Binding &ReportedBinding) { - const llvm::hlsl::BindingInfoBuilder::Binding &Overlaping = + const llvm::hlsl::Binding &ReportedBinding) { + const llvm::hlsl::Binding &Overlaping = Builder.findOverlapping(ReportedBinding); reportOverlappingRegisters(M, ReportedBinding, Overlaping); }); + const hlsl::BoundRegs &BoundRegs = Builder.takeBoundRegs(); + for (const ResourceInfo &RI : DRM) { + const ResourceInfo::ResourceBinding &Binding = RI.getBinding(); + ResourceClass RC = 
DRTM[RI.getHandleTy()].getResourceClass(); + if (!BoundRegs.isBound(RC, Binding.Space, Binding.LowerBound, + Binding.LowerBound + Binding.Size - 1)) + reportRegNotBound(M, RC, Binding); + } } static mcdxbc::RootSignatureDesc * @@ -236,7 +235,8 @@ getRootSignature(RootSignatureBindingInfo &RSBI, static void reportErrors(Module &M, DXILResourceMap &DRM, DXILResourceBindingInfo &DRBI, RootSignatureBindingInfo &RSBI, - dxil::ModuleMetadataInfo &MMI) { + dxil::ModuleMetadataInfo &MMI, + DXILResourceTypeMap &DRTM) { if (DRM.hasInvalidCounterDirection()) reportInvalidDirection(M, DRM); @@ -247,7 +247,7 @@ static void reportErrors(Module &M, DXILResourceMap &DRM, "DXILResourceImplicitBinding pass"); if (mcdxbc::RootSignatureDesc *RSD = getRootSignature(RSBI, MMI)) - validateRootSignature(M, *RSD, MMI); + validateRootSignature(M, *RSD, MMI, DRM, DRTM); } PreservedAnalyses @@ -256,8 +256,9 @@ DXILPostOptimizationValidation::run(Module &M, ModuleAnalysisManager &MAM) { DXILResourceBindingInfo &DRBI = MAM.getResult<DXILResourceBindingAnalysis>(M); RootSignatureBindingInfo &RSBI = MAM.getResult<RootSignatureAnalysis>(M); ModuleMetadataInfo &MMI = MAM.getResult<DXILMetadataAnalysis>(M); + DXILResourceTypeMap &DRTM = MAM.getResult<DXILResourceTypeAnalysis>(M); - reportErrors(M, DRM, DRBI, RSBI, MMI); + reportErrors(M, DRM, DRBI, RSBI, MMI, DRTM); return PreservedAnalyses::all(); } @@ -273,8 +274,10 @@ public: getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo(); dxil::ModuleMetadataInfo &MMI = getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); + DXILResourceTypeMap &DRTM = + getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap(); - reportErrors(M, DRM, DRBI, RSBI, MMI); + reportErrors(M, DRM, DRBI, RSBI, MMI, DRTM); return false; } StringRef getPassName() const override { @@ -288,6 +291,7 @@ public: AU.addRequired<DXILResourceBindingWrapperPass>(); AU.addRequired<DXILMetadataAnalysisWrapperPass>(); AU.addRequired<RootSignatureAnalysisWrapper>(); + 
AU.addRequired<DXILResourceTypeWrapperPass>(); AU.addPreserved<DXILResourceWrapperPass>(); AU.addPreserved<DXILResourceBindingWrapperPass>(); AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); @@ -305,6 +309,7 @@ INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper) +INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass) INITIALIZE_PASS_END(DXILPostOptimizationValidationLegacy, DEBUG_TYPE, "DXIL Post Optimization Validation", false, false) diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index c33ec0efd73c..6579d3405cf3 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -8,14 +8,19 @@ #include "DXILResourceAccess.h" #include "DirectX.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsDirectX.h" +#include "llvm/IR/User.h" #include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "dxil-resource-access" @@ -198,6 +203,112 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, llvm_unreachable("Unhandled case in switch"); } +static SmallVector<Instruction *> collectBlockUseDef(Instruction *Start) { + SmallPtrSet<Instruction *, 32> Visited; + SmallVector<Instruction *, 32> Worklist; + SmallVector<Instruction *> Out; + auto *BB = Start->getParent(); + + // Seed with direct users in this block. 
+ for (User *U : Start->users()) { + if (auto *I = dyn_cast<Instruction>(U)) { + if (I->getParent() == BB) + Worklist.push_back(I); + } + } + + // BFS over transitive users, constrained to the same block. + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + if (!Visited.insert(I).second) + continue; + Out.push_back(I); + + for (User *U : I->users()) { + if (auto *J = dyn_cast<Instruction>(U)) { + if (J->getParent() == BB) + Worklist.push_back(J); + } + } + for (Use &V : I->operands()) { + if (auto *J = dyn_cast<Instruction>(V)) { + if (J->getParent() == BB && V != Start) + Worklist.push_back(J); + } + } + } + + // Order results in program order. + DenseMap<const Instruction *, unsigned> Ord; + unsigned Idx = 0; + for (Instruction &I : *BB) + Ord[&I] = Idx++; + + llvm::sort(Out, [&](Instruction *A, Instruction *B) { + return Ord.lookup(A) < Ord.lookup(B); + }); + + return Out; +} + +static void phiNodeRemapHelper(PHINode *Phi, BasicBlock *BB, + IRBuilder<> &Builder, + SmallVector<Instruction *> &UsesInBlock) { + + ValueToValueMapTy VMap; + Value *Val = Phi->getIncomingValueForBlock(BB); + VMap[Phi] = Val; + Builder.SetInsertPoint(&BB->back()); + for (Instruction *I : UsesInBlock) { + // don't clone over the Phi just remap them + if (auto *PhiNested = dyn_cast<PHINode>(I)) { + VMap[PhiNested] = PhiNested->getIncomingValueForBlock(BB); + continue; + } + Instruction *Clone = I->clone(); + RemapInstruction(Clone, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + Builder.Insert(Clone); + VMap[I] = Clone; + } +} + +static void phiNodeReplacement(IntrinsicInst *II, + SmallVectorImpl<Instruction *> &PrevBBDeadInsts, + SetVector<BasicBlock *> &DeadBB) { + SmallVector<Instruction *> CurrBBDeadInsts; + for (User *U : II->users()) { + auto *Phi = dyn_cast<PHINode>(U); + if (!Phi) + continue; + + IRBuilder<> Builder(Phi); + SmallVector<Instruction *> UsesInBlock = collectBlockUseDef(Phi); + bool HasReturnUse = 
isa<ReturnInst>(UsesInBlock.back()); + + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I < E; I++) { + auto *CurrIncomingBB = Phi->getIncomingBlock(I); + phiNodeRemapHelper(Phi, CurrIncomingBB, Builder, UsesInBlock); + if (HasReturnUse) + PrevBBDeadInsts.push_back(&CurrIncomingBB->back()); + } + + CurrBBDeadInsts.push_back(Phi); + + for (Instruction *I : UsesInBlock) { + CurrBBDeadInsts.push_back(I); + } + if (HasReturnUse) { + BasicBlock *PhiBB = Phi->getParent(); + DeadBB.insert(PhiBB); + } + } + // Traverse the now-dead instructions in RPO and remove them. + for (Instruction *Dead : llvm::reverse(CurrBBDeadInsts)) + Dead->eraseFromParent(); + CurrBBDeadInsts.clear(); +} + static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) { // Process users keeping track of indexing accumulated from GEPs. struct AccessAndOffset { @@ -229,7 +340,6 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) { } else if (auto *LI = dyn_cast<LoadInst>(Current.Access)) { createLoadIntrinsic(II, LI, Current.Offset, RTI); DeadInsts.push_back(LI); - } else llvm_unreachable("Unhandled instruction - pointer escaped?"); } @@ -242,13 +352,27 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) { static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) { SmallVector<std::pair<IntrinsicInst *, dxil::ResourceTypeInfo>> Resources; - for (BasicBlock &BB : F) + SetVector<BasicBlock *> DeadBB; + SmallVector<Instruction *> PrevBBDeadInsts; + for (BasicBlock &BB : make_early_inc_range(F)) { + for (Instruction &I : make_early_inc_range(BB)) + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + if (II->getIntrinsicID() == Intrinsic::dx_resource_getpointer) + phiNodeReplacement(II, PrevBBDeadInsts, DeadBB); + for (Instruction &I : BB) if (auto *II = dyn_cast<IntrinsicInst>(&I)) if (II->getIntrinsicID() == Intrinsic::dx_resource_getpointer) { auto *HandleTy = cast<TargetExtType>(II->getArgOperand(0)->getType()); 
Resources.emplace_back(II, DRTM[HandleTy]); } + } + for (auto *Dead : PrevBBDeadInsts) + Dead->eraseFromParent(); + PrevBBDeadInsts.clear(); + for (auto *Dead : DeadBB) + Dead->eraseFromParent(); + DeadBB.clear(); for (auto &[II, RI] : Resources) replaceAccess(II, RI); @@ -279,7 +403,6 @@ public: bool runOnFunction(Function &F) override { DXILResourceTypeMap &DRTM = getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap(); - return transformResourcePointers(F, DRTM); } StringRef getPassName() const override { return "DXIL Resource Access"; } diff --git a/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp b/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp index 6e69c5ac1d63..b0d9ad8da10e 100644 --- a/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp @@ -111,8 +111,7 @@ static bool assignBindings(Module &M, DXILResourceBindingInfo &DRBI, RegSlotOp, /* register slot */ IB.Call->getOperand(2), /* size */ IB.Call->getOperand(3), /* index */ - IB.Call->getOperand(4), /* non-uniform flag */ - IB.Call->getOperand(5)}); /* name */ + IB.Call->getOperand(4)}); /* name */ IB.Call->replaceAllUsesWith(NewCall); IB.Call->eraseFromParent(); Changed = true; diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index a4f5086c2f42..ac3c7dde6b89 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -24,9 +24,11 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" +#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" #include <cstdint> @@ -70,6 +72,13 @@ analyzeModule(Module &M) { if (RootSignatureNode == nullptr) return RSDMap; + bool AllowNullFunctions = false; + if 
(M.getTargetTriple().getEnvironment() == + Triple::EnvironmentType::RootSignature) { + assert(RootSignatureNode->getNumOperands() == 1); + AllowNullFunctions = true; + } + for (const auto &RSDefNode : RootSignatureNode->operands()) { if (RSDefNode->getNumOperands() != 3) { reportError(Ctx, "Invalid Root Signature metadata - expected function, " @@ -78,24 +87,28 @@ analyzeModule(Module &M) { } // Function was pruned during compilation. - const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0); - if (FunctionPointerMdNode == nullptr) { - reportError( - Ctx, "Function associated with Root Signature definition is null."); - continue; - } + Function *F = nullptr; + + if (!AllowNullFunctions) { + const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0); + if (FunctionPointerMdNode == nullptr) { + reportError( + Ctx, "Function associated with Root Signature definition is null."); + continue; + } - ValueAsMetadata *VAM = - llvm::dyn_cast<ValueAsMetadata>(FunctionPointerMdNode.get()); - if (VAM == nullptr) { - reportError(Ctx, "First element of root signature is not a Value"); - continue; - } + ValueAsMetadata *VAM = + llvm::dyn_cast<ValueAsMetadata>(FunctionPointerMdNode.get()); + if (VAM == nullptr) { + reportError(Ctx, "First element of root signature is not a Value"); + continue; + } - Function *F = dyn_cast<Function>(VAM->getValue()); - if (F == nullptr) { - reportError(Ctx, "First element of root signature is not a Function"); - continue; + F = dyn_cast<Function>(VAM->getValue()); + if (F == nullptr) { + reportError(Ctx, "First element of root signature is not a Function"); + continue; + } } Metadata *RootElementListOperand = RSDefNode->getOperand(1).get(); @@ -171,41 +184,41 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, << "RootParametersOffset: " << RS.RootParameterOffset << "\n" << "NumParameters: " << RS.ParametersContainer.size() << "\n"; for (size_t I = 0; I < RS.ParametersContainer.size(); I++) { - const auto &[Type, Loc] 
= - RS.ParametersContainer.getTypeAndLocForParameter(I); - const dxbc::RTS0::v1::RootParameterHeader Header = - RS.ParametersContainer.getHeader(I); - - OS << "- Parameter Type: " << Type << "\n" - << " Shader Visibility: " << Header.ShaderVisibility << "\n"; - - switch (Type) { - case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): { - const dxbc::RTS0::v1::RootConstants &Constants = - RS.ParametersContainer.getConstant(Loc); + const mcdxbc::RootParameterInfo &Info = RS.ParametersContainer.getInfo(I); + + OS << "- Parameter Type: " + << enumToStringRef(Info.Type, dxbc::getRootParameterTypes()) << "\n" + << " Shader Visibility: " + << enumToStringRef(Info.Visibility, dxbc::getShaderVisibility()) + << "\n"; + switch (Info.Type) { + case dxbc::RootParameterType::Constants32Bit: { + const mcdxbc::RootConstants &Constants = + RS.ParametersContainer.getConstant(Info.Location); OS << " Register Space: " << Constants.RegisterSpace << "\n" << " Shader Register: " << Constants.ShaderRegister << "\n" << " Num 32 Bit Values: " << Constants.Num32BitValues << "\n"; break; } - case llvm::to_underlying(dxbc::RootParameterType::CBV): - case llvm::to_underlying(dxbc::RootParameterType::UAV): - case llvm::to_underlying(dxbc::RootParameterType::SRV): { - const dxbc::RTS0::v2::RootDescriptor &Descriptor = - RS.ParametersContainer.getRootDescriptor(Loc); + case dxbc::RootParameterType::CBV: + case dxbc::RootParameterType::UAV: + case dxbc::RootParameterType::SRV: { + const mcdxbc::RootDescriptor &Descriptor = + RS.ParametersContainer.getRootDescriptor(Info.Location); OS << " Register Space: " << Descriptor.RegisterSpace << "\n" << " Shader Register: " << Descriptor.ShaderRegister << "\n"; if (RS.Version > 1) OS << " Flags: " << Descriptor.Flags << "\n"; break; } - case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): { + case dxbc::RootParameterType::DescriptorTable: { const mcdxbc::DescriptorTable &Table = - RS.ParametersContainer.getDescriptorTable(Loc); + 
RS.ParametersContainer.getDescriptorTable(Info.Location); OS << " NumRanges: " << Table.Ranges.size() << "\n"; - for (const dxbc::RTS0::v2::DescriptorRange Range : Table) { - OS << " - Range Type: " << Range.RangeType << "\n" + for (const mcdxbc::DescriptorRange &Range : Table) { + OS << " - Range Type: " + << dxil::getResourceClassName(Range.RangeType) << "\n" << " Register Space: " << Range.RegisterSpace << "\n" << " Base Shader Register: " << Range.BaseShaderRegister << "\n" << " Num Descriptors: " << Range.NumDescriptors << "\n" diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 82bcacee7a6d..9eebcc9b1306 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -127,6 +127,8 @@ static StringRef getShortShaderStage(Triple::EnvironmentType Env) { return "ms"; case Triple::Amplification: return "as"; + case Triple::RootSignature: + return "rootsig"; default: break; } diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 1d79c3018439..bc1a3a7995bd 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -2113,7 +2113,7 @@ void DXILBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, } break; case Instruction::GetElementPtr: { - Code = bitc::CST_CODE_CE_GEP; + Code = bitc::CST_CODE_CE_GEP_OLD; const auto *GO = cast<GEPOperator>(C); if (GO->isInBounds()) Code = bitc::CST_CODE_CE_INBOUNDS_GEP; diff --git a/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp b/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp index f99bb4f4eaee..c2e139edc6bd 100644 --- a/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp +++ b/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp @@ -15,25 +15,39 @@ #include "llvm/IR/GlobalVariable.h" 
#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" using namespace llvm; using namespace llvm::dxil; namespace { +Type *classifyFunctionType(const Function &F, PointerTypeMap &Map); + // Classifies the type of the value passed in by walking the value's users to // find a typed instruction to materialize a type from. Type *classifyPointerType(const Value *V, PointerTypeMap &Map) { assert(V->getType()->isPointerTy() && "classifyPointerType called with non-pointer"); + + // A CallInst will trigger this case, and we want to classify its Function + // operand as a Function rather than a generic Value. + if (const Function *F = dyn_cast<Function>(V)) + return classifyFunctionType(*F, Map); + + // There can potentially be dead constants hanging off of the globals we do + // not want to deal with. So we remove them here. + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + GV->removeDeadConstantUsers(); + auto It = Map.find(V); if (It != Map.end()) return It->second; Type *PointeeTy = nullptr; - if (auto *Inst = dyn_cast<GetElementPtrInst>(V)) { - if (!Inst->getResultElementType()->isPointerTy()) - PointeeTy = Inst->getResultElementType(); + if (auto *GEP = dyn_cast<GEPOperator>(V)) { + if (!GEP->getResultElementType()->isPointerTy()) + PointeeTy = GEP->getResultElementType(); } else if (auto *Inst = dyn_cast<AllocaInst>(V)) { PointeeTy = Inst->getAllocatedType(); } else if (auto *GV = dyn_cast<GlobalVariable>(V)) { @@ -49,8 +63,8 @@ Type *classifyPointerType(const Value *V, PointerTypeMap &Map) { // When store value is ptr type, cannot get more type info. 
if (NewPointeeTy->isPointerTy()) continue; - } else if (const auto *Inst = dyn_cast<GetElementPtrInst>(User)) { - NewPointeeTy = Inst->getSourceElementType(); + } else if (const auto *GEP = dyn_cast<GEPOperator>(User)) { + NewPointeeTy = GEP->getSourceElementType(); } if (NewPointeeTy) { // HLSL doesn't support pointers, so it is unlikely to get more than one @@ -204,6 +218,9 @@ PointerTypeMap PointerTypeAnalysis::run(const Module &M) { for (const auto &I : B) { if (I.getType()->isPointerTy()) classifyPointerType(&I, Map); + for (const auto &O : I.operands()) + if (O.get()->getType()->isPointerTy()) + classifyPointerType(O.get(), Map); } } } diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp index 07b68648f16c..bb2efa43d818 100644 --- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp @@ -11,10 +11,14 @@ //===----------------------------------------------------------------------===// #include "DirectXInstrInfo.h" +#include "DirectXSubtarget.h" #define GET_INSTRINFO_CTOR_DTOR #include "DirectXGenInstrInfo.inc" using namespace llvm; +DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI) + : DirectXGenInstrInfo(STI) {} + DirectXInstrInfo::~DirectXInstrInfo() {} diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.h b/llvm/lib/Target/DirectX/DirectXInstrInfo.h index e2c7036fc74a..57ede28030b2 100644 --- a/llvm/lib/Target/DirectX/DirectXInstrInfo.h +++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.h @@ -20,9 +20,11 @@ #include "DirectXGenInstrInfo.inc" namespace llvm { +class DirectXSubtarget; + struct DirectXInstrInfo : public DirectXGenInstrInfo { const DirectXRegisterInfo RI; - explicit DirectXInstrInfo() : DirectXGenInstrInfo() {} + explicit DirectXInstrInfo(const DirectXSubtarget &STI); const DirectXRegisterInfo &getRegisterInfo() const { return RI; } ~DirectXInstrInfo() override; }; diff --git a/llvm/lib/Target/DirectX/DirectXSubtarget.cpp 
b/llvm/lib/Target/DirectX/DirectXSubtarget.cpp index 526b7d29fb13..f8519177cc2d 100644 --- a/llvm/lib/Target/DirectX/DirectXSubtarget.cpp +++ b/llvm/lib/Target/DirectX/DirectXSubtarget.cpp @@ -24,6 +24,7 @@ using namespace llvm; DirectXSubtarget::DirectXSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const DirectXTargetMachine &TM) - : DirectXGenSubtargetInfo(TT, CPU, CPU, FS), FL(*this), TL(TM, *this) {} + : DirectXGenSubtargetInfo(TT, CPU, CPU, FS), InstrInfo(*this), FL(*this), + TL(TM, *this) {} void DirectXSubtarget::anchor() {} diff --git a/llvm/lib/Target/DirectX/DirectXSubtarget.h b/llvm/lib/Target/DirectX/DirectXSubtarget.h index b2374caaf3cd..f3d71c4c4e3b 100644 --- a/llvm/lib/Target/DirectX/DirectXSubtarget.h +++ b/llvm/lib/Target/DirectX/DirectXSubtarget.h @@ -28,9 +28,9 @@ namespace llvm { class DirectXTargetMachine; class DirectXSubtarget : public DirectXGenSubtargetInfo { + DirectXInstrInfo InstrInfo; DirectXFrameLowering FL; DirectXTargetLowering TL; - DirectXInstrInfo InstrInfo; virtual void anchor(); // virtual anchor method diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index de10092cbe3c..0639878c1256 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -173,6 +173,19 @@ static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp, const MCDisassembler *Decoder); static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const MCDisassembler *Decoder); + +static DecodeStatus n1ConstDecoder(MCInst &MI, const MCDisassembler *Decoder) { + MCContext &Ctx = Decoder->getContext(); + MI.addOperand(MCOperand::createExpr(MCConstantExpr::create(-1, Ctx))); + return DecodeStatus::Success; +} + +static DecodeStatus sgp10ConstDecoder(MCInst &MI, + const MCDisassembler *Decoder) { + MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0)); + return 
DecodeStatus::Success; +} + #include "HexagonDepDecoders.inc" #include "HexagonGenDisassemblerTables.inc" @@ -349,21 +362,6 @@ void HexagonDisassembler::remapInstruction(MCInst &Instr) const { } } -static void adjustDuplex(MCInst &MI, MCContext &Context) { - switch (MI.getOpcode()) { - case Hexagon::SA1_setin1: - MI.insert(MI.begin() + 1, - MCOperand::createExpr(MCConstantExpr::create(-1, Context))); - break; - case Hexagon::SA1_dec: - MI.insert(MI.begin() + 2, - MCOperand::createExpr(MCConstantExpr::create(-1, Context))); - break; - default: - break; - } -} - DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, ArrayRef<uint8_t> Bytes, uint64_t Address, @@ -468,12 +466,10 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, CurrentExtender = TmpExtender; if (Result != DecodeStatus::Success) return DecodeStatus::Fail; - adjustDuplex(*MILow, getContext()); Result = decodeInstruction( DecodeHigh, *MIHigh, (Instruction >> 16) & 0x1fff, Address, this, STI); if (Result != DecodeStatus::Success) return DecodeStatus::Fail; - adjustDuplex(*MIHigh, getContext()); MCOperand OPLow = MCOperand::createInst(MILow); MCOperand OPHigh = MCOperand::createInst(MIHigh); MI.addOperand(OPLow); @@ -499,41 +495,6 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, } - switch (MI.getOpcode()) { - case Hexagon::J4_cmpeqn1_f_jumpnv_nt: - case Hexagon::J4_cmpeqn1_f_jumpnv_t: - case Hexagon::J4_cmpeqn1_fp0_jump_nt: - case Hexagon::J4_cmpeqn1_fp0_jump_t: - case Hexagon::J4_cmpeqn1_fp1_jump_nt: - case Hexagon::J4_cmpeqn1_fp1_jump_t: - case Hexagon::J4_cmpeqn1_t_jumpnv_nt: - case Hexagon::J4_cmpeqn1_t_jumpnv_t: - case Hexagon::J4_cmpeqn1_tp0_jump_nt: - case Hexagon::J4_cmpeqn1_tp0_jump_t: - case Hexagon::J4_cmpeqn1_tp1_jump_nt: - case Hexagon::J4_cmpeqn1_tp1_jump_t: - case Hexagon::J4_cmpgtn1_f_jumpnv_nt: - case Hexagon::J4_cmpgtn1_f_jumpnv_t: - case Hexagon::J4_cmpgtn1_fp0_jump_nt: - case 
Hexagon::J4_cmpgtn1_fp0_jump_t: - case Hexagon::J4_cmpgtn1_fp1_jump_nt: - case Hexagon::J4_cmpgtn1_fp1_jump_t: - case Hexagon::J4_cmpgtn1_t_jumpnv_nt: - case Hexagon::J4_cmpgtn1_t_jumpnv_t: - case Hexagon::J4_cmpgtn1_tp0_jump_nt: - case Hexagon::J4_cmpgtn1_tp0_jump_t: - case Hexagon::J4_cmpgtn1_tp1_jump_nt: - case Hexagon::J4_cmpgtn1_tp1_jump_t: - MI.insert(MI.begin() + 1, - MCOperand::createExpr(MCConstantExpr::create(-1, getContext()))); - break; - case Hexagon::Y4_crswap10: - MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0)); - break; - default: - break; - } - if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) { unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI); MCOperand &MCO = MI.getOperand(OpIndex); diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index 0dbe743d13ed..6d0529fb4277 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -176,8 +176,11 @@ def UseSmallData : Predicate<"HST->useSmallData()">; def UseCabac : Predicate<"HST->useCabac()">, AssemblerPredicate<(any_of FeatureCabac)>; -def Hvx64: HwMode<"+hvx-length64b", [UseHVX64B]>; -def Hvx128: HwMode<"+hvx-length128b", [UseHVX128B]>; +def : HwModePredicateProlog<[{ + const auto *HST = static_cast<const HexagonSubtarget *>(this); +}]>; +def Hvx64: HwMode<[UseHVX64B]>; +def Hvx128: HwMode<[UseHVX128B]>; //===----------------------------------------------------------------------===// // Classes used for relation maps. 
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td index 75e87c95f2c4..f48695c6ebc0 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td @@ -38,11 +38,7 @@ class Enc_041d7b : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <5> n1; - let Inst{28-28} = n1{4-4}; - let Inst{24-23} = n1{3-2}; - let Inst{13-13} = n1{1-1}; - let Inst{8-8} = n1{0-0}; + bits <0> n1; } class Enc_046afa : OpcodeHexagon { bits <1> Mu2; @@ -244,10 +240,7 @@ class Enc_14640c : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <5> n1; - let Inst{28-28} = n1{4-4}; - let Inst{24-22} = n1{3-1}; - let Inst{13-13} = n1{0-0}; + bits <0> n1; } class Enc_14d27a : OpcodeHexagon { bits <5> II; @@ -300,11 +293,7 @@ class Enc_178717 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <6> n1; - let Inst{28-28} = n1{5-5}; - let Inst{25-23} = n1{4-2}; - let Inst{13-13} = n1{1-1}; - let Inst{8-8} = n1{0-0}; + bits <0> n1; } class Enc_179b35 : OpcodeHexagon { bits <5> Rs32; @@ -384,9 +373,7 @@ class Enc_1de724 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <4> n1; - let Inst{28-28} = n1{3-3}; - let Inst{24-22} = n1{2-0}; + bits <0> n1; } class Enc_1ef990 : OpcodeHexagon { bits <2> Pv4; @@ -772,10 +759,7 @@ class Enc_3694bd : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <3> Ns8; let Inst{18-16} = Ns8{2-0}; - bits <5> n1; - let Inst{29-29} = n1{4-4}; - let Inst{26-25} = n1{3-2}; - let Inst{23-22} = n1{1-0}; + bits <0> n1; } class Enc_372c9d : OpcodeHexagon { bits <2> Pv4; @@ -820,10 +804,7 @@ class Enc_3a2484 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <4> n1; - let Inst{28-28} = n1{3-3}; - let Inst{24-23} = n1{2-1}; - let Inst{13-13} = n1{0-0}; + bits <0> n1; } 
class Enc_3a3d62 : OpcodeHexagon { bits <5> Rs32; @@ -883,10 +864,7 @@ class Enc_3e3989 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <6> n1; - let Inst{28-28} = n1{5-5}; - let Inst{25-22} = n1{4-1}; - let Inst{8-8} = n1{0-0}; + bits <0> n1; } class Enc_3f97c8 : OpcodeHexagon { bits <6> Ii; @@ -916,9 +894,7 @@ class Enc_405228 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <3> n1; - let Inst{28-28} = n1{2-2}; - let Inst{24-23} = n1{1-0}; + bits <0> n1; } class Enc_412ff0 : OpcodeHexagon { bits <5> Rss32; @@ -1046,9 +1022,7 @@ class Enc_4aca3a : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <3> Ns8; let Inst{18-16} = Ns8{2-0}; - bits <3> n1; - let Inst{29-29} = n1{2-2}; - let Inst{26-25} = n1{1-0}; + bits <0> n1; } class Enc_4b39e4 : OpcodeHexagon { bits <3> Ii; @@ -1265,11 +1239,7 @@ class Enc_5a18b3 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <3> Ns8; let Inst{18-16} = Ns8{2-0}; - bits <5> n1; - let Inst{29-29} = n1{4-4}; - let Inst{26-25} = n1{3-2}; - let Inst{22-22} = n1{1-1}; - let Inst{13-13} = n1{0-0}; + bits <0> n1; } class Enc_5ab2be : OpcodeHexagon { bits <5> Rs32; @@ -1445,11 +1415,7 @@ class Enc_6413b6 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <3> Ns8; let Inst{18-16} = Ns8{2-0}; - bits <5> n1; - let Inst{29-29} = n1{4-4}; - let Inst{26-25} = n1{3-2}; - let Inst{23-23} = n1{1-1}; - let Inst{13-13} = n1{0-0}; + bits <0> n1; } class Enc_645d54 : OpcodeHexagon { bits <2> Ii; @@ -1490,9 +1456,7 @@ class Enc_668704 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <5> n1; - let Inst{28-28} = n1{4-4}; - let Inst{25-22} = n1{3-0}; + bits <0> n1; } class Enc_66bce1 : OpcodeHexagon { bits <11> Ii; @@ -1650,9 +1614,7 @@ class Enc_736575 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <4> n1; - let Inst{28-28} = n1{3-3}; - let Inst{25-23} = n1{2-0}; + bits <0> n1; } class 
Enc_74aef2 : OpcodeHexagon { bits <4> Ii; @@ -1718,8 +1680,7 @@ class Enc_79b8c8 : OpcodeHexagon { class Enc_7a0ea6 : OpcodeHexagon { bits <4> Rd16; let Inst{3-0} = Rd16{3-0}; - bits <1> n1; - let Inst{9-9} = n1{0-0}; + bits <0> n1; } class Enc_7b523d : OpcodeHexagon { bits <5> Vu32; @@ -1805,10 +1766,7 @@ class Enc_800e04 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <6> n1; - let Inst{28-28} = n1{5-5}; - let Inst{25-22} = n1{4-1}; - let Inst{13-13} = n1{0-0}; + bits <0> n1; } class Enc_80296d : OpcodeHexagon { bits <5> Rs32; @@ -2067,10 +2025,7 @@ class Enc_8e583a : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <5> n1; - let Inst{28-28} = n1{4-4}; - let Inst{25-23} = n1{3-1}; - let Inst{13-13} = n1{0-0}; + bits <0> n1; } class Enc_8f7633 : OpcodeHexagon { bits <5> Rs32; @@ -2361,10 +2316,7 @@ class Enc_a42857 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <5> n1; - let Inst{28-28} = n1{4-4}; - let Inst{24-22} = n1{3-1}; - let Inst{8-8} = n1{0-0}; + bits <0> n1; } class Enc_a4ef14 : OpcodeHexagon { bits <5> Rd32; @@ -2413,11 +2365,7 @@ class Enc_a6853f : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <3> Ns8; let Inst{18-16} = Ns8{2-0}; - bits <6> n1; - let Inst{29-29} = n1{5-5}; - let Inst{26-25} = n1{4-3}; - let Inst{23-22} = n1{2-1}; - let Inst{13-13} = n1{0-0}; + bits <0> n1; } class Enc_a6ce9c : OpcodeHexagon { bits <6> Ii; @@ -2593,10 +2541,7 @@ class Enc_b1e1fb : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <5> n1; - let Inst{28-28} = n1{4-4}; - let Inst{25-23} = n1{3-1}; - let Inst{8-8} = n1{0-0}; + bits <0> n1; } class Enc_b388cf : OpcodeHexagon { bits <5> Ii; @@ -2661,10 +2606,7 @@ class Enc_b78edd : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <4> n1; - let Inst{28-28} = n1{3-3}; - let Inst{24-23} = n1{2-1}; - let Inst{8-8} 
= n1{0-0}; + bits <0> n1; } class Enc_b7fad3 : OpcodeHexagon { bits <2> Pv4; @@ -2715,11 +2657,7 @@ class Enc_b909d2 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <7> n1; - let Inst{28-28} = n1{6-6}; - let Inst{25-22} = n1{5-2}; - let Inst{13-13} = n1{1-1}; - let Inst{8-8} = n1{0-0}; + bits <0> n1; } class Enc_b91167 : OpcodeHexagon { bits <2> Ii; @@ -3335,10 +3273,7 @@ class Enc_e90a15 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <3> Ns8; let Inst{18-16} = Ns8{2-0}; - bits <4> n1; - let Inst{29-29} = n1{3-3}; - let Inst{26-25} = n1{2-1}; - let Inst{22-22} = n1{0-0}; + bits <0> n1; } class Enc_e957fb : OpcodeHexagon { bits <12> Ii; @@ -3417,8 +3352,7 @@ class Enc_ee5ed0 : OpcodeHexagon { let Inst{7-4} = Rs16{3-0}; bits <4> Rd16; let Inst{3-0} = Rd16{3-0}; - bits <2> n1; - let Inst{9-8} = n1{1-0}; + bits <0> n1; } class Enc_ef601b : OpcodeHexagon { bits <4> Ii; @@ -3531,11 +3465,7 @@ class Enc_f6fe0b : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <4> Rs16; let Inst{19-16} = Rs16{3-0}; - bits <6> n1; - let Inst{28-28} = n1{5-5}; - let Inst{24-22} = n1{4-2}; - let Inst{13-13} = n1{1-1}; - let Inst{8-8} = n1{0-0}; + bits <0> n1; } class Enc_f7430e : OpcodeHexagon { bits <4> Ii; @@ -3574,10 +3504,7 @@ class Enc_f7ea77 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <3> Ns8; let Inst{18-16} = Ns8{2-0}; - bits <4> n1; - let Inst{29-29} = n1{3-3}; - let Inst{26-25} = n1{2-1}; - let Inst{13-13} = n1{0-0}; + bits <0> n1; } class Enc_f82302 : OpcodeHexagon { bits <11> Ii; @@ -3585,10 +3512,7 @@ class Enc_f82302 : OpcodeHexagon { let Inst{7-1} = Ii{8-2}; bits <3> Ns8; let Inst{18-16} = Ns8{2-0}; - bits <4> n1; - let Inst{29-29} = n1{3-3}; - let Inst{26-25} = n1{2-1}; - let Inst{23-23} = n1{0-0}; + bits <0> n1; } class Enc_f82eaf : OpcodeHexagon { bits <8> Ii; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 64bc5ca134c8..45d194e944fb 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -117,9 +117,10 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768; // Pin the vtable to this file. void HexagonInstrInfo::anchor() {} -HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST) - : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - Subtarget(ST) {} +HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST) + : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN, + Hexagon::ADJCALLSTACKUP), + Subtarget(ST) {} namespace llvm { namespace HexagonFUnits { diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index 086cb1fdd8ac..c17e5277ae2e 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -45,7 +45,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { virtual void anchor(); public: - explicit HexagonInstrInfo(HexagonSubtarget &ST); + explicit HexagonInstrInfo(const HexagonSubtarget &ST); /// TargetInstrInfo overrides. 
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 72575f2560a3..1057b88530f4 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -104,9 +105,6 @@ static cl::opt<bool> HexagonVolatileMemcpy( static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000), cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR")); -static const char *HexagonVolatileMemcpyName - = "hexagon_memcpy_forward_vp4cp4n2"; - namespace { class HexagonLoopIdiomRecognize { @@ -2246,6 +2244,11 @@ CleanupAndExit: Type *PtrTy = PointerType::get(Ctx, 0); Type *VoidTy = Type::getVoidTy(Ctx); Module *M = Func->getParent(); + + // FIXME: This should check if the call is supported + StringRef HexagonVolatileMemcpyName = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName( + RTLIB::impl_hexagon_memcpy_forward_vp4cp4n2); FunctionCallee Fn = M->getOrInsertFunction( HexagonVolatileMemcpyName, VoidTy, PtrTy, PtrTy, Int32Ty); diff --git a/llvm/lib/Target/Hexagon/HexagonOperands.td b/llvm/lib/Target/Hexagon/HexagonOperands.td index 5134626c65c7..df5d32c13a73 100644 --- a/llvm/lib/Target/Hexagon/HexagonOperands.td +++ b/llvm/lib/Target/Hexagon/HexagonOperands.td @@ -27,9 +27,15 @@ def u9_0ImmPred : PatLeaf<(i32 imm), [{ def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; let RenderMethod = "addImmOperands"; } def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; } def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; } -def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; } +def n1Const : Operand<i32> { + let ParserMatchClass = n1ConstOperand; + let 
DecoderMethod = "n1ConstDecoder"; +} def sgp10ConstOperand : AsmOperandClass { let Name = "sgp10Const"; } -def sgp10Const : Operand<i32> { let ParserMatchClass = sgp10ConstOperand; } +def sgp10Const : Operand<i32> { + let ParserMatchClass = sgp10ConstOperand; + let DecoderMethod = "sgp10ConstDecoder"; +} def bblabel : Operand<i32>; def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">; diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp index c24700b89634..9cd0636306b1 100644 --- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp +++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp @@ -47,34 +47,100 @@ LLVMInitializeLanaiDisassembler() { LanaiDisassembler::LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} -// Forward declare because the autogenerated code will reference this. -// Definition is further down. -static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); +// clang-format off +static const unsigned GPRDecoderTable[] = { + Lanai::R0, Lanai::R1, Lanai::PC, Lanai::R3, Lanai::SP, Lanai::FP, + Lanai::R6, Lanai::R7, Lanai::RV, Lanai::R9, Lanai::RR1, Lanai::RR2, + Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17, + Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23, + Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29, + Lanai::R30, Lanai::R31 +}; +// clang-format on + +DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const MCDisassembler * /*Decoder*/) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = GPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler 
*Decoder); + const MCDisassembler *Decoder) { + // RI memory values encoded using 23 bits: + // 5 bit register, 16 bit constant + unsigned Register = (Insn >> 18) & 0x1f; + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register])); + unsigned Offset = (Insn & 0xffff); + Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset))); + + return MCDisassembler::Success; +} static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + // RR memory values encoded using 20 bits: + // 5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ + unsigned Register = (Insn >> 15) & 0x1f; + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register])); + Register = (Insn >> 10) & 0x1f; + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register])); + + return MCDisassembler::Success; +} static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + // RI memory values encoded using 17 bits: + // 5 bit register, 10 bit constant + unsigned Register = (Insn >> 12) & 0x1f; + Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register])); + unsigned Offset = (Insn & 0x3ff); + Inst.addOperand(MCOperand::createImm(SignExtend32<10>(Offset))); -static DecodeStatus decodeBranch(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); + return MCDisassembler::Success; +} -static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); +static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch, + uint64_t Address, uint64_t Offset, + uint64_t Width, MCInst &MI, + const MCDisassembler *Decoder) { + return Decoder->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset, + Width, /*InstSize=*/0); +} + +static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t 
Address, + const MCDisassembler *Decoder) { + if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI, + Decoder)) + MI.addOperand(MCOperand::createImm(Insn)); + return MCDisassembler::Success; +} static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + unsigned Offset = (Insn & 0xffff); + Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset))); + + return MCDisassembler::Success; +} + +static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder) { + if (Val >= LPCC::UNKNOWN) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(Val)); + return MCDisassembler::Success; +} #include "LanaiGenDisassemblerTables.inc" @@ -157,95 +223,3 @@ LanaiDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, return MCDisassembler::Fail; } - -static const unsigned GPRDecoderTable[] = { - Lanai::R0, Lanai::R1, Lanai::PC, Lanai::R3, Lanai::SP, Lanai::FP, - Lanai::R6, Lanai::R7, Lanai::RV, Lanai::R9, Lanai::RR1, Lanai::RR2, - Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17, - Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23, - Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29, - Lanai::R30, Lanai::R31}; - -DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const MCDisassembler * /*Decoder*/) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Reg = GPRDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - // RI memory values encoded using 23 bits: - // 5 bit register, 16 bit constant - unsigned Register = (Insn >> 18) & 0x1f; - Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register])); 
- unsigned Offset = (Insn & 0xffff); - Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset))); - - return MCDisassembler::Success; -} - -static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - // RR memory values encoded using 20 bits: - // 5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ - unsigned Register = (Insn >> 15) & 0x1f; - Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register])); - Register = (Insn >> 10) & 0x1f; - Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register])); - - return MCDisassembler::Success; -} - -static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - // RI memory values encoded using 17 bits: - // 5 bit register, 10 bit constant - unsigned Register = (Insn >> 12) & 0x1f; - Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register])); - unsigned Offset = (Insn & 0x3ff); - Inst.addOperand(MCOperand::createImm(SignExtend32<10>(Offset))); - - return MCDisassembler::Success; -} - -static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch, - uint64_t Address, uint64_t Offset, - uint64_t Width, MCInst &MI, - const MCDisassembler *Decoder) { - return Decoder->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset, - Width, /*InstSize=*/0); -} - -static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder) { - if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI, - Decoder)) - MI.addOperand(MCOperand::createImm(Insn)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Offset = (Insn & 0xffff); - Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset))); - - return MCDisassembler::Success; -} - -static DecodeStatus decodePredicateOperand(MCInst &Inst, 
unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder) { - if (Val >= LPCC::UNKNOWN) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(Val)); - return MCDisassembler::Success; -}
\ No newline at end of file diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 4ca97da16cde..02ed1001cd0d 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -13,6 +13,7 @@ #include "LanaiInstrInfo.h" #include "LanaiAluCode.h" #include "LanaiCondCode.h" +#include "LanaiSubtarget.h" #include "MCTargetDesc/LanaiBaseInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -25,8 +26,8 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "LanaiGenInstrInfo.inc" -LanaiInstrInfo::LanaiInstrInfo() - : LanaiGenInstrInfo(Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP), +LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI) + : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP), RegisterInfo() {} void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h index 07b1e87dc8b2..d98276243dc3 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h @@ -22,11 +22,13 @@ namespace llvm { +class LanaiSubtarget; + class LanaiInstrInfo : public LanaiGenInstrInfo { const LanaiRegisterInfo RegisterInfo; public: - LanaiInstrInfo(); + LanaiInstrInfo(const LanaiSubtarget &STI); // getRegisterInfo - TargetInstrInfo is a superset of MRegister info. 
As // such, whenever a client has an instance of instruction info, it should diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.td b/llvm/lib/Target/Lanai/LanaiInstrInfo.td index 1d968fa391c2..e0cd79ca22ff 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.td +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.td @@ -212,7 +212,6 @@ def MemImmAsmOperand : AsmOperandClass { let ParserMethod = "parseMemoryOperand"; } def MEMi : Operand<i32> { - let MIOperandInfo = (ops i32lo21:$offset); let ParserMatchClass = MemImmAsmOperand; let PrintMethod = "printMemImmOperand"; } @@ -402,7 +401,7 @@ def : Pat<(LanaiSubbF GPR:$Rs1, i32lo16z:$imm), def : Pat<(LanaiSubbF GPR:$Rs1, i32hi16:$imm), (SUBB_F_I_HI GPR:$Rs1, i32hi16:$imm)>; -def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, 0)>; +def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, (pred 0))>; let isAsCheapAsAMove = 1, Rs1 = R0.Num, isCodeGenOnly = 1, H = 1, F = 0, isReMaterializable = 1 in diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp index 24aa8553279f..f99e88373edf 100644 --- a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp +++ b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp @@ -40,5 +40,5 @@ LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu, CodeModel::Model /*CodeModel*/, CodeGenOptLevel /*OptLevel*/) : LanaiGenSubtargetInfo(TargetTriple, Cpu, /*TuneCPU*/ Cpu, FeatureString), - FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)), - TLInfo(TM, *this) {} + InstrInfo(initializeSubtargetDependencies(Cpu, FeatureString)), + FrameLowering(*this), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.h b/llvm/lib/Target/Lanai/LanaiSubtarget.h index 0a229063ab7b..233c89e881d5 100644 --- a/llvm/lib/Target/Lanai/LanaiSubtarget.h +++ b/llvm/lib/Target/Lanai/LanaiSubtarget.h @@ -64,8 +64,8 @@ public: } private: - LanaiFrameLowering FrameLowering; LanaiInstrInfo InstrInfo; + LanaiFrameLowering FrameLowering; 
LanaiTargetLowering TLInfo; LanaiSelectionDAGInfo TSInfo; }; diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td index 39948b31fb9b..6497ff999f6f 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.td +++ b/llvm/lib/Target/LoongArch/LoongArch.td @@ -39,7 +39,7 @@ def IsLA32 "LA32 Basic Integer and Privilege Instruction Set">; defvar LA32 = DefaultMode; -def LA64 : HwMode<"+64bit", [IsLA64]>; +def LA64 : HwMode<[IsLA64]>; // Single Precision floating point def FeatureBasicF diff --git a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td index 9844163163a5..7dcf65ce2b82 100644 --- a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td +++ b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td @@ -21,3 +21,7 @@ def CSR_ILP32D_LP64D // Needed for implementation of LoongArchRegisterInfo::getNoPreservedMask() def CSR_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_MostRegs : CalleeSavedRegs<(add CSR_ILP32S_LP64S, + (sequence "R%u", 4, 11), + (sequence "R%u", 16, 19))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index 36c3011be2b9..c45975431d83 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -10,6 +10,9 @@ // //===----------------------------------------------------------------------===// +def NotBoolXor : PatFrags<(ops node:$val), + [(xor node:$val, -1), (xor node:$val, 1)]>; + //===----------------------------------------------------------------------===// // LoongArch specific DAG Nodes. 
//===----------------------------------------------------------------------===// @@ -22,6 +25,9 @@ def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; def SDT_LoongArchFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; +// ISD::BRCOND is custom-lowered to LoongArchISD::BRCOND for floating-point +// comparisons to prevent recursive lowering. +def loongarch_brcond : SDNode<"LoongArchISD::BRCOND", SDTBrcond, [SDNPHasChain]>; def loongarch_movgr2fr_w_la64 : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>; def loongarch_movfr2gr_s_la64 @@ -208,16 +214,18 @@ def : PatFPSetcc<SETUO, FCMP_CUN_S, FPR32>; def : PatFPSetcc<SETLT, FCMP_CLT_S, FPR32>; multiclass PatFPBrcond<CondCode cc, LAInst CmpInst, RegisterClass RegTy> { - def : Pat<(brcond (xor (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), -1), - bb:$imm21), + def : Pat<(loongarch_brcond (NotBoolXor (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc))), + bb:$imm21), (BCEQZ (CmpInst RegTy:$fj, RegTy:$fk), bb:$imm21)>; - def : Pat<(brcond (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), bb:$imm21), + def : Pat<(loongarch_brcond (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), bb:$imm21), (BCNEZ (CmpInst RegTy:$fj, RegTy:$fk), bb:$imm21)>; } defm : PatFPBrcond<SETOEQ, FCMP_CEQ_S, FPR32>; +defm : PatFPBrcond<SETEQ , FCMP_CEQ_S, FPR32>; defm : PatFPBrcond<SETOLT, FCMP_CLT_S, FPR32>; defm : PatFPBrcond<SETOLE, FCMP_CLE_S, FPR32>; +defm : PatFPBrcond<SETLE, FCMP_CLE_S, FPR32>; defm : PatFPBrcond<SETONE, FCMP_CNE_S, FPR32>; defm : PatFPBrcond<SETO, FCMP_COR_S, FPR32>; defm : PatFPBrcond<SETUEQ, FCMP_CUEQ_S, FPR32>; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index 616640152c8d..965ad8a0a35c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -184,8 +184,10 @@ def : 
PatFPSetcc<SETUO, FCMP_CUN_D, FPR64>; def : PatFPSetcc<SETLT, FCMP_CLT_D, FPR64>; defm : PatFPBrcond<SETOEQ, FCMP_CEQ_D, FPR64>; +defm : PatFPBrcond<SETEQ, FCMP_CEQ_D, FPR64>; defm : PatFPBrcond<SETOLT, FCMP_CLT_D, FPR64>; defm : PatFPBrcond<SETOLE, FCMP_CLE_D, FPR64>; +defm : PatFPBrcond<SETLE, FCMP_CLE_D, FPR64>; defm : PatFPBrcond<SETONE, FCMP_CNE_D, FPR64>; defm : PatFPBrcond<SETO, FCMP_COR_D, FPR64>; defm : PatFPBrcond<SETUEQ, FCMP_CUEQ_D, FPR64>; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp index 71d0263fe376..07e722b9a659 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -114,7 +114,7 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { unsigned SplatBitSize; bool HasAnyUndefs; unsigned Op; - EVT ViaVecTy; + EVT ResTy = BVN->getValueType(0); bool Is128Vec = BVN->getValueType(0).is128BitVector(); bool Is256Vec = BVN->getValueType(0).is256BitVector(); @@ -129,28 +129,25 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { break; case 8: Op = Is256Vec ? LoongArch::PseudoXVREPLI_B : LoongArch::PseudoVREPLI_B; - ViaVecTy = Is256Vec ? MVT::v32i8 : MVT::v16i8; break; case 16: Op = Is256Vec ? LoongArch::PseudoXVREPLI_H : LoongArch::PseudoVREPLI_H; - ViaVecTy = Is256Vec ? MVT::v16i16 : MVT::v8i16; break; case 32: Op = Is256Vec ? LoongArch::PseudoXVREPLI_W : LoongArch::PseudoVREPLI_W; - ViaVecTy = Is256Vec ? MVT::v8i32 : MVT::v4i32; break; case 64: Op = Is256Vec ? LoongArch::PseudoXVREPLI_D : LoongArch::PseudoVREPLI_D; - ViaVecTy = Is256Vec ? MVT::v4i64 : MVT::v2i64; break; } SDNode *Res; // If we have a signed 10 bit integer, we can splat it directly. 
if (SplatValue.isSignedIntN(10)) { - SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL, - ViaVecTy.getVectorElementType()); - Res = CurDAG->getMachineNode(Op, DL, ViaVecTy, Imm); + EVT EleType = ResTy.getVectorElementType(); + APInt Val = SplatValue.sextOrTrunc(EleType.getSizeInBits()); + SDValue Imm = CurDAG->getTargetConstant(Val, DL, EleType); + Res = CurDAG->getMachineNode(Op, DL, ResTy, Imm); ReplaceNode(Node, Res); return; } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 5b2d185594f4..634914d3b3fd 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -127,6 +127,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BR_CC, GRLenVT, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::SELECT_CC, GRLenVT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand); @@ -340,6 +341,14 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {MVT::v16i8, MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v8i16, MVT::v4i16, MVT::v2i16, MVT::v4i32, MVT::v2i32, MVT::v2i64}) { setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); } } @@ -377,6 +386,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, 
Custom); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); } for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32}) setOperationAction(ISD::BITREVERSE, VT, Custom); @@ -413,6 +423,11 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BITCAST); } + // Set DAG combine for 'LASX' feature. + + if (Subtarget.hasExtLASX()) + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + // Compute derived properties from the register classes. computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -514,6 +529,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, return lowerPREFETCH(Op, DAG); case ISD::SELECT: return lowerSELECT(Op, DAG); + case ISD::BRCOND: + return lowerBRCOND(Op, DAG); case ISD::FP_TO_FP16: return lowerFP_TO_FP16(Op, DAG); case ISD::FP16_TO_FP: @@ -522,10 +539,109 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, return lowerFP_TO_BF16(Op, DAG); case ISD::BF16_TO_FP: return lowerBF16_TO_FP(Op, DAG); + case ISD::VECREDUCE_ADD: + return lowerVECREDUCE_ADD(Op, DAG); + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + return lowerVECREDUCE(Op, DAG); } return SDValue(); } +// Lower vecreduce_add using vhaddw instructions. 
+// For Example: +// call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) +// can be lowered to: +// VHADDW_D_W vr0, vr0, vr0 +// VHADDW_Q_D vr0, vr0, vr0 +// VPICKVE2GR_D a0, vr0, 0 +// ADDI_W a0, a0, 0 +SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + MVT OpVT = Op.getSimpleValueType(); + SDValue Val = Op.getOperand(0); + + unsigned NumEles = Val.getSimpleValueType().getVectorNumElements(); + unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits(); + + unsigned LegalVecSize = 128; + bool isLASX256Vector = + Subtarget.hasExtLASX() && Val.getValueSizeInBits() == 256; + + // Ensure operand type legal or enable it legal. + while (!isTypeLegal(Val.getSimpleValueType())) { + Val = DAG.WidenVector(Val, DL); + } + + // NumEles is designed for iterations count, v4i32 for LSX + // and v8i32 for LASX should have the same count. + if (isLASX256Vector) { + NumEles /= 2; + LegalVecSize = 256; + } + + for (unsigned i = 1; i < NumEles; i *= 2, EleBits *= 2) { + MVT IntTy = MVT::getIntegerVT(EleBits); + MVT VecTy = MVT::getVectorVT(IntTy, LegalVecSize / EleBits); + Val = DAG.getNode(LoongArchISD::VHADDW, DL, VecTy, Val, Val); + } + + if (isLASX256Vector) { + SDValue Tmp = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, Val, + DAG.getConstant(2, DL, MVT::i64)); + Val = DAG.getNode(ISD::ADD, DL, MVT::v4i64, Tmp, Val); + } + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val, + DAG.getConstant(0, DL, Subtarget.getGRLenVT())); +} + +// Lower vecreduce_and/or/xor/[s/u]max/[s/u]min. +// For Example: +// call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) +// can be lowered to: +// VBSRL_V vr1, vr0, 8 +// VMAX_W vr0, vr1, vr0 +// VBSRL_V vr1, vr0, 4 +// VMAX_W vr0, vr1, vr0 +// VPICKVE2GR_W a0, vr0, 0 +// For 256 bit vector, it is illegal and will be spilt into +// two 128 bit vector by default then processed by this. 
+SDValue LoongArchTargetLowering::lowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + + MVT OpVT = Op.getSimpleValueType(); + SDValue Val = Op.getOperand(0); + + unsigned NumEles = Val.getSimpleValueType().getVectorNumElements(); + unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits(); + + // Ensure operand type legal or enable it legal. + while (!isTypeLegal(Val.getSimpleValueType())) { + Val = DAG.WidenVector(Val, DL); + } + + unsigned Opcode = ISD::getVecReduceBaseOpcode(Op.getOpcode()); + MVT VecTy = Val.getSimpleValueType(); + + for (int i = NumEles; i > 1; i /= 2) { + SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, MVT::i64); + SDValue Tmp = DAG.getNode(LoongArchISD::VBSRL, DL, VecTy, Val, ShiftAmt); + Val = DAG.getNode(Opcode, DL, VecTy, Tmp, Val); + } + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val, + DAG.getConstant(0, DL, Subtarget.getGRLenVT())); +} + SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { unsigned IsData = Op.getConstantOperandVal(4); @@ -859,6 +975,35 @@ SDValue LoongArchTargetLowering::lowerSELECT(SDValue Op, return DAG.getNode(LoongArchISD::SELECT_CC, DL, VT, Ops); } +SDValue LoongArchTargetLowering::lowerBRCOND(SDValue Op, + SelectionDAG &DAG) const { + SDValue CondV = Op.getOperand(1); + SDLoc DL(Op); + MVT GRLenVT = Subtarget.getGRLenVT(); + + if (CondV.getOpcode() == ISD::SETCC) { + if (CondV.getOperand(0).getValueType() == GRLenVT) { + SDValue LHS = CondV.getOperand(0); + SDValue RHS = CondV.getOperand(1); + ISD::CondCode CCVal = cast<CondCodeSDNode>(CondV.getOperand(2))->get(); + + translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG); + + SDValue TargetCC = DAG.getCondCode(CCVal); + return DAG.getNode(LoongArchISD::BR_CC, DL, Op.getValueType(), + Op.getOperand(0), LHS, RHS, TargetCC, + Op.getOperand(2)); + } else if (CondV.getOperand(0).getValueType().isFloatingPoint()) { + return DAG.getNode(LoongArchISD::BRCOND, DL, Op.getValueType(), + 
Op.getOperand(0), CondV, Op.getOperand(2)); + } + } + + return DAG.getNode(LoongArchISD::BR_CC, DL, Op.getValueType(), + Op.getOperand(0), CondV, DAG.getConstant(0, DL, GRLenVT), + DAG.getCondCode(ISD::SETNE), Op.getOperand(2)); +} + SDValue LoongArchTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { @@ -1031,6 +1176,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget, const APInt &Zeroable) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); @@ -1057,7 +1203,7 @@ static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask, "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i64)); + DAG.getConstant(ShiftAmt, DL, Subtarget.getGRLenVT())); return DAG.getBitcast(VT, V); } @@ -1226,10 +1372,10 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, /// (VBSRL_V $v1, $v1, 8) /// (VBSLL_V $v0, $v0, 8) /// (VOR_V $v0, $V0, $v1) -static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL, - ArrayRef<int> Mask, MVT VT, - SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { SDValue Lo = V1, Hi = V2; int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); @@ -1242,11 +1388,12 @@ static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL, int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; + MVT GRLenVT = Subtarget.getGRLenVT(); SDValue LoShift = DAG.getNode(LoongArchISD::VBSLL, DL, ByteVT, Lo, - DAG.getConstant(LoByteShift, DL, MVT::i64)); + DAG.getConstant(LoByteShift, DL, GRLenVT)); SDValue 
HiShift = DAG.getNode(LoongArchISD::VBSRL, DL, ByteVT, Hi, - DAG.getConstant(HiByteShift, DL, MVT::i64)); + DAG.getConstant(HiByteShift, DL, GRLenVT)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LoShift, HiShift)); } @@ -1351,9 +1498,10 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL, /// /// When undef's appear in the mask they are treated as if they were whatever /// value is necessary in order to fit the above form. -static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { int SplatIndex = -1; for (const auto &M : Mask) { if (M != -1) { @@ -1369,7 +1517,7 @@ static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) { APInt Imm(64, SplatIndex); return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1, - DAG.getConstant(Imm, DL, MVT::i64)); + DAG.getConstant(Imm, DL, Subtarget.getGRLenVT())); } return SDValue(); @@ -1393,9 +1541,10 @@ static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, /// (VSHUF4I_H $v0, $v1, 27) /// where the 27 comes from: /// 3 + (2 << 2) + (1 << 4) + (0 << 6) -static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { unsigned SubVecSize = 4; if (VT == MVT::v2f64 || VT == MVT::v2i64) @@ -1437,13 +1586,15 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, Imm |= M & 0x3; } + MVT GRLenVT = Subtarget.getGRLenVT(); + // Return vshuf4i.d if (VT == 
MVT::v2f64 || VT == MVT::v2i64) return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1, V2, - DAG.getConstant(Imm, DL, MVT::i64)); + DAG.getConstant(Imm, DL, GRLenVT)); return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1, - DAG.getConstant(Imm, DL, MVT::i64)); + DAG.getConstant(Imm, DL, GRLenVT)); } /// Lower VECTOR_SHUFFLE into VPACKEV (if possible). @@ -1723,7 +1874,8 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask, /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, - SDValue V1, SDValue V2, SelectionDAG &DAG) { + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 || VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 || VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) && @@ -1741,9 +1893,11 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue Result; // TODO: Add more comparison patterns. 
if (V2.isUndef()) { - if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG, + Subtarget))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG))) + if ((Result = + lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; // TODO: This comment may be enabled in the future to better match the @@ -1766,15 +1920,17 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG))) return Result; if ((VT.SimpleTy == MVT::v2i64 || VT.SimpleTy == MVT::v2f64) && - (Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG))) + (Result = + lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG, Zeroable))) return Result; - if ((Result = - lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable))) + if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget, + Zeroable))) return Result; - if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG, + Subtarget))) return Result; if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG)) return NewShuffle; @@ -1791,10 +1947,10 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, /// /// When undef's appear in the mask they are treated as if they were whatever /// value is necessary in order to fit the above form. 
-static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, - ArrayRef<int> Mask, MVT VT, - SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { int SplatIndex = -1; for (const auto &M : Mask) { if (M != -1) { @@ -1816,21 +1972,64 @@ static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, 0)) { APInt Imm(64, SplatIndex); return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1, - DAG.getConstant(Imm, DL, MVT::i64)); + DAG.getConstant(Imm, DL, Subtarget.getGRLenVT())); } return SDValue(); } /// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible). -static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { // When the size is less than or equal to 4, lower cost instructions may be // used. if (Mask.size() <= 4) return SDValue(); - return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG); + return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget); +} + +/// Lower VECTOR_SHUFFLE into XVPERM (if possible). +static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + SelectionDAG &DAG) { + // LoongArch LASX only have XVPERM_W. 
+ if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32)) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfSize = NumElts / 2; + bool FrontLo = true, FrontHi = true; + bool BackLo = true, BackHi = true; + + auto inRange = [](int val, int low, int high) { + return (val == -1) || (val >= low && val < high); + }; + + for (unsigned i = 0; i < HalfSize; ++i) { + int Fronti = Mask[i]; + int Backi = Mask[i + HalfSize]; + + FrontLo &= inRange(Fronti, 0, HalfSize); + FrontHi &= inRange(Fronti, HalfSize, NumElts); + BackLo &= inRange(Backi, 0, HalfSize); + BackHi &= inRange(Backi, HalfSize, NumElts); + } + + // If both the lower and upper 128-bit parts access only one half of the + // vector (either lower or upper), avoid using xvperm.w. The latency of + // xvperm.w(3) is higher than using xvshuf(1) and xvori(1). + if ((FrontLo || FrontHi) && (BackLo || BackHi)) + return SDValue(); + + SmallVector<SDValue, 8> Masks; + for (unsigned i = 0; i < NumElts; ++i) + Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(MVT::i64) + : DAG.getConstant(Mask[i], DL, MVT::i64)); + SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks); + + return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec); } /// Lower VECTOR_SHUFFLE into XVPACKEV (if possible). @@ -2060,15 +2259,15 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask, /// cases need to be converted to it for processing. 
/// /// This function may modify V1, V2 and Mask -static void canonicalizeShuffleVectorByLane(const SDLoc &DL, - MutableArrayRef<int> Mask, MVT VT, - SDValue &V1, SDValue &V2, - SelectionDAG &DAG) { +static void canonicalizeShuffleVectorByLane( + const SDLoc &DL, MutableArrayRef<int> Mask, MVT VT, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) { enum HalfMaskType { HighLaneTy, LowLaneTy, None }; int MaskSize = Mask.size(); int HalfSize = Mask.size() / 2; + MVT GRLenVT = Subtarget.getGRLenVT(); HalfMaskType preMask = None, postMask = None; @@ -2106,13 +2305,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL, if (preMask == LowLaneTy && postMask == HighLaneTy) { V1 = DAG.getBitcast(MVT::v4i64, V1); V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, - DAG.getConstant(0b01001110, DL, MVT::i64)); + DAG.getConstant(0b01001110, DL, GRLenVT)); V1 = DAG.getBitcast(VT, V1); if (!V2.isUndef()) { V2 = DAG.getBitcast(MVT::v4i64, V2); V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, - DAG.getConstant(0b01001110, DL, MVT::i64)); + DAG.getConstant(0b01001110, DL, GRLenVT)); V2 = DAG.getBitcast(VT, V2); } @@ -2125,13 +2324,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL, } else if (preMask == LowLaneTy && postMask == LowLaneTy) { V1 = DAG.getBitcast(MVT::v4i64, V1); V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, - DAG.getConstant(0b11101110, DL, MVT::i64)); + DAG.getConstant(0b11101110, DL, GRLenVT)); V1 = DAG.getBitcast(VT, V1); if (!V2.isUndef()) { V2 = DAG.getBitcast(MVT::v4i64, V2); V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, - DAG.getConstant(0b11101110, DL, MVT::i64)); + DAG.getConstant(0b11101110, DL, GRLenVT)); V2 = DAG.getBitcast(VT, V2); } @@ -2141,13 +2340,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL, } else if (preMask == HighLaneTy && postMask == HighLaneTy) { V1 = DAG.getBitcast(MVT::v4i64, V1); V1 = 
DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, - DAG.getConstant(0b01000100, DL, MVT::i64)); + DAG.getConstant(0b01000100, DL, GRLenVT)); V1 = DAG.getBitcast(VT, V1); if (!V2.isUndef()) { V2 = DAG.getBitcast(MVT::v4i64, V2); V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, - DAG.getConstant(0b01000100, DL, MVT::i64)); + DAG.getConstant(0b01000100, DL, GRLenVT)); V2 = DAG.getBitcast(VT, V2); } @@ -2209,7 +2408,8 @@ static SDValue lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(const SDLoc &DL, /// This routine breaks down the specific type of 256-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, - SDValue V1, SDValue V2, SelectionDAG &DAG) { + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 || VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 || VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) && @@ -2223,7 +2423,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, // canonicalize non cross-lane shuffle vector SmallVector<int> NewMask(Mask); - canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG); + canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget); APInt KnownUndef, KnownZero; computeZeroableShuffleElements(NewMask, V1, V2, KnownUndef, KnownZero); @@ -2232,9 +2432,13 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue Result; // TODO: Add more comparison patterns. 
if (V2.isUndef()) { - if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG, + Subtarget))) + return Result; + if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG, + Subtarget))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG))) return Result; if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT, V1, V2, DAG))) @@ -2259,10 +2463,11 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, return Result; if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG))) return Result; - if ((Result = - lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, Zeroable))) + if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, + Subtarget, Zeroable))) return Result; - if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG, + Subtarget))) return Result; if (SDValue NewShuffle = widenShuffleMask(DL, NewMask, VT, V1, V2, DAG)) return NewShuffle; @@ -2314,10 +2519,10 @@ SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) - return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG); + return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG, Subtarget); if (VT.is256BitVector()) - return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG); + return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG, Subtarget); return SDValue(); } @@ -2414,11 +2619,14 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp, } // make sure that this load is valid and only has one user. 
- if (!IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode())) + if (!IsIdeneity || !IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode())) return SDValue(); - if (IsIdeneity) { - auto *LN = cast<LoadSDNode>(IdentitySrc); + auto *LN = cast<LoadSDNode>(IdentitySrc); + auto ExtType = LN->getExtensionType(); + + if ((ExtType == ISD::EXTLOAD || ExtType == ISD::NON_EXTLOAD) && + VT.getScalarSizeInBits() == LN->getMemoryVT().getScalarSizeInBits()) { SDVTList Tys = LN->isIndexed() ? DAG.getVTList(VT, LN->getBasePtr().getValueType(), MVT::Other) @@ -2461,6 +2669,16 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, SplatBitSize != 64) return SDValue(); + if (SplatBitSize == 64 && !Subtarget.is64Bit()) { + // We can only handle 64-bit elements that are within + // the signed 32-bit range on 32-bit targets. + if (!SplatValue.isSignedIntN(32)) + return SDValue(); + if ((Is128Vec && ResTy == MVT::v4i32) || + (Is256Vec && ResTy == MVT::v8i32)) + return Op; + } + EVT ViaVecTy; switch (SplatBitSize) { @@ -2609,14 +2827,58 @@ SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op, SDValue LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - EVT VecTy = Op->getOperand(0)->getValueType(0); + MVT EltVT = Op.getSimpleValueType(); + SDValue Vec = Op->getOperand(0); + EVT VecTy = Vec->getValueType(0); SDValue Idx = Op->getOperand(1); - unsigned NumElts = VecTy.getVectorNumElements(); + SDLoc DL(Op); + MVT GRLenVT = Subtarget.getGRLenVT(); + + assert(VecTy.is256BitVector() && "Unexpected EXTRACT_VECTOR_ELT vector type"); - if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts) + if (isa<ConstantSDNode>(Idx)) return Op; - return SDValue(); + switch (VecTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("Unexpected type"); + case MVT::v32i8: + case MVT::v16i16: + case MVT::v4i64: + case MVT::v4f64: { + // Extract the high half subvector and place it to the low half of a new + // vector. 
It doesn't matter what the high half of the new vector is. + EVT HalfTy = VecTy.getHalfNumVectorElementsVT(*DAG.getContext()); + SDValue VecHi = + DAG.getExtractSubvector(DL, HalfTy, Vec, HalfTy.getVectorNumElements()); + SDValue TmpVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecTy, DAG.getUNDEF(VecTy), + VecHi, DAG.getConstant(0, DL, GRLenVT)); + + // Shuffle the origin Vec and the TmpVec using MaskVec, the lowest element + // of MaskVec is Idx, the rest do not matter. ResVec[0] will hold the + // desired element. + SDValue IdxCp = + DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, Idx); + SDValue IdxVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f32, IdxCp); + SDValue MaskVec = + DAG.getBitcast((VecTy == MVT::v4f64) ? MVT::v4i64 : VecTy, IdxVec); + SDValue ResVec = + DAG.getNode(LoongArchISD::VSHUF, DL, VecTy, MaskVec, TmpVec, Vec); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ResVec, + DAG.getConstant(0, DL, GRLenVT)); + } + case MVT::v8i32: + case MVT::v8f32: { + SDValue SplatIdx = DAG.getSplatBuildVector(MVT::v8i32, DL, Idx); + SDValue SplatValue = + DAG.getNode(LoongArchISD::XVPERM, DL, VecTy, Vec, SplatIdx); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SplatValue, + DAG.getConstant(0, DL, GRLenVT)); + } + } } SDValue @@ -4740,13 +5002,29 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG, UseLASX = true; break; }; - if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX())) - return SDValue(); Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); - Opc = UseLASX ? 
LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; - SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src); + SDValue V; + if (!Subtarget.has32S() || !Subtarget.hasExtLASX()) { + if (Src.getSimpleValueType() == MVT::v32i8) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Src, DL); + Lo = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Lo); + Hi = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Hi); + Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, + DAG.getConstant(16, DL, MVT::i8)); + V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); + } else if (UseLASX) { + return SDValue(); + } + } + + if (!V) { + Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; + V = DAG.getNode(Opc, DL, MVT::i64, Src); + } + EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); V = DAG.getZExtOrTrunc(V, DL, T); return DAG.getBitcast(VT, V); @@ -5154,6 +5432,145 @@ static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG, Src.getOperand(0)); } +// Perform common combines for BR_CC and SELECT_CC conditions. +static bool combine_CC(SDValue &LHS, SDValue &RHS, SDValue &CC, const SDLoc &DL, + SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) { + ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get(); + + // As far as arithmetic right shift always saves the sign, + // shift can be omitted. + // Fold setlt (sra X, N), 0 -> setlt X, 0 and + // setge (sra X, N), 0 -> setge X, 0 + if (isNullConstant(RHS) && (CCVal == ISD::SETGE || CCVal == ISD::SETLT) && + LHS.getOpcode() == ISD::SRA) { + LHS = LHS.getOperand(0); + return true; + } + + if (!ISD::isIntEqualitySetCC(CCVal)) + return false; + + // Fold ((setlt X, Y), 0, ne) -> (X, Y, lt) + // Sometimes the setcc is introduced after br_cc/select_cc has been formed. + if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) && + LHS.getOperand(0).getValueType() == Subtarget.getGRLenVT()) { + // If we're looking for eq 0 instead of ne 0, we need to invert the + // condition. 
+ bool Invert = CCVal == ISD::SETEQ; + CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); + if (Invert) + CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); + + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG); + + CC = DAG.getCondCode(CCVal); + return true; + } + + // Fold ((srl (and X, 1<<C), C), 0, eq/ne) -> ((shl X, GRLen-1-C), 0, ge/lt) + if (isNullConstant(RHS) && LHS.getOpcode() == ISD::SRL && LHS.hasOneUse() && + LHS.getOperand(1).getOpcode() == ISD::Constant) { + SDValue LHS0 = LHS.getOperand(0); + if (LHS0.getOpcode() == ISD::AND && + LHS0.getOperand(1).getOpcode() == ISD::Constant) { + uint64_t Mask = LHS0.getConstantOperandVal(1); + uint64_t ShAmt = LHS.getConstantOperandVal(1); + if (isPowerOf2_64(Mask) && Log2_64(Mask) == ShAmt) { + CCVal = CCVal == ISD::SETEQ ? ISD::SETGE : ISD::SETLT; + CC = DAG.getCondCode(CCVal); + + ShAmt = LHS.getValueSizeInBits() - 1 - ShAmt; + LHS = LHS0.getOperand(0); + if (ShAmt != 0) + LHS = + DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS0.getOperand(0), + DAG.getConstant(ShAmt, DL, LHS.getValueType())); + return true; + } + } + } + + // (X, 1, setne) -> (X, 0, seteq) if we can prove X is 0/1. + // This can occur when legalizing some floating point comparisons. 
+ APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1); + if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) { + CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); + CC = DAG.getCondCode(CCVal); + RHS = DAG.getConstant(0, DL, LHS.getValueType()); + return true; + } + + return false; +} + +static SDValue performBR_CCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + SDValue CC = N->getOperand(3); + SDLoc DL(N); + + if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget)) + return DAG.getNode(LoongArchISD::BR_CC, DL, N->getValueType(0), + N->getOperand(0), LHS, RHS, CC, N->getOperand(4)); + + return SDValue(); +} + +static SDValue performSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + // Fold trivial selects and sign-test selects; else canonicalize the CC. + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue CC = N->getOperand(2); + ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get(); + SDValue TrueV = N->getOperand(3); + SDValue FalseV = N->getOperand(4); + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // If the True and False values are the same, we don't need a select_cc. + if (TrueV == FalseV) + return TrueV; + + // (select (x < 0), y, z) -> x >> (GRLEN - 1) & (y - z) + z + // (select (x >= 0), y, z) -> x >> (GRLEN - 1) & (z - y) + y + if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV) && + isNullConstant(RHS) && + (CCVal == ISD::CondCode::SETLT || CCVal == ISD::CondCode::SETGE)) { + if (CCVal == ISD::CondCode::SETGE) + std::swap(TrueV, FalseV); + + int64_t TrueSImm = cast<ConstantSDNode>(TrueV)->getSExtValue(); + int64_t FalseSImm = cast<ConstantSDNode>(FalseV)->getSExtValue(); + // Only handle simm12; a value outside this range is treated as a + // register operand. 
+ if (isInt<12>(TrueSImm) && isInt<12>(FalseSImm) && + isInt<12>(TrueSImm - FalseSImm)) { + SDValue SRA = + DAG.getNode(ISD::SRA, DL, VT, LHS, + DAG.getConstant(Subtarget.getGRLen() - 1, DL, VT)); + SDValue AND = + DAG.getNode(ISD::AND, DL, VT, SRA, + DAG.getSignedConstant(TrueSImm - FalseSImm, DL, VT)); + return DAG.getNode(ISD::ADD, DL, VT, AND, FalseV); + } + + if (CCVal == ISD::CondCode::SETGE) + std::swap(TrueV, FalseV); + } + + if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget)) + return DAG.getNode(LoongArchISD::SELECT_CC, DL, N->getValueType(0), + {LHS, RHS, CC, TrueV, FalseV}); + + return SDValue(); +} + template <unsigned N> static SDValue legalizeIntrinsicImmArg(SDNode *Node, unsigned ImmOp, SelectionDAG &DAG, @@ -5828,6 +6245,42 @@ performSPLIT_PAIR_F64Combine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue +performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + MVT EltVT = N->getSimpleValueType(0); + SDValue Vec = N->getOperand(0); + EVT VecTy = Vec->getValueType(0); + SDValue Idx = N->getOperand(1); + unsigned IdxOp = Idx.getOpcode(); + SDLoc DL(N); + + if (!VecTy.is256BitVector() || isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Combine: + // t2 = truncate t1 + // t3 = {zero/sign/any}_extend t2 + // t4 = extract_vector_elt t0, t3 + // to: + // t4 = extract_vector_elt t0, t1 + if (IdxOp == ISD::ZERO_EXTEND || IdxOp == ISD::SIGN_EXTEND || + IdxOp == ISD::ANY_EXTEND) { + SDValue IdxOrig = Idx.getOperand(0); + if (!(IdxOrig.getOpcode() == ISD::TRUNCATE)) + return SDValue(); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec, + IdxOrig.getOperand(0)); + } + + return SDValue(); +} + SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -5846,6 +6299,10 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode 
*N, return performBITCASTCombine(N, DAG, DCI, Subtarget); case LoongArchISD::BITREV_W: return performBITREV_WCombine(N, DAG, DCI, Subtarget); + case LoongArchISD::BR_CC: + return performBR_CCCombine(N, DAG, DCI, Subtarget); + case LoongArchISD::SELECT_CC: + return performSELECT_CCCombine(N, DAG, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return performINTRINSIC_WO_CHAINCombine(N, DAG, DCI, Subtarget); case LoongArchISD::MOVGR2FR_W_LA64: @@ -5857,6 +6314,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, return performVMSKLTZCombine(N, DAG, DCI, Subtarget); case LoongArchISD::SPLIT_PAIR_F64: return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget); + case ISD::EXTRACT_VECTOR_ELT: + return performEXTRACT_VECTOR_ELTCombine(N, DAG, DCI, Subtarget); } return SDValue(); } @@ -6575,6 +7034,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TAIL_MEDIUM) NODE_NAME_CASE(TAIL_LARGE) NODE_NAME_CASE(SELECT_CC) + NODE_NAME_CASE(BR_CC) + NODE_NAME_CASE(BRCOND) NODE_NAME_CASE(SLL_W) NODE_NAME_CASE(SRA_W) NODE_NAME_CASE(SRL_W) @@ -6637,6 +7098,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VREPLVEI) NODE_NAME_CASE(VREPLGR2VR) NODE_NAME_CASE(XVPERMI) + NODE_NAME_CASE(XVPERM) NODE_NAME_CASE(VPICK_SEXT_ELT) NODE_NAME_CASE(VPICK_ZEXT_ELT) NODE_NAME_CASE(VREPLVE) @@ -6659,6 +7121,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(XVMSKGEZ) NODE_NAME_CASE(XVMSKEQZ) NODE_NAME_CASE(XVMSKNEZ) + NODE_NAME_CASE(VHADDW) } #undef NODE_NAME_CASE return nullptr; @@ -7132,6 +7595,7 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::Fast: + case CallingConv::PreserveMost: break; case CallingConv::GHC: if (!MF.getSubtarget().hasFeature(LoongArch::FeatureBasicF) || @@ -7893,7 +8357,7 @@ 
LoongArchTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (Size < 32 && (AI->getOperation() == AtomicRMWInst::And || AI->getOperation() == AtomicRMWInst::Or || AI->getOperation() == AtomicRMWInst::Xor)) - return AtomicExpansionKind::Expand; + return AtomicExpansionKind::CustomExpand; if (AI->getOperation() == AtomicRMWInst::Nand || Size < 32) return AtomicExpansionKind::CmpXChg; } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index f79ba7450cc3..9d14934a9d36 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -37,6 +37,10 @@ enum NodeType : unsigned { // Select SELECT_CC, + // Branch + BR_CC, + BRCOND, + // 32-bit shifts, directly matching the semantics of the named LoongArch // instructions. SLL_W, @@ -141,6 +145,7 @@ enum NodeType : unsigned { VREPLVEI, VREPLGR2VR, XVPERMI, + XVPERM, // Extended vector element extraction VPICK_SEXT_ELT, @@ -177,6 +182,9 @@ enum NodeType : unsigned { XVMSKEQZ, XVMSKNEZ, + // Vector Horizontal Addition with Widening + VHADDW + // Intrinsic operations end ============================================= }; } // end namespace LoongArchISD @@ -382,10 +390,13 @@ private: SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; diff --git 
a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 26d36f1c5058..c89212dae72d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -25,8 +25,8 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "LoongArchGenInstrInfo.inc" -LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI) - : LoongArchGenInstrInfo(LoongArch::ADJCALLSTACKDOWN, +LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI) + : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN, LoongArch::ADJCALLSTACKUP), STI(STI) {} diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index 63b7112b8b40..f25958a32bec 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -25,7 +25,7 @@ class LoongArchSubtarget; class LoongArchInstrInfo : public LoongArchGenInstrInfo { public: - explicit LoongArchInstrInfo(LoongArchSubtarget &STI); + explicit LoongArchInstrInfo(const LoongArchSubtarget &STI); MCInst getNop() const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 2b94e65cac0e..20ccc622f58d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -31,6 +31,10 @@ def SDT_LoongArchSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>, SDTCisSameAs<0, 4>, SDTCisSameAs<4, 5>]>; +def SDT_LoongArchBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>, + SDTCisVT<2, OtherVT>, + SDTCisVT<3, OtherVT>]>; + def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [ SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>, SDTCisSameAs<3, 4> @@ -94,6 +98,8 @@ def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; def loongarch_selectcc : 
SDNode<"LoongArchISD::SELECT_CC", SDT_LoongArchSelectCC>; +def loongarch_brcc : SDNode<"LoongArchISD::BR_CC", SDT_LoongArchBrCC, + [SDNPHasChain]>; def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>; def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>; def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>; @@ -1537,47 +1543,29 @@ def : Pat<(select GPR:$cond, GPR:$t, GPR:$f), /// Branches and jumps -class BccPat<PatFrag CondOp, LAInst Inst> - : Pat<(brcond (GRLenVT (CondOp GPR:$rj, GPR:$rd)), bb:$imm16), - (Inst GPR:$rj, GPR:$rd, bb:$imm16)>; - -def : BccPat<seteq, BEQ>; -def : BccPat<setne, BNE>; -def : BccPat<setlt, BLT>; -def : BccPat<setge, BGE>; -def : BccPat<setult, BLTU>; -def : BccPat<setuge, BGEU>; - -class BccSwapPat<PatFrag CondOp, LAInst InstBcc> - : Pat<(brcond (GRLenVT (CondOp GPR:$rd, GPR:$rj)), bb:$imm16), - (InstBcc GPR:$rj, GPR:$rd, bb:$imm16)>; - -// Condition codes that don't have matching LoongArch branch instructions, but -// are trivially supported by swapping the two input operands. -def : BccSwapPat<setgt, BLT>; -def : BccSwapPat<setle, BGE>; -def : BccSwapPat<setugt, BLTU>; -def : BccSwapPat<setule, BGEU>; - let Predicates = [Has32S] in { -// An extra pattern is needed for a brcond without a setcc (i.e. where the -// condition was calculated elsewhere). -def : Pat<(brcond GPR:$rj, bb:$imm21), (BNEZ GPR:$rj, bb:$imm21)>; - -def : Pat<(brcond (GRLenVT (seteq GPR:$rj, 0)), bb:$imm21), - (BEQZ GPR:$rj, bb:$imm21)>; -def : Pat<(brcond (GRLenVT (setne GPR:$rj, 0)), bb:$imm21), - (BNEZ GPR:$rj, bb:$imm21)>; +class BccZeroPat<CondCode Cond, LAInst Inst> + : Pat<(loongarch_brcc (GRLenVT GPR:$rj), 0, Cond, bb:$imm21), + (Inst GPR:$rj, bb:$imm21)>; + +def : BccZeroPat<SETEQ, BEQZ>; +def : BccZeroPat<SETNE, BNEZ>; } // Predicates = [Has32S] -// An extra pattern is needed for a brcond without a setcc (i.e. where the -// condition was calculated elsewhere). 
-def : Pat<(brcond GPR:$rj, bb:$imm16), (BNE GPR:$rj, R0, bb:$imm16)>; +multiclass BccPat<CondCode Cond, LAInst Inst> { + def : Pat<(loongarch_brcc (GRLenVT GPR:$rj), GPR:$rd, Cond, bb:$imm16), + (Inst GPR:$rj, GPR:$rd, bb:$imm16)>; + // Explicitly select 0 to R0. The register coalescer doesn't always do it. + def : Pat<(loongarch_brcc (GRLenVT GPR:$rj), 0, Cond, bb:$imm16), + (Inst GPR:$rj, (GRLenVT R0), bb:$imm16)>; +} -def : Pat<(brcond (GRLenVT (seteq GPR:$rj, 0)), bb:$imm16), - (BEQ GPR:$rj, R0, bb:$imm16)>; -def : Pat<(brcond (GRLenVT (setne GPR:$rj, 0)), bb:$imm16), - (BNE GPR:$rj, R0, bb:$imm16)>; +defm : BccPat<SETEQ, BEQ>; +defm : BccPat<SETNE, BNE>; +defm : BccPat<SETLT, BLT>; +defm : BccPat<SETGE, BGE>; +defm : BccPat<SETULT, BLTU>; +defm : BccPat<SETUGE, BGEU>; let isBarrier = 1, isBranch = 1, isTerminator = 1 in def PseudoBR : Pseudo<(outs), (ins simm26_b:$imm26), [(br bb:$imm26)]>, diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 0696b11d62ac..a79c01cbe577 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -10,8 +10,12 @@ // //===----------------------------------------------------------------------===// +def SDT_LoongArchXVPERM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVec<2>, SDTCisInt<2>]>; + // Target nodes. 
def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>; +def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>; def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>; def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>; def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>; @@ -1186,6 +1190,17 @@ multiclass PatXrXrXr<SDPatternOperator OpNode, string Inst> { (!cast<LAInst>(Inst#"_D") LASX256:$xd, LASX256:$xj, LASX256:$xk)>; } +multiclass PatXrXrW<SDPatternOperator OpNode, string Inst> { + def : Pat<(OpNode(v32i8 LASX256:$vj), (v32i8 LASX256:$vk)), + (!cast<LAInst>(Inst#"_H_B") LASX256:$vj, LASX256:$vk)>; + def : Pat<(OpNode(v16i16 LASX256:$vj), (v16i16 LASX256:$vk)), + (!cast<LAInst>(Inst#"_W_H") LASX256:$vj, LASX256:$vk)>; + def : Pat<(OpNode(v8i32 LASX256:$vj), (v8i32 LASX256:$vk)), + (!cast<LAInst>(Inst#"_D_W") LASX256:$vj, LASX256:$vk)>; + def : Pat<(OpNode(v4i64 LASX256:$vj), (v4i64 LASX256:$vk)), + (!cast<LAInst>(Inst#"_Q_D") LASX256:$vj, LASX256:$vk)>; +} + multiclass PatShiftXrXr<SDPatternOperator OpNode, string Inst> { def : Pat<(OpNode (v32i8 LASX256:$xj), (and vsplati8_imm_eq_7, (v32i8 LASX256:$xk))), @@ -1513,6 +1528,9 @@ def : Pat<(bswap (v8i32 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b00011011)>; def : Pat<(bswap (v4i64 LASX256:$xj)), (XVSHUF4I_W (XVSHUF4I_B LASX256:$xj, 0b00011011), 0b10110001)>; +// XVHADDW_{H_B/W_H/D_W/Q_D} +defm : PatXrXrW<loongarch_vhaddw, "XVHADDW">; + // XVFADD_{S/D} defm : PatXrXrF<fadd, "XVFADD">; @@ -1852,6 +1870,12 @@ def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8), def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8), (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>; +// XVPERM_W +def : Pat<(loongarch_xvperm v8i32:$xj, v8i32:$xk), + (XVPERM_W v8i32:$xj, v8i32:$xk)>; +def : Pat<(loongarch_xvperm v8f32:$xj, v8i32:$xk), + (XVPERM_W v8f32:$xj, v8i32:$xk)>; + // XVREPLVE0_{W/D} def : Pat<(lasxsplatf32 
FPR32:$fj), (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 3c9defb0366f..eb7120ffb41a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -22,7 +22,7 @@ def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>, def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>; def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>; + SDTCisSameAs<0,1>, SDTCisVT<2, GRLenVT>]>; def SDT_LoongArchV2RUimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, @@ -71,6 +71,8 @@ def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>; def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>; def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>; +def loongarch_vhaddw : SDNode<"LoongArchISD::VHADDW", SDT_LoongArchV2R>; + def loongarch_vldrepl : SDNode<"LoongArchISD::VLDREPL", SDT_LoongArchVLDREPL, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; @@ -1364,6 +1366,17 @@ multiclass PatVrVrVr<SDPatternOperator OpNode, string Inst> { (!cast<LAInst>(Inst#"_D") LSX128:$vd, LSX128:$vj, LSX128:$vk)>; } +multiclass PatVrVrW<SDPatternOperator OpNode, string Inst> { + def : Pat<(OpNode(v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), + (!cast<LAInst>(Inst#"_H_B") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), + (!cast<LAInst>(Inst#"_W_H") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v4i32 LSX128:$vj), (v4i32 LSX128:$vk)), + (!cast<LAInst>(Inst#"_D_W") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v2i64 LSX128:$vj), (v2i64 LSX128:$vk)), + (!cast<LAInst>(Inst#"_Q_D") LSX128:$vj, LSX128:$vk)>; +} + multiclass PatShiftVrVr<SDPatternOperator OpNode, string Inst> { def : Pat<(OpNode (v16i8 LSX128:$vj), 
(and vsplati8_imm_eq_7, (v16i8 LSX128:$vk))), @@ -1709,6 +1722,9 @@ def : Pat<(bswap (v4i32 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b00011011)>; def : Pat<(bswap (v2i64 LSX128:$vj)), (VSHUF4I_W (VSHUF4I_B LSX128:$vj, 0b00011011), 0b10110001)>; +// VHADDW_{H_B/W_H/D_W/Q_D} +defm : PatVrVrW<loongarch_vhaddw, "VHADDW">; + // VFADD_{S/D} defm : PatVrVrF<fadd, "VFADD">; diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp index 47fce37ce59f..9c5f8edfaf66 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp @@ -41,6 +41,8 @@ LoongArchRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction().getCallingConv() == CallingConv::GHC) return CSR_NoRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return CSR_MostRegs_SaveList; switch (Subtarget.getTargetABI()) { default: llvm_unreachable("Unrecognized ABI"); @@ -63,6 +65,8 @@ LoongArchRegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::GHC) return CSR_NoRegs_RegMask; + if (CC == CallingConv::PreserveMost) + return CSR_MostRegs_RegMask; switch (Subtarget.getTargetABI()) { default: llvm_unreachable("Unrecognized ABI"); diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index ede5477f04bd..f548a8dd0532 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -95,4 +95,20 @@ unsigned LoongArchTTIImpl::getPrefetchDistance() const { return 200; } bool LoongArchTTIImpl::enableWritePrefetching() const { return true; } +bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { + switch (II->getIntrinsicID()) { + default: + return true; + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_and: + case 
Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_xor: + return false; + } +} + // TODO: Implement more hooks to provide TTI machinery for LoongArch. diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h index d43d2cb0eb12..e3f16c780499 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h @@ -53,6 +53,8 @@ public: unsigned getPrefetchDistance() const override; bool enableWritePrefetching() const override; + bool shouldExpandReduction(const IntrinsicInst *II) const override; + // TODO: Implement more hooks to provide TTI machinery for LoongArch. }; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp index 35277ce094a7..e5bd1c91edec 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Compiler.h" +#include <bitset> #define GET_INSTRINFO_MC_DESC #define ENABLE_INSTR_PREDICATE_VERIFIER @@ -95,10 +96,81 @@ createLoongArchAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, namespace { class LoongArchMCInstrAnalysis : public MCInstrAnalysis { + int64_t GPRState[31] = {}; + std::bitset<31> GPRValidMask; + + static bool isGPR(MCRegister Reg) { + return Reg >= LoongArch::R0 && Reg <= LoongArch::R31; + } + + static unsigned getRegIndex(MCRegister Reg) { + assert(isGPR(Reg) && Reg != LoongArch::R0 && "Invalid GPR reg"); + return Reg - LoongArch::R1; + } + + void setGPRState(MCRegister Reg, std::optional<int64_t> Value) { + if (Reg == LoongArch::R0) + return; + 
+ auto Index = getRegIndex(Reg); + + if (Value) { + GPRState[Index] = *Value; + GPRValidMask.set(Index); + } else { + GPRValidMask.reset(Index); + } + } + + std::optional<int64_t> getGPRState(MCRegister Reg) const { + if (Reg == LoongArch::R0) + return 0; + + auto Index = getRegIndex(Reg); + + if (GPRValidMask.test(Index)) + return GPRState[Index]; + return std::nullopt; + } + public: explicit LoongArchMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {} + void resetState() override { GPRValidMask.reset(); } + + void updateState(const MCInst &Inst, uint64_t Addr) override { + // Terminators mark the end of a basic block which means the sequentially + // next instruction will be the first of another basic block and the current + // state will typically not be valid anymore. For calls, we assume all + // registers may be clobbered by the callee (TODO: should we take the + // calling convention into account?). + if (isTerminator(Inst) || isCall(Inst)) { + resetState(); + return; + } + + switch (Inst.getOpcode()) { + default: { + // Clear the state of all defined registers for instructions that we don't + // explicitly support. 
+ auto NumDefs = Info->get(Inst.getOpcode()).getNumDefs(); + for (unsigned I = 0; I < NumDefs; ++I) { + auto DefReg = Inst.getOperand(I).getReg(); + if (isGPR(DefReg)) + setGPRState(DefReg, std::nullopt); + } + break; + } + case LoongArch::PCADDU18I: + setGPRState( + Inst.getOperand(0).getReg(), + Addr + SignExtend64<38>( + static_cast<uint64_t>(Inst.getOperand(1).getImm()) << 18)); + break; + } + } + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, uint64_t &Target) const override { unsigned NumOps = Inst.getNumOperands(); @@ -108,6 +180,14 @@ public: return true; } + if (Inst.getOpcode() == LoongArch::JIRL) { + if (auto TargetRegState = getGPRState(Inst.getOperand(1).getReg())) { + Target = *TargetRegState + Inst.getOperand(2).getImm(); + return true; + } + return false; + } + return false; } diff --git a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp index d3ad65390143..4992f1abe5a0 100644 --- a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp +++ b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp @@ -107,6 +107,18 @@ static DecodeStatus DecodeFPCSCRegisterClass(MCInst &Inst, uint64_t RegNo, } #define DecodeFPICRegisterClass DecodeFPCSCRegisterClass +static DecodeStatus DecodeCCRCRegisterClass(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createReg(M68k::CCR)); + return DecodeStatus::Success; +} + +static DecodeStatus DecodeSRCRegisterClass(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createReg(M68k::SR)); + return DecodeStatus::Success; +} + static DecodeStatus DecodeImm32(MCInst &Inst, uint64_t Imm, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::createImm(M68k::swapWord<uint32_t>(Imm))); diff --git a/llvm/lib/Target/M68k/M68kInstrAtomics.td b/llvm/lib/Target/M68k/M68kInstrAtomics.td index 867afbefe68f..b2b64ca85322 100644 --- a/llvm/lib/Target/M68k/M68kInstrAtomics.td +++ 
b/llvm/lib/Target/M68k/M68kInstrAtomics.td @@ -67,7 +67,8 @@ class MxCASARIDOp<bits<2> size_encoding, MxType type> "cas."#type.Prefix#" $dc, $du, $mem"> { let Inst = (ascend (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_p<"mem">.EA), - (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)) + (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)), + MxEncAddrMode_p<"mem">.Supplement ); let Constraints = "$out = $dc"; let mayLoad = 1; @@ -84,7 +85,8 @@ class MxCASARIIOp<bits<2> size_encoding, MxType type> "cas."#type.Prefix#" $dc, $du, $mem"> { let Inst = (ascend (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_f<"mem">.EA), - (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)) + (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)), + MxEncAddrMode_f<"mem">.Supplement ); let Constraints = "$out = $dc"; let mayLoad = 1; @@ -100,8 +102,9 @@ class MxCASALOp<bits<2> size_encoding, MxType type> (ins type.ROp:$dc, type.ROp:$du, !cast<MxMemOp>("MxAL"#type.Size):$mem), "cas."#type.Prefix#" $dc, $du, $mem"> { let Inst = (ascend - (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_abs<"mem">.EA), - (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)) + (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_abs<"mem", true>.EA), + (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)), + MxEncAddrMode_abs<"mem", true>.Supplement ); let Constraints = "$out = $dc"; let mayLoad = 1; diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp index 21e9319aaf0b..c6be190bd124 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -43,7 +43,7 @@ using namespace llvm; void M68kInstrInfo::anchor() {} M68kInstrInfo::M68kInstrInfo(const M68kSubtarget &STI) - : M68kGenInstrInfo(M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0, + : M68kGenInstrInfo(STI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0, M68k::RET), 
Subtarget(STI), RI(STI) {} diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index 38d22eda5f17..a31c8ec1b2bb 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -36,7 +36,6 @@ namespace { /// Parses MSP430 assembly from a stream. class MSP430AsmParser : public MCTargetAsmParser { - const MCSubtargetInfo &STI; MCAsmParser &Parser; const MCRegisterInfo *MRI; @@ -79,7 +78,7 @@ class MSP430AsmParser : public MCTargetAsmParser { public: MSP430AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) { + : MCTargetAsmParser(Options, STI, MII), Parser(Parser) { MCAsmParserExtension::Initialize(Parser); MRI = getContext().getRegisterInfo(); @@ -264,7 +263,7 @@ bool MSP430AsmParser::matchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, switch (MatchResult) { case Match_Success: Inst.setLoc(Loc); - Out.emitInstruction(Inst, STI); + Out.emitInstruction(Inst, *STI); return false; case Match_MnemonicFail: return Error(Loc, "invalid instruction mnemonic"); diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp index c8094a8eeb36..e6666e8cafdf 100644 --- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp +++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp @@ -103,15 +103,6 @@ static DecodeStatus DecodeGR16RegisterClass(MCInst &MI, uint64_t RegNo, } static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits, - uint64_t Address, - const MCDisassembler *Decoder); - -#include "MSP430GenDisassemblerTables.inc" - -static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address, const 
MCDisassembler *Decoder) { int64_t Imm; switch (Bits) { @@ -142,6 +133,8 @@ static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits, return MCDisassembler::Success; } +#include "MSP430GenDisassemblerTables.inc" + enum AddrMode { amInvalid = 0, amRegister, diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index 6da5e66be4ad..5653099431b1 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -154,9 +154,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const RTLIB::LibcallImpl Impl; } LibraryCalls[] = { // Integer Multiply - EABI Table 9 - {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_hw}, - {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_hw}, - {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_hw}, + {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_hw}, + {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_hw}, + {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_hw}, // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc }; @@ -169,9 +169,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const RTLIB::LibcallImpl Impl; } LibraryCalls[] = { // Integer Multiply - EABI Table 9 - {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_hw}, - {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_hw32}, - {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_hw32}, + {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_hw}, + {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_hw32}, + {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_hw32}, // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc }; @@ -184,9 +184,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const RTLIB::LibcallImpl Impl; } LibraryCalls[] = { // Integer Multiply - EABI Table 9 - {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_f5hw}, - {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_f5hw}, - 
{RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_f5hw}, + {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_f5hw}, + {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_f5hw}, + {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_f5hw}, // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc // TODO The __mspabi_mpyul*_f5hw functions ARE implemented in libgcc }; @@ -199,9 +199,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const RTLIB::LibcallImpl Impl; } LibraryCalls[] = { // Integer Multiply - EABI Table 9 - {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi}, - {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl}, - {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll}, + {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi}, + {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl}, + {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll}, // The __mspabi_mpysl* functions are NOT implemented in libgcc // The __mspabi_mpyul* functions are NOT implemented in libgcc }; diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 8bc6387e6a7e..65b4820752c9 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -12,6 +12,7 @@ #include "MSP430InstrInfo.h" #include "MSP430.h" +#include "MSP430Subtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/ErrorHandling.h" @@ -24,9 +25,9 @@ using namespace llvm; // Pin the vtable to this file. 
void MSP430InstrInfo::anchor() {} -MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI) - : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), - RI() {} +MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI) + : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), + RI() {} void MSP430InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h index 58be64336f26..316c136890bf 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h @@ -27,7 +27,7 @@ class MSP430InstrInfo : public MSP430GenInstrInfo { const MSP430RegisterInfo RI; virtual void anchor(); public: - explicit MSP430InstrInfo(MSP430Subtarget &STI); + explicit MSP430InstrInfo(const MSP430Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should diff --git a/llvm/lib/Target/Mips/CMakeLists.txt b/llvm/lib/Target/Mips/CMakeLists.txt index 21d1765107ae..4a2277e9a80d 100644 --- a/llvm/lib/Target/Mips/CMakeLists.txt +++ b/llvm/lib/Target/Mips/CMakeLists.txt @@ -6,7 +6,8 @@ tablegen(LLVM MipsGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM MipsGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv) tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler + -ignore-non-decodable-operands) tablegen(LLVM MipsGenFastISel.inc -gen-fast-isel) tablegen(LLVM MipsGenGlobalISel.inc -gen-global-isel) tablegen(LLVM MipsGenPostLegalizeGICombiner.inc -gen-global-isel-combiner diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 0c98c4da2ede..fa6cc0e3f018 100644 
--- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -78,451 +78,216 @@ public: } // end anonymous namespace -// Forward declare these because the autogenerated code will reference them. -// Definitions are further down. -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus -DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus -DecodeGPRMM16MovePRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodePtrRegisterClass(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); +static MCDisassembler *createMipsDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new MipsDisassembler(STI, Ctx, true); +} -static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, unsigned RegNo, - 
uint64_t Address, - const MCDisassembler *Decoder); +static MCDisassembler *createMipselDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new MipsDisassembler(STI, Ctx, false); +} -static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMipsDisassembler() { + // Register the disassembler. + TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(), + createMipsDisassembler); + TargetRegistry::RegisterMCDisassembler(getTheMipselTarget(), + createMipselDisassembler); + TargetRegistry::RegisterMCDisassembler(getTheMips64Target(), + createMipsDisassembler); + TargetRegistry::RegisterMCDisassembler(getTheMips64elTarget(), + createMipselDisassembler); +} -static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned Insn, +static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { + const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo(); + return RegInfo->getRegClass(RC).getRegister(RegNo); +} +static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + // Currently only hardware register 29 is supported. 
+ if (RegNo != 29) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(Mips::HWR29)); + return MCDisassembler::Success; +} static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo > 30 || RegNo % 2) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo >= 4) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo >= 4) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo >= 4) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus 
DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo > 7) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset, - 
uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst, unsigned Offset, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeJumpTarget(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeBranchTarget21(MCInst &Inst, unsigned Offset, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst, unsigned Offset, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeBranchTarget26(MCInst &Inst, unsigned Offset, - uint64_t Address, - const MCDisassembler *Decoder); - -// DecodeBranchTarget7MM - Decode microMIPS branch offset, which is -// shifted left by 1 bit. -static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst, unsigned Offset, - uint64_t Address, - const MCDisassembler *Decoder); - -// DecodeBranchTarget10MM - Decode microMIPS branch offset, which is -// shifted left by 1 bit. -static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst, unsigned Offset, - uint64_t Address, - const MCDisassembler *Decoder); - -// DecodeBranchTargetMM - Decode microMIPS branch offset, which is -// shifted left by 1 bit. -static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, unsigned Offset, - uint64_t Address, - const MCDisassembler *Decoder); - -// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is -// shifted left by 1 bit. -static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, unsigned Offset, - uint64_t Address, - const MCDisassembler *Decoder); - -// DecodeJumpTargetMM - Decode microMIPS jump target, which is -// shifted left by 1 bit. -static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -// DecodeJumpTargetXMM - Decode microMIPS jump and link exchange target, -// which is shifted left by 2 bit. 
-static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMemEVA(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeLoadByte15(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeSynciR6(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMemMMImm4(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus 
DecodeMemMMImm9(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, unsigned Value, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeLi16Imm(MCInst &Inst, unsigned Value, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, unsigned Value, - uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; -template <unsigned Bits, int Offset, int Scale> -static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value, - uint64_t Address, - const MCDisassembler *Decoder); + unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} -template <unsigned 
Bits, int Offset> -static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, +static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { - return DecodeUImmWithOffsetAndScale<Bits, Offset, 1>(Inst, Value, Address, - Decoder); -} - -template <unsigned Bits, int Offset = 0, int ScaleBy = 1> -static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -/// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't -/// handle. 
-template <typename InsnType> -static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); - -template <typename InsnType> -static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); + unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, + Mips::S5, Mips::S6, Mips::S7, Mips::FP}; + unsigned 
RegNum; -template <typename InsnType> -static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); + unsigned RegLst = fieldFromInstruction(Insn, 21, 5); -template <typename InsnType> -static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); + // Empty register lists are not allowed. + if (RegLst == 0) + return MCDisassembler::Fail; -template <typename InsnType> -static DecodeStatus DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn, - uint64_t Address, - const MCDisassembler *Decoder); + RegNum = RegLst & 0xf; -template <typename InsnType> -static DecodeStatus DecodeDINS(MCInst &MI, InsnType Insn, uint64_t Address, - const MCDisassembler *Decoder); + // RegLst values 10-15, and 26-31 are reserved. + if (RegNum > 9) + return MCDisassembler::Fail; -template <typename InsnType> -static DecodeStatus DecodeDEXT(MCInst &MI, InsnType Insn, uint64_t Address, - const MCDisassembler *Decoder); + for (unsigned i = 0; i < RegNum; i++) + Inst.addOperand(MCOperand::createReg(Regs[i])); -template <typename InsnType> -static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address, - const MCDisassembler *Decoder); + if (RegLst & 0x10) + Inst.addOperand(MCOperand::createReg(Mips::RA)); -static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); + return MCDisassembler::Success; +} static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeFIXMEInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler 
*Decoder); - -static MCDisassembler *createMipsDisassembler( - const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new MipsDisassembler(STI, Ctx, true); -} - -static MCDisassembler *createMipselDisassembler( - const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new MipsDisassembler(STI, Ctx, false); -} + const MCDisassembler *Decoder) { + unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3}; + unsigned RegLst; + switch (Inst.getOpcode()) { + default: + RegLst = fieldFromInstruction(Insn, 4, 2); + break; + case Mips::LWM16_MMR6: + case Mips::SWM16_MMR6: + RegLst = fieldFromInstruction(Insn, 8, 2); + break; + } + unsigned RegNum = RegLst & 0x3; -extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void -LLVMInitializeMipsDisassembler() { - // Register the disassembler. - TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(), - createMipsDisassembler); - TargetRegistry::RegisterMCDisassembler(getTheMipselTarget(), - createMipselDisassembler); - TargetRegistry::RegisterMCDisassembler(getTheMips64Target(), - createMipsDisassembler); - TargetRegistry::RegisterMCDisassembler(getTheMips64elTarget(), - createMipselDisassembler); -} + for (unsigned i = 0; i <= RegNum; i++) + Inst.addOperand(MCOperand::createReg(Regs[i])); -#include "MipsGenDisassemblerTables.inc" + Inst.addOperand(MCOperand::createReg(Mips::RA)); -static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { - const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo(); - return *(RegInfo->getRegClass(RC).begin() + RegNo); + return MCDisassembler::Success; } template <typename InsnType> @@ -1095,247 +860,15 @@ static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address, const MCDisassembler *Decoder) { InsnType Rs = fieldFromInstruction(Insn, 21, 5); InsnType Rt = fieldFromInstruction(Insn, 16, 5); - MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, - Rt))); - 
MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, - Rs))); - MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, - Rt))); - return MCDisassembler::Success; -} - -/// Read two bytes from the ArrayRef and return 16 bit halfword sorted -/// according to the given endianness. -static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address, - uint64_t &Size, uint32_t &Insn, - bool IsBigEndian) { - // We want to read exactly 2 Bytes of data. - if (Bytes.size() < 2) { - Size = 0; - return MCDisassembler::Fail; - } - - if (IsBigEndian) { - Insn = (Bytes[0] << 8) | Bytes[1]; - } else { - Insn = (Bytes[1] << 8) | Bytes[0]; - } - - return MCDisassembler::Success; -} - -/// Read four bytes from the ArrayRef and return 32 bit word sorted -/// according to the given endianness. -static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address, - uint64_t &Size, uint32_t &Insn, - bool IsBigEndian, bool IsMicroMips) { - // We want to read exactly 4 Bytes of data. - if (Bytes.size() < 4) { - Size = 0; - return MCDisassembler::Fail; - } - - // High 16 bits of a 32-bit microMIPS instruction (where the opcode is) - // always precede the low 16 bits in the instruction stream (that is, they - // are placed at lower addresses in the instruction stream). - // - // microMIPS byte ordering: - // Big-endian: 0 | 1 | 2 | 3 - // Little-endian: 1 | 0 | 3 | 2 - - if (IsBigEndian) { - // Encoded as a big-endian 32-bit word in the stream. 
- Insn = - (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24); - } else { - if (IsMicroMips) { - Insn = (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) | - (Bytes[1] << 24); - } else { - Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) | - (Bytes[3] << 24); - } - } - + MI.addOperand( + MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt))); + MI.addOperand( + MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rs))); + MI.addOperand( + MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt))); return MCDisassembler::Success; } -DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, - ArrayRef<uint8_t> Bytes, - uint64_t Address, - raw_ostream &CStream) const { - uint32_t Insn; - DecodeStatus Result; - Size = 0; - - if (IsMicroMips) { - Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian); - if (Result == MCDisassembler::Fail) - return MCDisassembler::Fail; - - if (hasMips32r6()) { - LLVM_DEBUG( - dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n"); - // Calling the auto-generated decoder function for microMIPS32R6 - // 16-bit instructions. - Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 2; - return Result; - } - } - - LLVM_DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n"); - // Calling the auto-generated decoder function for microMIPS 16-bit - // instructions. - Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address, - this, STI); - if (Result != MCDisassembler::Fail) { - Size = 2; - return Result; - } - - Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true); - if (Result == MCDisassembler::Fail) - return MCDisassembler::Fail; - - if (hasMips32r6()) { - LLVM_DEBUG( - dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n"); - // Calling the auto-generated decoder function. 
- Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - return Result; - } - } - - LLVM_DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n"); - // Calling the auto-generated decoder function. - Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address, - this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - return Result; - } - - if (isFP64()) { - LLVM_DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableMicroMipsFP6432, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) { - Size = 4; - return Result; - } - } - - // This is an invalid instruction. Claim that the Size is 2 bytes. Since - // microMIPS instructions have a minimum alignment of 2, the next 2 bytes - // could form a valid instruction. The two bytes we rejected as an - // instruction could have actually beeen an inline constant pool that is - // unconditionally branched over. - Size = 2; - return MCDisassembler::Fail; - } - - // Attempt to read the instruction so that we can attempt to decode it. If - // the buffer is not 4 bytes long, let the higher level logic figure out - // what to do with a size of zero and MCDisassembler::Fail. - Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false); - if (Result == MCDisassembler::Fail) - return MCDisassembler::Fail; - - // The only instruction size for standard encoded MIPS. 
- Size = 4; - - if (hasCOP3()) { - LLVM_DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n"); - Result = - decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - if (hasMips32r6() && isGP64()) { - LLVM_DEBUG( - dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - if (hasMips32r6() && isPTR64()) { - LLVM_DEBUG( - dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - if (hasMips32r6()) { - LLVM_DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - if (hasMips2() && isPTR64()) { - LLVM_DEBUG( - dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - if (hasCnMips()) { - LLVM_DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - if (hasCnMipsP()) { - LLVM_DEBUG(dbgs() << "Trying CnMipsP table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableCnMipsP32, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - if (isGP64()) { - LLVM_DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableMips6432, Instr, Insn, - Address, this, STI); - if 
(Result != MCDisassembler::Fail) - return Result; - } - - if (isFP64()) { - LLVM_DEBUG( - dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n"); - Result = decodeInstruction(DecoderTableMipsFP6432, Instr, Insn, - Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - LLVM_DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n"); - // Calling the auto-generated decoder function. - Result = - decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - - return MCDisassembler::Fail; -} - static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder) { @@ -1971,137 +1504,6 @@ static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn, return MCDisassembler::Success; } -static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - // Currently only hardware register 29 is supported. 
- if (RegNo != 29) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createReg(Mips::HWR29)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 30 || RegNo %2) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo >= 4) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo >= 4) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo >= 4) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return 
MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 7) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); - Inst.addOperand(MCOperand::createReg(Reg)); - return MCDisassembler::Success; -} - static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset, uint64_t Address, const MCDisassembler *Decoder) { @@ -2241,7 +1643,7 @@ 
DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address, return MCDisassembler::Success; } -template <unsigned Bits, int Offset, int ScaleBy> +template <unsigned Bits, int Offset = 0, int ScaleBy = 1> static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address, const MCDisassembler *Decoder) { @@ -2250,6 +1652,14 @@ DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address, return MCDisassembler::Success; } +template <unsigned Bits, int Offset> +static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, + uint64_t Address, + const MCDisassembler *Decoder) { + return DecodeUImmWithOffsetAndScale<Bits, Offset, 1>(Inst, Value, Address, + Decoder); +} + static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { // First we need to grab the pos(lsb) from MCInst. @@ -2294,90 +1704,12 @@ static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn, const MCDisassembler *Decoder) { // Insn must be >= 0, since it is unsigned that condition is always true. assert(Insn < 16); - int32_t DecodedValues[] = {128, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, - 255, 32768, 65535}; + int32_t DecodedValues[] = {128, 1, 2, 3, 4, 7, 8, 15, + 16, 31, 32, 63, 64, 255, 32768, 65535}; Inst.addOperand(MCOperand::createImm(DecodedValues[Insn])); return MCDisassembler::Success; } -static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, - Mips::S6, Mips::S7, Mips::FP}; - unsigned RegNum; - - unsigned RegLst = fieldFromInstruction(Insn, 21, 5); - - // Empty register lists are not allowed. - if (RegLst == 0) - return MCDisassembler::Fail; - - RegNum = RegLst & 0xf; - - // RegLst values 10-15, and 26-31 are reserved. 
- if (RegNum > 9) - return MCDisassembler::Fail; - - for (unsigned i = 0; i < RegNum; i++) - Inst.addOperand(MCOperand::createReg(Regs[i])); - - if (RegLst & 0x10) - Inst.addOperand(MCOperand::createReg(Mips::RA)); - - return MCDisassembler::Success; -} - -static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3}; - unsigned RegLst; - switch(Inst.getOpcode()) { - default: - RegLst = fieldFromInstruction(Insn, 4, 2); - break; - case Mips::LWM16_MMR6: - case Mips::SWM16_MMR6: - RegLst = fieldFromInstruction(Insn, 8, 2); - break; - } - unsigned RegNum = RegLst & 0x3; - - for (unsigned i = 0; i <= RegNum; i++) - Inst.addOperand(MCOperand::createReg(Regs[i])); - - Inst.addOperand(MCOperand::createReg(Mips::RA)); - - return MCDisassembler::Success; -} - -static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned RegPair = fieldFromInstruction(Insn, 7, 3); - if (DecodeMovePRegPair(Inst, RegPair, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - - unsigned RegRs; - if (static_cast<const MipsDisassembler*>(Decoder)->hasMips32r6()) - RegRs = fieldFromInstruction(Insn, 0, 2) | - (fieldFromInstruction(Insn, 3, 1) << 2); - else - RegRs = fieldFromInstruction(Insn, 1, 3); - if (DecodeGPRMM16MovePRegisterClass(Inst, RegRs, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - - unsigned RegRt = fieldFromInstruction(Insn, 4, 3); - if (DecodeGPRMM16MovePRegisterClass(Inst, RegRt, Address, Decoder) == - MCDisassembler::Fail) - return MCDisassembler::Fail; - - return MCDisassembler::Success; -} - static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair, uint64_t Address, const MCDisassembler *Decoder) { @@ -2421,6 +1753,32 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair, return 
MCDisassembler::Success; } +static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned RegPair = fieldFromInstruction(Insn, 7, 3); + if (DecodeMovePRegPair(Inst, RegPair, Address, Decoder) == + MCDisassembler::Fail) + return MCDisassembler::Fail; + + unsigned RegRs; + if (static_cast<const MipsDisassembler *>(Decoder)->hasMips32r6()) + RegRs = fieldFromInstruction(Insn, 0, 2) | + (fieldFromInstruction(Insn, 3, 1) << 2); + else + RegRs = fieldFromInstruction(Insn, 1, 3); + if (DecodeGPRMM16MovePRegisterClass(Inst, RegRs, Address, Decoder) == + MCDisassembler::Fail) + return MCDisassembler::Fail; + + unsigned RegRt = fieldFromInstruction(Insn, 4, 3); + if (DecodeGPRMM16MovePRegisterClass(Inst, RegRt, Address, Decoder) == + MCDisassembler::Fail) + return MCDisassembler::Fail; + + return MCDisassembler::Success; +} + static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -2528,3 +1886,237 @@ static DecodeStatus DecodeFIXMEInstruction(MCInst &Inst, unsigned Insn, const MCDisassembler *Decoder) { return MCDisassembler::Fail; } + +#include "MipsGenDisassemblerTables.inc" + +/// Read two bytes from the ArrayRef and return 16 bit halfword sorted +/// according to the given endianness. +static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address, + uint64_t &Size, uint32_t &Insn, + bool IsBigEndian) { + // We want to read exactly 2 Bytes of data. + if (Bytes.size() < 2) { + Size = 0; + return MCDisassembler::Fail; + } + + if (IsBigEndian) { + Insn = (Bytes[0] << 8) | Bytes[1]; + } else { + Insn = (Bytes[1] << 8) | Bytes[0]; + } + + return MCDisassembler::Success; +} + +/// Read four bytes from the ArrayRef and return 32 bit word sorted +/// according to the given endianness. 
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address, + uint64_t &Size, uint32_t &Insn, + bool IsBigEndian, bool IsMicroMips) { + // We want to read exactly 4 Bytes of data. + if (Bytes.size() < 4) { + Size = 0; + return MCDisassembler::Fail; + } + + // High 16 bits of a 32-bit microMIPS instruction (where the opcode is) + // always precede the low 16 bits in the instruction stream (that is, they + // are placed at lower addresses in the instruction stream). + // + // microMIPS byte ordering: + // Big-endian: 0 | 1 | 2 | 3 + // Little-endian: 1 | 0 | 3 | 2 + + if (IsBigEndian) { + // Encoded as a big-endian 32-bit word in the stream. + Insn = + (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24); + } else { + if (IsMicroMips) { + Insn = (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) | + (Bytes[1] << 24); + } else { + Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) | + (Bytes[3] << 24); + } + } + + return MCDisassembler::Success; +} + +DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &CStream) const { + uint32_t Insn; + DecodeStatus Result; + Size = 0; + + if (IsMicroMips) { + Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + if (hasMips32r6()) { + LLVM_DEBUG( + dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n"); + // Calling the auto-generated decoder function for microMIPS32R6 + // 16-bit instructions. + Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + return Result; + } + } + + LLVM_DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n"); + // Calling the auto-generated decoder function for microMIPS 16-bit + // instructions. 
+ Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + return Result; + } + + Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + if (hasMips32r6()) { + LLVM_DEBUG( + dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n"); + // Calling the auto-generated decoder function. + Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + LLVM_DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n"); + // Calling the auto-generated decoder function. + Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + if (isFP64()) { + LLVM_DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMicroMipsFP6432, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + // This is an invalid instruction. Claim that the Size is 2 bytes. Since + // microMIPS instructions have a minimum alignment of 2, the next 2 bytes + // could form a valid instruction. The two bytes we rejected as an + // instruction could have actually beeen an inline constant pool that is + // unconditionally branched over. + Size = 2; + return MCDisassembler::Fail; + } + + // Attempt to read the instruction so that we can attempt to decode it. If + // the buffer is not 4 bytes long, let the higher level logic figure out + // what to do with a size of zero and MCDisassembler::Fail. + Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + // The only instruction size for standard encoded MIPS. 
+ Size = 4; + + if (hasCOP3()) { + LLVM_DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n"); + Result = + decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + if (hasMips32r6() && isGP64()) { + LLVM_DEBUG( + dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + if (hasMips32r6() && isPTR64()) { + LLVM_DEBUG( + dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + if (hasMips32r6()) { + LLVM_DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + if (hasMips2() && isPTR64()) { + LLVM_DEBUG( + dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + if (hasCnMips()) { + LLVM_DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn, Address, this, + STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + if (hasCnMipsP()) { + LLVM_DEBUG(dbgs() << "Trying CnMipsP table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableCnMipsP32, Instr, Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + if (isGP64()) { + LLVM_DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMips6432, Instr, Insn, Address, this, + STI); + if 
(Result != MCDisassembler::Fail) + return Result; + } + + if (isFP64()) { + LLVM_DEBUG( + dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMipsFP6432, Instr, Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + LLVM_DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n"); + // Calling the auto-generated decoder function. + Result = + decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) + return Result; + + return MCDisassembler::Fail; +} diff --git a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp index 330cb4e0e206..7bd96b571bc6 100644 --- a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp @@ -56,48 +56,52 @@ struct Mips16IntrinsicHelperType{ // Libcalls for which no helper is generated. Sorted by name for binary search. static const Mips16Libcall HardFloatLibCalls[] = { - {RTLIB::ADD_F64, RTLIB::__mips16_adddf3, "__mips16_adddf3"}, - {RTLIB::ADD_F32, RTLIB::__mips16_addsf3, "__mips16_addsf3"}, - {RTLIB::DIV_F64, RTLIB::__mips16_divdf3, "__mips16_divdf3"}, - {RTLIB::DIV_F32, RTLIB::__mips16_divsf3, "__mips16_divsf3"}, - {RTLIB::OEQ_F64, RTLIB::__mips16_eqdf2, "__mips16_eqdf2"}, - {RTLIB::OEQ_F32, RTLIB::__mips16_eqsf2, "__mips16_eqsf2"}, - {RTLIB::FPEXT_F32_F64, RTLIB::__mips16_extendsfdf2, "__mips16_extendsfdf2"}, - {RTLIB::FPTOSINT_F64_I32, RTLIB::__mips16_fix_truncdfsi, + {RTLIB::ADD_F64, RTLIB::impl___mips16_adddf3, "__mips16_adddf3"}, + {RTLIB::ADD_F32, RTLIB::impl___mips16_addsf3, "__mips16_addsf3"}, + {RTLIB::DIV_F64, RTLIB::impl___mips16_divdf3, "__mips16_divdf3"}, + {RTLIB::DIV_F32, RTLIB::impl___mips16_divsf3, "__mips16_divsf3"}, + {RTLIB::OEQ_F64, RTLIB::impl___mips16_eqdf2, "__mips16_eqdf2"}, + {RTLIB::OEQ_F32, RTLIB::impl___mips16_eqsf2, "__mips16_eqsf2"}, + {RTLIB::FPEXT_F32_F64, 
RTLIB::impl___mips16_extendsfdf2, + "__mips16_extendsfdf2"}, + {RTLIB::FPTOSINT_F64_I32, RTLIB::impl___mips16_fix_truncdfsi, "__mips16_fix_truncdfsi"}, - {RTLIB::FPTOSINT_F32_I32, RTLIB::__mips16_fix_truncsfsi, + {RTLIB::FPTOSINT_F32_I32, RTLIB::impl___mips16_fix_truncsfsi, "__mips16_fix_truncsfsi"}, - {RTLIB::SINTTOFP_I32_F64, RTLIB::__mips16_floatsidf, "__mips16_floatsidf"}, - {RTLIB::SINTTOFP_I32_F32, RTLIB::__mips16_floatsisf, "__mips16_floatsisf"}, - {RTLIB::UINTTOFP_I32_F64, RTLIB::__mips16_floatunsidf, + {RTLIB::SINTTOFP_I32_F64, RTLIB::impl___mips16_floatsidf, + "__mips16_floatsidf"}, + {RTLIB::SINTTOFP_I32_F32, RTLIB::impl___mips16_floatsisf, + "__mips16_floatsisf"}, + {RTLIB::UINTTOFP_I32_F64, RTLIB::impl___mips16_floatunsidf, "__mips16_floatunsidf"}, - {RTLIB::UINTTOFP_I32_F32, RTLIB::__mips16_floatunsisf, + {RTLIB::UINTTOFP_I32_F32, RTLIB::impl___mips16_floatunsisf, "__mips16_floatunsisf"}, - {RTLIB::OGE_F64, RTLIB::__mips16_gedf2, "__mips16_gedf2"}, - {RTLIB::OGE_F32, RTLIB::__mips16_gesf2, "__mips16_gesf2"}, - {RTLIB::OGT_F64, RTLIB::__mips16_gtdf2, "__mips16_gtdf2"}, - {RTLIB::OGT_F32, RTLIB::__mips16_gtsf2, "__mips16_gtsf2"}, - {RTLIB::OLE_F64, RTLIB::__mips16_ledf2, "__mips16_ledf2"}, - {RTLIB::OLE_F32, RTLIB::__mips16_lesf2, "__mips16_lesf2"}, - {RTLIB::OLT_F64, RTLIB::__mips16_ltdf2, "__mips16_ltdf2"}, - {RTLIB::OLT_F32, RTLIB::__mips16_ltsf2, "__mips16_ltsf2"}, - {RTLIB::MUL_F64, RTLIB::__mips16_muldf3, "__mips16_muldf3"}, - {RTLIB::MUL_F32, RTLIB::__mips16_mulsf3, "__mips16_mulsf3"}, - {RTLIB::UNE_F64, RTLIB::__mips16_nedf2, "__mips16_nedf2"}, - {RTLIB::UNE_F32, RTLIB::__mips16_nesf2, "__mips16_nesf2"}, - {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_dc, + {RTLIB::OGE_F64, RTLIB::impl___mips16_gedf2, "__mips16_gedf2"}, + {RTLIB::OGE_F32, RTLIB::impl___mips16_gesf2, "__mips16_gesf2"}, + {RTLIB::OGT_F64, RTLIB::impl___mips16_gtdf2, "__mips16_gtdf2"}, + {RTLIB::OGT_F32, RTLIB::impl___mips16_gtsf2, "__mips16_gtsf2"}, + {RTLIB::OLE_F64, 
RTLIB::impl___mips16_ledf2, "__mips16_ledf2"}, + {RTLIB::OLE_F32, RTLIB::impl___mips16_lesf2, "__mips16_lesf2"}, + {RTLIB::OLT_F64, RTLIB::impl___mips16_ltdf2, "__mips16_ltdf2"}, + {RTLIB::OLT_F32, RTLIB::impl___mips16_ltsf2, "__mips16_ltsf2"}, + {RTLIB::MUL_F64, RTLIB::impl___mips16_muldf3, "__mips16_muldf3"}, + {RTLIB::MUL_F32, RTLIB::impl___mips16_mulsf3, "__mips16_mulsf3"}, + {RTLIB::UNE_F64, RTLIB::impl___mips16_nedf2, "__mips16_nedf2"}, + {RTLIB::UNE_F32, RTLIB::impl___mips16_nesf2, "__mips16_nesf2"}, + {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_dc, "__mips16_ret_dc"}, // No associated libcall. - {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_df, + {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_df, "__mips16_ret_df"}, // No associated libcall. - {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_sc, + {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_sc, "__mips16_ret_sc"}, // No associated libcall. - {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_sf, + {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_sf, "__mips16_ret_sf"}, // No associated libcall. 
- {RTLIB::SUB_F64, RTLIB::__mips16_subdf3, "__mips16_subdf3"}, - {RTLIB::SUB_F32, RTLIB::__mips16_subsf3, "__mips16_subsf3"}, - {RTLIB::FPROUND_F64_F32, RTLIB::__mips16_truncdfsf2, "__mips16_truncdfsf2"}, - {RTLIB::UO_F64, RTLIB::__mips16_unorddf2, "__mips16_unorddf2"}, - {RTLIB::UO_F32, RTLIB::__mips16_unordsf2, "__mips16_unordsf2"}}; + {RTLIB::SUB_F64, RTLIB::impl___mips16_subdf3, "__mips16_subdf3"}, + {RTLIB::SUB_F32, RTLIB::impl___mips16_subsf3, "__mips16_subsf3"}, + {RTLIB::FPROUND_F64_F32, RTLIB::impl___mips16_truncdfsf2, + "__mips16_truncdfsf2"}, + {RTLIB::UO_F64, RTLIB::impl___mips16_unorddf2, "__mips16_unorddf2"}, + {RTLIB::UO_F32, RTLIB::impl___mips16_unordsf2, "__mips16_unordsf2"}}; static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = { {"__fixunsdfsi", "__mips16_call_stub_2" }, diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.td b/llvm/lib/Target/Mips/Mips16InstrInfo.td index fb2a83dc90ea..ab473c133b8e 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.td @@ -374,8 +374,8 @@ class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_, class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra, string asmstr, InstrItinClass itin>: - FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rs), - !strconcat(asmstr, "\t$rs"), [], itin> ; + FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx), + !strconcat(asmstr, "\t$rx"), [], itin> ; class FRR_SF16_ins <bits<5> _funct, bits<3> _subfunc, @@ -776,7 +776,6 @@ def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> { } def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> { - let rx = 0b000; let isBranch = 1; let isIndirectBranch = 1; let isTerminator=1; diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index ae91c97e2a80..9d8b9f86daf7 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -967,8 +967,7 @@ void MipsAsmPrinter::EmitFPCallStub( // freed) and since we're at the 
global level we can use the default // constructed subtarget. std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( - TM.getTargetTriple().str(), TM.getTargetCPU(), - TM.getTargetFeatureString())); + TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString())); // // .global xxxx diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index 8a59532ba578..bffdffa4af6a 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -40,7 +40,7 @@ using namespace llvm; void MipsInstrInfo::anchor() {} MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr) - : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), + : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), Subtarget(STI), UncondBrOpc(UncondBr) {} const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index ee1ca4538554..f9bdc0993533 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -290,7 +290,8 @@ void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum, O << ".acq_rel"; return; case NVPTX::Ordering::SequentiallyConsistent: - O << ".seq_cst"; + report_fatal_error( + "NVPTX AtomicCode Printer does not support \"seq_cst\" ordering."); return; case NVPTX::Ordering::Volatile: O << ".volatile"; diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td index 8a445f82e700..31c117a8c0fe 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.td +++ b/llvm/lib/Target/NVPTX/NVPTX.td @@ -80,9 +80,9 @@ class FeaturePTX<int version>: // + Compare within the family by comparing FullSMVersion, given both belongs to // the same family. // + Detect 'a' variants by checking FullSMVersion & 1. 
-foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, - 60, 61, 62, 70, 72, 75, 80, 86, 87, - 89, 90, 100, 101, 103, 120, 121] in { +foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, 60, + 61, 62, 70, 72, 75, 80, 86, 87, 88, 89, + 90, 100, 101, 103, 110, 120, 121] in { // Base SM version (e.g. FullSMVersion for sm_100 is 1000) def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>; @@ -127,6 +127,7 @@ def : Proc<"sm_75", [SM75, PTX63]>; def : Proc<"sm_80", [SM80, PTX70]>; def : Proc<"sm_86", [SM86, PTX71]>; def : Proc<"sm_87", [SM87, PTX74]>; +def : Proc<"sm_88", [SM88, PTX90]>; def : Proc<"sm_89", [SM89, PTX78]>; def : Proc<"sm_90", [SM90, PTX78]>; def : Proc<"sm_90a", [SM90a, PTX80]>; @@ -139,6 +140,9 @@ def : Proc<"sm_101f", [SM101f, PTX88]>; def : Proc<"sm_103", [SM103, PTX88]>; def : Proc<"sm_103a", [SM103a, PTX88]>; def : Proc<"sm_103f", [SM103f, PTX88]>; +def : Proc<"sm_110", [SM110, PTX90]>; +def : Proc<"sm_110a", [SM110a, PTX90]>; +def : Proc<"sm_110f", [SM110f, PTX90]>; def : Proc<"sm_120", [SM120, PTX87]>; def : Proc<"sm_120a", [SM120a, PTX87]>; def : Proc<"sm_120f", [SM120f, PTX88]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 7391c2d488b5..14ca867023e2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -432,7 +432,7 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // .maxclusterrank directive requires SM_90 or higher, make sure that we // filter it out for lower SM versions, as it causes a hard ptxas crash. 
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM); - const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl()); + const NVPTXSubtarget *STI = &NTM.getSubtarget<NVPTXSubtarget>(F); if (STI->getSmVersion() >= 90) { const auto ClusterDim = getClusterDim(F); @@ -669,7 +669,7 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) { // rest of NVPTX isn't friendly to change subtargets per function and // so the default TargetMachine will have all of the options. const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM); - const auto* STI = static_cast<const NVPTXSubtarget*>(NTM.getSubtargetImpl()); + const NVPTXSubtarget *STI = NTM.getSubtargetImpl(); SmallString<128> Str1; raw_svector_ostream OS1(Str1); @@ -680,8 +680,7 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) { bool NVPTXAsmPrinter::doInitialization(Module &M) { const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM); - const NVPTXSubtarget &STI = - *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl()); + const NVPTXSubtarget &STI = *NTM.getSubtargetImpl(); if (M.alias_size() && (STI.getPTXVersion() < 63 || STI.getSmVersion() < 30)) report_fatal_error(".alias requires PTX version >= 6.3 and sm_30"); @@ -716,8 +715,7 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) { assert(GVVisiting.size() == 0 && "Did not fully process a global variable"); const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM); - const NVPTXSubtarget &STI = - *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl()); + const NVPTXSubtarget &STI = *NTM.getSubtargetImpl(); // Print out module-level global variables in proper order for (const GlobalVariable *GV : Globals) @@ -1178,8 +1176,7 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *F, raw_ostream &O) { ArrayRef<const GlobalVariable *> GVars = It->second; const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM); - const NVPTXSubtarget &STI 
= - *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl()); + const NVPTXSubtarget &STI = *NTM.getSubtargetImpl(); for (const GlobalVariable *GV : GVars) { O << "\t// demoted variable\n\t"; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 3300ed9a5a81..c70f48af33cf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -170,6 +170,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { } break; } + case NVPTXISD::ATOMIC_CMP_SWAP_B128: + case NVPTXISD::ATOMIC_SWAP_B128: + selectAtomicSwap128(N); + return; case ISD::FADD: case ISD::FMUL: case ISD::FSUB: @@ -1097,11 +1101,6 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { if (PlainLoad && PlainLoad->isIndexed()) return false; - const EVT LoadedEVT = LD->getMemoryVT(); - if (!LoadedEVT.isSimple()) - return false; - const MVT LoadedVT = LoadedEVT.getSimpleVT(); - // Address Space Setting const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) @@ -1111,7 +1110,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDValue Chain = N->getOperand(0); const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD); - const unsigned FromTypeWidth = LoadedVT.getSizeInBits(); + const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits(); // Vector Setting const unsigned FromType = @@ -1165,9 +1164,6 @@ static unsigned getStoreVectorNumElts(SDNode *N) { bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { MemSDNode *LD = cast<MemSDNode>(N); - const EVT MemEVT = LD->getMemoryVT(); - if (!MemEVT.isSimple()) - return false; // Address Space Setting const auto CodeAddrSpace = getAddrSpace(LD); @@ -1237,10 +1233,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) { - const EVT LoadedEVT = LD->getMemoryVT(); - if (!LoadedEVT.isSimple()) - return false; - SDLoc DL(LD); unsigned ExtensionType; @@ -1357,10 +1349,6 @@ bool 
NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (PlainStore && PlainStore->isIndexed()) return false; - const EVT StoreVT = ST->getMemoryVT(); - if (!StoreVT.isSimple()) - return false; - // Address Space Setting const auto CodeAddrSpace = getAddrSpace(ST); @@ -1369,7 +1357,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST); // Vector Setting - const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits(); + const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits(); // Create the machine instruction DAG SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal(); @@ -1406,8 +1394,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { MemSDNode *ST = cast<MemSDNode>(N); - const EVT StoreVT = ST->getMemoryVT(); - assert(StoreVT.isSimple() && "Store value is not simple"); + const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits(); // Address Space Setting const auto CodeAddrSpace = getAddrSpace(ST); @@ -1420,10 +1407,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { SDValue Chain = ST->getChain(); const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST); - // Type Setting: toType + toTypeWidth - // - for integer type, always use 'u' - const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits(); - const unsigned NumElts = getStoreVectorNumElts(ST); SmallVector<SDValue, 16> Ops; @@ -2337,3 +2320,30 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { } } } + +void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) { + MemSDNode *AN = cast<MemSDNode>(N); + SDLoc dl(N); + + const SDValue Chain = N->getOperand(0); + const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG); + SmallVector<SDValue, 5> Ops{Base, Offset}; + Ops.append(N->op_begin() + 2, N->op_end()); + Ops.append({ + getI32Imm(getMemOrder(AN), dl), + getI32Imm(getAtomicScope(AN), dl), + getI32Imm(getAddrSpace(AN), dl), + 
Chain, + }); + + assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 || + N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128); + unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128 + ? NVPTX::ATOM_EXCH_B128 + : NVPTX::ATOM_CAS_B128; + + auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops); + CurDAG->setNodeMemRefs(ATOM, AN->getMemOperand()); + + ReplaceNode(N, ATOM); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index e2ad55bc1796..8dcd5362c451 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -90,6 +90,7 @@ private: bool IsIm2Col = false); void SelectTcgen05Ld(SDNode *N, bool hasOffset = false); void SelectTcgen05St(SDNode *N, bool hasOffset = false); + void selectAtomicSwap128(SDNode *N); inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bb4bb1195f78..d3fb657851fe 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -198,6 +198,12 @@ static bool IsPTXVectorType(MVT VT) { static std::optional<std::pair<unsigned int, MVT>> getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace) { + const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace); + + if (CanLowerTo256Bit && VectorEVT.isScalarInteger() && + VectorEVT.getSizeInBits() == 256) + return {{4, MVT::i64}}; + if (!VectorEVT.isSimple()) return std::nullopt; const MVT VectorVT = VectorEVT.getSimpleVT(); @@ -214,8 +220,6 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, // The size of the PTX virtual register that holds a packed type. unsigned PackRegSize; - bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace); - // We only handle "native" vector sizes for now, e.g. 
<4 x double> is not // legal. We can (and should) split that into 2 stores of <2 x double> here // but I'm leaving that as a TODO for now. @@ -539,6 +543,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, case ISD::FMINNUM_IEEE: case ISD::FMAXIMUM: case ISD::FMINIMUM: + case ISD::FMAXIMUMNUM: + case ISD::FMINIMUMNUM: IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; break; case ISD::FEXP2: @@ -702,57 +708,66 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // intrinsics. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - // Turn FP extload into load/fpextend - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand); - // Turn FP truncstore into trunc + store. 
- // FIXME: vector types should also be expanded - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f32, MVT::bf16, Expand); - setTruncStoreAction(MVT::f64, MVT::bf16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand); + // FP extload/truncstore is not legal in PTX. We need to expand all these. + for (auto FloatVTs : + {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) { + for (MVT ValVT : FloatVTs) { + for (MVT MemVT : FloatVTs) { + setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand); + setTruncStoreAction(ValVT, MemVT, Expand); + } + } + } - // PTX does not support load / store predicate registers - setOperationAction(ISD::LOAD, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::i1, Custom); + // To improve CodeGen we'll legalize any-extend loads to zext loads. This is + // how they'll be lowered in ISel anyway, and by doing this a little earlier + // we allow for more DAG combine opportunities. + for (auto IntVTs : + {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()}) + for (MVT ValVT : IntVTs) + for (MVT MemVT : IntVTs) + if (isTypeLegal(ValVT)) + setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom); + // PTX does not support load / store predicate registers + setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom); for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1, + Promote); setTruncStoreAction(VT, MVT::i1, Expand); } + // Disable generations of extload/truncstore for v2i16/v2i8. The generic + // expansion for these nodes when they are unaligned is incorrect if the + // type is a vector. 
+ // + // TODO: Fix the generic expansion for these nodes found in + // TargetLowering::expandUnalignedLoad/Store. + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, + MVT::v2i8, Expand); + setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); + + // Register custom handling for illegal type loads/stores. We'll try to custom + // lower almost all illegal types and logic in the lowering will discard cases + // we can't handle. + setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom); + for (MVT VT : MVT::fixedlen_vector_valuetypes()) + if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256) + setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom); + + // Custom legalization for LDU intrinsics. + // TODO: The logic to lower these is not very robust and we should rewrite it. + // Perhaps LDU should not be represented as an intrinsic at all. + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + for (MVT VT : MVT::fixedlen_vector_valuetypes()) + if (IsPTXVectorType(VT)) + setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); + setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE, ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT, ISD::SETGE, ISD::SETLE}, MVT::i1, Expand); - // expand extload of vector of integers. 
- setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, - MVT::v2i8, Expand); - setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); - // This is legal in NVPTX setOperationAction(ISD::ConstantFP, MVT::f64, Legal); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); @@ -767,24 +782,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // DEBUGTRAP can be lowered to PTX brkpt setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); - // Register custom handling for vector loads/stores - for (MVT VT : MVT::fixedlen_vector_valuetypes()) - if (IsPTXVectorType(VT)) - setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT, - Custom); - - setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, - {MVT::i128, MVT::f128}, Custom); - // Support varargs. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); - // Custom handling for i8 intrinsics - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, {MVT::i16, MVT::i32, MVT::i64}, Legal); @@ -988,7 +991,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, if (getOperationAction(ISD::FABS, MVT::bf16) == Promote) AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32); - for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { + for (const auto &Op : + {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) { setOperationAction(Op, MVT::f32, Legal); setOperationAction(Op, MVT::f64, Legal); setFP16OperationAction(Op, MVT::f16, Legal, Promote); @@ -1039,7 +1043,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand); - // No FPOW or FREM in 
PTX. + + // atom.b128 is legal in PTX but since we don't represent i128 as a legal + // type, we need to custom lower it. + setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128, + Custom); // Now deduce the information based on the above mentioned // actions @@ -1047,7 +1055,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // PTX support for 16-bit CAS is emulated. Only use 32+ setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits()); - setMaxAtomicSizeInBitsSupported(64); + setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64); setMaxDivRemBitWidthSupported(64); // Custom lowering for tcgen05.ld vector operands @@ -1080,6 +1088,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { case NVPTXISD::FIRST_NUMBER: break; + MAKE_CASE(NVPTXISD::ATOMIC_CMP_SWAP_B128) + MAKE_CASE(NVPTXISD::ATOMIC_SWAP_B128) MAKE_CASE(NVPTXISD::RET_GLUE) MAKE_CASE(NVPTXISD::DeclareArrayParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) @@ -3088,29 +3098,112 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV)); } -static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &Results, - const NVPTXSubtarget &STI); +/// replaceLoadVector - Convert vector loads into multi-output scalar loads. +static std::optional<std::pair<SDValue, SDValue>> +replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) { + LoadSDNode *LD = cast<LoadSDNode>(N); + const EVT ResVT = LD->getValueType(0); + const EVT MemVT = LD->getMemoryVT(); -SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() == MVT::i1) - return LowerLOADi1(Op, DAG); + // If we're doing sign/zero extension as part of the load, avoid lowering to + // a LoadV node. TODO: consider relaxing this restriction. 
+ if (ResVT != MemVT) + return std::nullopt; - EVT VT = Op.getValueType(); + const auto NumEltsAndEltVT = + getVectorLoweringShape(ResVT, STI, LD->getAddressSpace()); + if (!NumEltsAndEltVT) + return std::nullopt; + const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); + + Align Alignment = LD->getAlign(); + const auto &TD = DAG.getDataLayout(); + Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext())); + if (Alignment < PrefAlign) { + // This load is not sufficiently aligned, so bail out and let this vector + // load be scalarized. Note that we may still be able to emit smaller + // vector loads. For example, if we are loading a <4 x float> with an + // alignment of 8, this check will fail but the legalizer will try again + // with 2 x <2 x float>, which will succeed with an alignment of 8. + return std::nullopt; + } + + // Since LoadV2 is a target node, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // loaded type to i16 and propagate the "real" type as the memory type. + const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT; + + unsigned Opcode; + switch (NumElts) { + default: + return std::nullopt; + case 2: + Opcode = NVPTXISD::LoadV2; + break; + case 4: + Opcode = NVPTXISD::LoadV4; + break; + case 8: + Opcode = NVPTXISD::LoadV8; + break; + } + auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT); + ListVTs.push_back(MVT::Other); + SDVTList LdResVTs = DAG.getVTList(ListVTs); - if (NVPTX::isPackedVectorTy(VT)) { - // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to - // handle unaligned loads and have to handle it here. 
- LoadSDNode *Load = cast<LoadSDNode>(Op); - EVT MemVT = Load->getMemoryVT(); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - MemVT, *Load->getMemOperand())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, SDLoc(Op)); + SDLoc DL(LD); + + // Copy regular operands + SmallVector<SDValue, 8> OtherOps(LD->ops()); + + // The select routine does not have access to the LoadSDNode instance, so + // pass along the extension information + OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); + + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT, + LD->getMemOperand()); + + SmallVector<SDValue> ScalarRes; + if (EltVT.isVector()) { + assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType()); + assert(NumElts * EltVT.getVectorNumElements() == + ResVT.getVectorNumElements()); + // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back + // into individual elements. 
+ for (const unsigned I : llvm::seq(NumElts)) { + SDValue SubVector = NewLD.getValue(I); + DAG.ExtractVectorElements(SubVector, ScalarRes); + } + } else { + for (const unsigned I : llvm::seq(NumElts)) { + SDValue Res = NewLD.getValue(I); + if (LoadEltVT != EltVT) + Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res); + ScalarRes.push_back(Res); } } + SDValue LoadChain = NewLD.getValue(NumElts); + + const MVT BuildVecVT = + MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size()); + SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes); + SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec); + + return {{LoadValue, LoadChain}}; +} + +static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results, + const NVPTXSubtarget &STI) { + if (auto Res = replaceLoadVector(N, DAG, STI)) + Results.append({Res->first, Res->second}); +} + +static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, + const NVPTXSubtarget &STI) { + if (auto Res = replaceLoadVector(N, DAG, STI)) + return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N)); return SDValue(); } @@ -3118,13 +3211,10 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // => // v1 = ld i8* addr (-> i16) // v = trunc i16 to i1 -SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { - SDNode *Node = Op.getNode(); - LoadSDNode *LD = cast<LoadSDNode>(Node); - SDLoc dl(Node); +static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) { + SDLoc dl(LD); assert(LD->getExtensionType() == ISD::NON_EXTLOAD); - assert(Node->getValueType(0) == MVT::i1 && - "Custom lowering for i1 load only"); + assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only"); SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), MVT::i8, LD->getAlign(), @@ -3133,35 +3223,31 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { // The legalizer 
(the caller) is expecting two values from the legalized // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() // in LegalizeDAG.cpp which also uses MergeValues. - SDValue Ops[] = { result, LD->getChain() }; - return DAG.getMergeValues(Ops, dl); + return DAG.getMergeValues({result, LD->getChain()}, dl); } -SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - StoreSDNode *Store = cast<StoreSDNode>(Op); - EVT VT = Store->getMemoryVT(); - - if (VT == MVT::i1) - return LowerSTOREi1(Op, DAG); +SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *LD = cast<LoadSDNode>(Op); - // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to - // handle unaligned stores and have to handle it here. - if (NVPTX::isPackedVectorTy(VT) && - !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - VT, *Store->getMemOperand())) - return expandUnalignedStore(Store, DAG); + if (Op.getValueType() == MVT::i1) + return lowerLOADi1(LD, DAG); - // v2f16/v2bf16/v2i16 don't need special handling. - if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()) - return SDValue(); + // To improve CodeGen we'll legalize any-extend loads to zext loads. This is + // how they'll be lowered in ISel anyway, and by doing this a little earlier + // we allow for more DAG combine opportunities. + if (LD->getExtensionType() == ISD::EXTLOAD) { + assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() && + "Unexpected fpext-load"); + return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(), + LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(), + LD->getMemOperand()); + } - // Lower store of any other vector type, including v2f32 as we want to break - // it apart since this is not a widely-supported type. 
- return LowerSTOREVector(Op, DAG); + llvm_unreachable("Unexpected custom lowering for load"); } -SDValue -NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { +static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, + const NVPTXSubtarget &STI) { MemSDNode *N = cast<MemSDNode>(Op.getNode()); SDValue Val = N->getOperand(1); SDLoc DL(N); @@ -3253,6 +3339,18 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { return NewSt; } +SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + EVT VT = Store->getMemoryVT(); + + if (VT == MVT::i1) + return LowerSTOREi1(Op, DAG); + + // Lower store of any other vector type, including v2f32 as we want to break + // it apart since this is not a widely-supported type. + return lowerSTOREVector(Op, DAG, STI); +} + // st i1 v, addr // => // v1 = zxt v to i16 @@ -4010,14 +4108,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { - auto &DL = I.getDataLayout(); Info.opc = ISD::INTRINSIC_W_CHAIN; - if (Intrinsic == Intrinsic::nvvm_ldu_global_i) - Info.memVT = getValueType(DL, I.getType()); - else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) - Info.memVT = getPointerTy(DL); - else - Info.memVT = getValueType(DL, I.getType()); + Info.memVT = getValueType(I.getDataLayout(), I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; @@ -5152,11 +5244,34 @@ static SDValue combinePackingMovIntoStore(SDNode *N, ST->getMemoryVT(), ST->getMemOperand()); } -static SDValue PerformStoreCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &STI) { + + if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) { + // Here is our chance to custom lower a store with a 
non-simple type. + // Unfortunately, we can't do this in the legalizer because there is no + // way to setOperationAction for an non-simple type. + StoreSDNode *ST = cast<StoreSDNode>(N); + if (!ST->getValue().getValueType().isSimple()) + return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI); + } + return combinePackingMovIntoStore(N, DCI, 1, 2); } +static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &STI) { + if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) { + // Here is our chance to custom lower a load with a non-simple type. + // Unfortunately, we can't do this in the legalizer because there is no + // way to setOperationAction for an non-simple type. + if (!N->getValueType(0).isSimple()) + return lowerLoadVector(N, DCI.DAG, STI); + } + + return combineUnpackingMovIntoLoad(N, DCI); +} + /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. /// static SDValue PerformADDCombine(SDNode *N, @@ -5884,7 +5999,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::LOAD: case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: - return combineUnpackingMovIntoLoad(N, DCI); + return combineLOAD(N, DCI, STI); case ISD::MUL: return PerformMULCombine(N, DCI, OptLevel); case NVPTXISD::PRMT: @@ -5901,7 +6016,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::STORE: case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: - return PerformStoreCombine(N, DCI); + return combineSTORE(N, DCI, STI); case ISD::VSELECT: return PerformVSELECTCombine(N, DCI); } @@ -5930,103 +6045,6 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1})); } -/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. 
-static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &Results, - const NVPTXSubtarget &STI) { - LoadSDNode *LD = cast<LoadSDNode>(N); - const EVT ResVT = LD->getValueType(0); - const EVT MemVT = LD->getMemoryVT(); - - // If we're doing sign/zero extension as part of the load, avoid lowering to - // a LoadV node. TODO: consider relaxing this restriction. - if (ResVT != MemVT) - return; - - const auto NumEltsAndEltVT = - getVectorLoweringShape(ResVT, STI, LD->getAddressSpace()); - if (!NumEltsAndEltVT) - return; - const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); - - Align Alignment = LD->getAlign(); - const auto &TD = DAG.getDataLayout(); - Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext())); - if (Alignment < PrefAlign) { - // This load is not sufficiently aligned, so bail out and let this vector - // load be scalarized. Note that we may still be able to emit smaller - // vector loads. For example, if we are loading a <4 x float> with an - // alignment of 8, this check will fail but the legalizer will try again - // with 2 x <2 x float>, which will succeed with an alignment of 8. - return; - } - - // Since LoadV2 is a target node, we cannot rely on DAG type legalization. - // Therefore, we must ensure the type is legal. For i1 and i8, we set the - // loaded type to i16 and propagate the "real" type as the memory type. - const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? 
MVT::i16 : EltVT; - - unsigned Opcode; - switch (NumElts) { - default: - return; - case 2: - Opcode = NVPTXISD::LoadV2; - break; - case 4: - Opcode = NVPTXISD::LoadV4; - break; - case 8: - Opcode = NVPTXISD::LoadV8; - break; - } - auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT); - ListVTs.push_back(MVT::Other); - SDVTList LdResVTs = DAG.getVTList(ListVTs); - - SDLoc DL(LD); - - // Copy regular operands - SmallVector<SDValue, 8> OtherOps(LD->ops()); - - // The select routine does not have access to the LoadSDNode instance, so - // pass along the extension information - OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); - - SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, - LD->getMemoryVT(), - LD->getMemOperand()); - - SmallVector<SDValue> ScalarRes; - if (EltVT.isVector()) { - assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType()); - assert(NumElts * EltVT.getVectorNumElements() == - ResVT.getVectorNumElements()); - // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back - // into individual elements. 
- for (const unsigned I : llvm::seq(NumElts)) { - SDValue SubVector = NewLD.getValue(I); - DAG.ExtractVectorElements(SubVector, ScalarRes); - } - } else { - for (const unsigned I : llvm::seq(NumElts)) { - SDValue Res = NewLD.getValue(I); - if (LoadEltVT != EltVT) - Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res); - ScalarRes.push_back(Res); - } - } - - SDValue LoadChain = NewLD.getValue(NumElts); - - const MVT BuildVecVT = - MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size()); - SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes); - SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec); - - Results.append({LoadValue, LoadChain}); -} - // Lower vector return type of tcgen05.ld intrinsics static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results, @@ -6262,6 +6280,49 @@ static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, Results.push_back(Res); } +static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, + const NVPTXSubtarget &STI, + SmallVectorImpl<SDValue> &Results) { + assert(N->getValueType(0) == MVT::i128 && + "Custom lowering for atomic128 only supports i128"); + + AtomicSDNode *AN = cast<AtomicSDNode>(N); + SDLoc dl(N); + + if (!STI.hasAtomSwap128()) { + DAG.getContext()->diagnose(DiagnosticInfoUnsupported( + DAG.getMachineFunction().getFunction(), + "Support for b128 atomics introduced in PTX ISA version 8.3 and " + "requires target sm_90.", + dl.getDebugLoc())); + + Results.push_back(DAG.getUNDEF(MVT::i128)); + Results.push_back(AN->getOperand(0)); // Chain + return; + } + + SmallVector<SDValue, 6> Ops; + Ops.push_back(AN->getOperand(0)); // Chain + Ops.push_back(AN->getOperand(1)); // Ptr + for (const auto &Op : AN->ops().drop_front(2)) { + // Low part + Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op, + DAG.getIntPtrConstant(0, dl))); + // High part + Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op, + DAG.getIntPtrConstant(1, dl))); + } + unsigned Opcode = 
N->getOpcode() == ISD::ATOMIC_SWAP + ? NVPTXISD::ATOMIC_SWAP_B128 + : NVPTXISD::ATOMIC_CMP_SWAP_B128; + SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other); + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128, + AN->getMemOperand()); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128, + {Result.getValue(0), Result.getValue(1)})); + Results.push_back(Result.getValue(2)); +} + void NVPTXTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -6282,6 +6343,10 @@ void NVPTXTargetLowering::ReplaceNodeResults( case NVPTXISD::ProxyReg: replaceProxyReg(N, DAG, *this, Results); return; + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_SWAP: + replaceAtomicSwap128(N, DAG, STI, Results); + return; } } @@ -6306,16 +6371,19 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { } assert(Ty->isIntegerTy() && "Ty should be integer at this point"); - auto ITy = cast<llvm::IntegerType>(Ty); + const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth(); switch (AI->getOperation()) { default: return AtomicExpansionKind::CmpXChg; + case AtomicRMWInst::BinOp::Xchg: + if (BitWidth == 128) + return AtomicExpansionKind::None; + LLVM_FALLTHROUGH; case AtomicRMWInst::BinOp::And: case AtomicRMWInst::BinOp::Or: case AtomicRMWInst::BinOp::Xor: - case AtomicRMWInst::BinOp::Xchg: - switch (ITy->getBitWidth()) { + switch (BitWidth) { case 8: case 16: return AtomicExpansionKind::CmpXChg; @@ -6325,6 +6393,8 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (STI.hasAtomBitwise64()) return AtomicExpansionKind::None; return AtomicExpansionKind::CmpXChg; + case 128: + return AtomicExpansionKind::CmpXChg; default: llvm_unreachable("unsupported width encountered"); } @@ -6334,7 +6404,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::BinOp::Min: case AtomicRMWInst::BinOp::UMax: 
case AtomicRMWInst::BinOp::UMin: - switch (ITy->getBitWidth()) { + switch (BitWidth) { case 8: case 16: return AtomicExpansionKind::CmpXChg; @@ -6344,17 +6414,20 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (STI.hasAtomMinMax64()) return AtomicExpansionKind::None; return AtomicExpansionKind::CmpXChg; + case 128: + return AtomicExpansionKind::CmpXChg; default: llvm_unreachable("unsupported width encountered"); } case AtomicRMWInst::BinOp::UIncWrap: case AtomicRMWInst::BinOp::UDecWrap: - switch (ITy->getBitWidth()) { + switch (BitWidth) { case 32: return AtomicExpansionKind::None; case 8: case 16: case 64: + case 128: return AtomicExpansionKind::CmpXChg; default: llvm_unreachable("unsupported width encountered"); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 27f099e22097..03b3edc902e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -81,7 +81,17 @@ enum NodeType : unsigned { CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z, FIRST_MEMORY_OPCODE, - LoadV2 = FIRST_MEMORY_OPCODE, + + /// These nodes are used to lower atomic instructions with i128 type. They are + /// similar to the generic nodes, but the input and output values are split + /// into two 64-bit values. 
+ /// ValLo, ValHi, OUTCHAIN = ATOMIC_CMP_SWAP_B128(INCHAIN, ptr, cmpLo, cmpHi, + /// swapLo, swapHi) + /// ValLo, ValHi, OUTCHAIN = ATOMIC_SWAP_B128(INCHAIN, ptr, amtLo, amtHi) + ATOMIC_CMP_SWAP_B128 = FIRST_MEMORY_OPCODE, + ATOMIC_SWAP_B128, + + LoadV2, LoadV4, LoadV8, LDUV2, // LDU.v2 @@ -309,11 +319,8 @@ private: SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 34fe467c9456..6840c7ae8faf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -12,6 +12,7 @@ #include "NVPTXInstrInfo.h" #include "NVPTX.h" +#include "NVPTXSubtarget.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -24,7 +25,8 @@ using namespace llvm; // Pin the vtable to this file. void NVPTXInstrInfo::anchor() {} -NVPTXInstrInfo::NVPTXInstrInfo() : RegInfo() {} +NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI) + : NVPTXGenInstrInfo(STI), RegInfo() {} void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -190,4 +192,4 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB, BuildMI(&MBB, DL, get(NVPTX::CBranch)).add(Cond[0]).addMBB(TBB); BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB); return 2; -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index 4e9dc9d3b468..23889531431e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -21,12 +21,13 @@ #include "NVPTXGenInstrInfo.inc" namespace llvm { +class NVPTXSubtarget; class NVPTXInstrInfo : public NVPTXGenInstrInfo { const NVPTXRegisterInfo RegInfo; virtual void anchor(); public: - explicit NVPTXInstrInfo(); + explicit NVPTXInstrInfo(const NVPTXSubtarget &STI); const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 7b135098bd4c..4e38e026e6bd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -104,6 +104,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; +def hasAtomSwap128 : Predicate<"Subtarget->hasAtomSwap128()">; def hasClusters : Predicate<"Subtarget->hasClusters()">; def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">; @@ -294,7 +295,7 @@ multiclass ADD_SUB_INT_CARRY<string op_str, SDNode op_node, bit commutative> { // // Also defines ftz (flush subnormal inputs and results to sign-preserving // zero) variants for fp32 functions. 
-multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> { +multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDPatternOperator OpNode> { defvar nan_str = !if(NaN, ".NaN", ""); if !not(NaN) then { def _f64_rr : @@ -898,10 +899,8 @@ let Predicates = [hasOptEnabled] in { defm MAD_LO_S32 : MADInst<"lo.s32", mul, I32RT, I32RT>; defm MAD_LO_S64 : MADInst<"lo.s64", mul, I64RT, I64RT>; - defm MAD_WIDE_U16 : MADInst<"wide.u16", umul_wide, I32RT, I16RT>; - defm MAD_WIDE_S16 : MADInst<"wide.s16", smul_wide, I32RT, I16RT>; - defm MAD_WIDE_U32 : MADInst<"wide.u32", umul_wide, I64RT, I32RT>; - defm MAD_WIDE_S32 : MADInst<"wide.s32", smul_wide, I64RT, I32RT>; + // Generating mad.wide causes a regression: + // https://github.com/llvm/llvm-project/pull/150477#issuecomment-3191367837 } //----------------------------------- @@ -912,8 +911,15 @@ defm FADD : F3_fma_component<"add", fadd>; defm FSUB : F3_fma_component<"sub", fsub>; defm FMUL : F3_fma_component<"mul", fmul>; -defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>; -defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>; +def fminnum_or_fminimumnum : PatFrags<(ops node:$a, node:$b), + [(fminnum node:$a, node:$b), + (fminimumnum node:$a, node:$b)]>; +def fmaxnum_or_fmaximumnum : PatFrags<(ops node:$a, node:$b), + [(fmaxnum node:$a, node:$b), + (fmaximumnum node:$a, node:$b)]>; + +defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum_or_fminimumnum>; +defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum_or_fmaximumnum>; defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>; defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 4ab30a5b5f5e..c544911bdf1e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1990,19 +1990,23 @@ multiclass F_ATOMIC_3<RegTyInfo t, string op_str, SDPatternOperator op, SDNode a let mayLoad = 
1, mayStore = 1, hasSideEffects = 1 in { def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + (ins ADDR:$addr, t.RC:$b, t.RC:$c), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str>; def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str>; def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str>; def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str>; } @@ -2200,6 +2204,37 @@ defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; +// atom.*.b128 + +let mayLoad = true, mayStore = true, hasSideEffects = true, + Predicates = [hasAtomSwap128] in { + def ATOM_CAS_B128 : + NVPTXInst< + (outs B64:$dst0, B64:$dst1), + (ins ADDR:$addr, B64:$cmp0, B64:$cmp1, B64:$swap0, B64:$swap1, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + "{{\n\t" + ".reg .b128 cmp, swap, dst;\n\t" + "mov.b128 cmp, {$cmp0, $cmp1};\n\t" + "mov.b128 swap, {$swap0, $swap1};\n\t" + "atom${sem:sem}${scope:scope}${addsp:addsp}.cas.b128 dst, [$addr], cmp, swap;\n\t" + "mov.b128 {$dst0, $dst1}, dst;\n\t" + "}}">; + + def ATOM_EXCH_B128 : + NVPTXInst< + (outs B64:$dst0, B64:$dst1), + (ins ADDR:$addr, B64:$amt0, B64:$amt1, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + "{{\n\t" 
+ ".reg .b128 amt, dst;\n\t" + "mov.b128 amt, {$amt0, $amt1};\n\t" + "atom${sem:sem}${scope:scope}${addsp:addsp}.exch.b128 dst, [$addr], amt;\n\t" + "mov.b128 {$dst0, $dst1}, dst;\n\t" + "}}">; +} + + //----------------------------------- // Support for ldu on sm_20 or later //----------------------------------- @@ -4358,10 +4393,12 @@ let hasSideEffects = 1 in { def SREG_CLOCK : PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>; def SREG_CLOCK64 : PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>; def SREG_GLOBALTIMER : PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>; + def SREG_GLOBALTIMER_LO : PTX_READ_SREG_R32<"globaltimer_lo", int_nvvm_read_ptx_sreg_globaltimer_lo>; } def: Pat <(i64 (readcyclecounter)), (SREG_CLOCK64)>; def: Pat <(i64 (readsteadycounter)), (SREG_GLOBALTIMER)>; +def: Pat <(i32 (readsteadycounter)), (SREG_GLOBALTIMER_LO)>; def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>; def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index a84ceaba991c..c5489670bd24 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -62,7 +62,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU, const NVPTXTargetMachine &TM) : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0), FullSmVersion(200), SmVersion(getSmVersion()), - TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) { + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) { TSInfo = std::make_unique<NVPTXSelectionDAGInfo>(); } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index acf025b70ce3..0a77a633cb25 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -82,6 +82,7 @@ public: bool hasAtomBitwise64() const { return 
SmVersion >= 32; } bool hasAtomMinMax64() const { return SmVersion >= 32; } bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } + bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; } bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; } bool hasLDG() const { return SmVersion >= 32; } bool hasHWROT32() const { return SmVersion >= 32; } @@ -105,6 +106,7 @@ public: // Tcgen05 instructions in Blackwell family bool hasTcgen05Instructions() const { bool HasTcgen05 = false; + unsigned MinPTXVersion = 86; switch (FullSmVersion) { default: break; @@ -112,9 +114,13 @@ public: case 1013: // sm_101a HasTcgen05 = true; break; + case 1033: // sm_103a + HasTcgen05 = true; + MinPTXVersion = 88; + break; } - return HasTcgen05 && PTXVersion >= 86; + return HasTcgen05 && PTXVersion >= MinPTXVersion; } // f32x2 instructions in Blackwell family bool hasF32x2Instructions() const; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 0603994606d7..833f014a4c87 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -126,12 +126,12 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { // (addrspace:3). 
if (!is64Bit) Ret += "-p:32:32-p6:32:32-p7:32:32"; - else if (UseShortPointers) { + else if (UseShortPointers) Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32"; - } else + else Ret += "-p6:32:32"; - Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; + Ret += "-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"; return Ret; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 274b04fdd30b..8e97b422218f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -55,15 +55,6 @@ void clearAnnotationCache(const Module *Mod) { AC.Cache.erase(Mod); } -static void readIntVecFromMDNode(const MDNode *MetadataNode, - std::vector<unsigned> &Vec) { - for (unsigned i = 0, e = MetadataNode->getNumOperands(); i != e; ++i) { - ConstantInt *Val = - mdconst::extract<ConstantInt>(MetadataNode->getOperand(i)); - Vec.push_back(Val->getZExtValue()); - } -} - static void cacheAnnotationFromMD(const MDNode *MetadataNode, key_val_pair_t &retval) { auto &AC = getAnnotationCache(); @@ -83,19 +74,8 @@ static void cacheAnnotationFromMD(const MDNode *MetadataNode, if (ConstantInt *Val = mdconst::dyn_extract<ConstantInt>( MetadataNode->getOperand(i + 1))) { retval[Key].push_back(Val->getZExtValue()); - } else if (MDNode *VecMd = - dyn_cast<MDNode>(MetadataNode->getOperand(i + 1))) { - // note: only "grid_constant" annotations support vector MDNodes. - // assert: there can only exist one unique key value pair of - // the form (string key, MDNode node). Operands of such a node - // shall always be unsigned ints. 
- auto [It, Inserted] = retval.try_emplace(Key); - if (Inserted) { - readIntVecFromMDNode(VecMd, It->second); - continue; - } } else { - llvm_unreachable("Value operand not a constant int or an mdnode"); + llvm_unreachable("Value operand not a constant int"); } } } @@ -179,16 +159,13 @@ static bool globalHasNVVMAnnotation(const Value &V, const std::string &Prop) { } static bool argHasNVVMAnnotation(const Value &Val, - const std::string &Annotation, - const bool StartArgIndexAtOne = false) { + const std::string &Annotation) { if (const Argument *Arg = dyn_cast<Argument>(&Val)) { const Function *Func = Arg->getParent(); std::vector<unsigned> Annot; if (findAllNVVMAnnotation(Func, Annotation, Annot)) { - const unsigned BaseOffset = StartArgIndexAtOne ? 1 : 0; - if (is_contained(Annot, BaseOffset + Arg->getArgNo())) { + if (is_contained(Annot, Arg->getArgNo())) return true; - } } } return false; @@ -250,8 +227,7 @@ bool isParamGridConstant(const Argument &Arg) { } // "grid_constant" counts argument indices starting from 1 - if (argHasNVVMAnnotation(Arg, "grid_constant", - /*StartArgIndexAtOne*/ true)) + if (Arg.hasAttribute("nvvm.grid_constant")) return true; return false; diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 1e39f01fd7aa..2182039e0eef 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -49,7 +49,7 @@ add_llvm_target(PowerPCCodeGen PPCTargetTransformInfo.cpp PPCTOCRegDeps.cpp PPCTLSDynamicCall.cpp - PPCVSXCopy.cpp + PPCVSXWACCCopy.cpp PPCReduceCRLogicals.cpp PPCVSXFMAMutate.cpp PPCVSXSwapRemoval.cpp diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 124dac458431..a7cd5cde16b4 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -39,7 +39,7 @@ class ModulePass; FunctionPass *createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM); FunctionPass *createPPCTOCRegDepsPass(); FunctionPass 
*createPPCEarlyReturnPass(); - FunctionPass *createPPCVSXCopyPass(); + FunctionPass *createPPCVSXWACCCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); FunctionPass *createPPCVSXSwapRemovalPass(); FunctionPass *createPPCReduceCRLogicalsPass(); @@ -64,7 +64,7 @@ class ModulePass; void initializePPCLoopInstrFormPrepPass(PassRegistry&); void initializePPCTOCRegDepsPass(PassRegistry&); void initializePPCEarlyReturnPass(PassRegistry&); - void initializePPCVSXCopyPass(PassRegistry&); + void initializePPCVSXWACCCopyPass(PassRegistry &); void initializePPCVSXFMAMutatePass(PassRegistry&); void initializePPCVSXSwapRemovalPass(PassRegistry&); void initializePPCReduceCRLogicalsPass(PassRegistry&); diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 2ab2c147be0e..023fd147535e 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -920,10 +920,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { case TargetOpcode::PATCHABLE_FUNCTION_ENTER: { assert(!Subtarget->isAIXABI() && "AIX does not support patchable function entry!"); - // PATCHABLE_FUNCTION_ENTER on little endian is for XRAY support which is - // handled in PPCLinuxAsmPrinter. - if (MAI->isLittleEndian()) - return; const Function &F = MF->getFunction(); unsigned Num = 0; (void)F.getFnAttribute("patchable-function-entry") @@ -1789,7 +1785,13 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) { // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number // of instructions change. // XRAY is only supported on PPC Linux little endian. 
- if (!MAI->isLittleEndian()) + const Function &F = MF->getFunction(); + unsigned Num = 0; + (void)F.getFnAttribute("patchable-function-entry") + .getValueAsString() + .getAsInteger(10, Num); + + if (!MAI->isLittleEndian() || Num) break; MCSymbol *BeginOfSled = OutContext.createTempSymbol(); MCSymbol *EndOfSled = OutContext.createTempSymbol(); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 7022e9e9dae9..fa104e4f69d7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1693,6 +1693,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::XXPERM: return "PPCISD::XXPERM"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; + case PPCISD::VSRQ: + return "PPCISD::VSRQ"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; @@ -2696,7 +2698,7 @@ bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) { if (!isa<ConstantSDNode>(N)) return false; - Imm = (int64_t)cast<ConstantSDNode>(N)->getSExtValue(); + Imm = cast<ConstantSDNode>(N)->getSExtValue(); return isInt<34>(Imm); } bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) { @@ -11274,6 +11276,24 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMergeValues(RetOps, dl); } + case Intrinsic::ppc_mma_build_dmr: { + SmallVector<SDValue, 8> Pairs; + SmallVector<SDValue, 8> Chains; + for (int i = 1; i < 9; i += 2) { + SDValue Hi = Op.getOperand(i); + SDValue Lo = Op.getOperand(i + 1); + if (Hi->getOpcode() == ISD::LOAD) + Chains.push_back(Hi.getValue(1)); + if (Lo->getOpcode() == ISD::LOAD) + Chains.push_back(Lo.getValue(1)); + Pairs.push_back( + DAG.getNode(PPCISD::PAIR_BUILD, dl, MVT::v256i1, {Hi, Lo})); + } + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + SDValue Value = DMFInsert1024(Pairs, SDLoc(Op), DAG); + return DAG.getMergeValues({Value, TF}, dl); + } + 
case Intrinsic::ppc_mma_dmxxextfdmr512: { assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future"); auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); @@ -11610,6 +11630,10 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(0)), 0); } + case Intrinsic::ppc_mma_disassemble_dmr: { + return DAG.getStore(DAG.getEntryNode(), DL, Op.getOperand(ArgStart + 2), + Op.getOperand(ArgStart + 1), MachinePointerInfo()); + } default: break; } @@ -12099,6 +12123,24 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op, return DAG.getMergeValues({DmrPValue, TF}, dl); } +SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs, + const SDLoc &dl, + SelectionDAG &DAG) const { + SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTDMR512, dl, MVT::v512i1, Pairs[0], + Pairs[1]), + 0); + SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32); + SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTDMR512_HI, dl, MVT::v512i1, + Pairs[2], Pairs[3]), + 0); + SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32); + SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32); + + return SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, + {RC, Lo, LoSub, Hi, HiSub}), + 0); +} + SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 559d58309692..669430550f4e 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -498,6 +498,9 @@ namespace llvm { /// SETBCR - The ISA 3.1 (P10) SETBCR instruction. SETBCR, + /// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction + VSRQ, + // NOTE: The nodes below may require PC-Rel specific patterns if the // address could be PC-Relative. 
When adding new nodes below, consider // whether or not the address can be PC-Relative and add the corresponding @@ -1345,6 +1348,8 @@ namespace llvm { SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const; + SDValue DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs, + const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index fd2084398c85..269d30318bca 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1095,8 +1095,7 @@ let hasSideEffects = 0 in { defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$RA), (ins g8rc:$RAi, g8rc:$RS, u6imm:$SH, u6imm:$MBE), "rldimi", "$RA, $RS, $SH, $MBE", IIC_IntRotateDI, - []>, isPPC64, RegConstraint<"$RAi = $RA">, - NoEncode<"$RAi">; + []>, isPPC64, RegConstraint<"$RAi = $RA">; // Rotate instructions. defm RLDCL : MDSForm_1r<30, 8, @@ -1156,7 +1155,7 @@ defm RLWIMI8 : MForm_2r<20, (outs g8rc:$RA), (ins g8rc:$RAi, g8rc:$RS, u5imm:$SH, u5imm:$MB, u5imm:$ME), "rlwimi", "$RA, $RS, $SH, $MB, $ME", IIC_IntRotate, []>, PPC970_DGroup_Cracked, - RegConstraint<"$RAi = $RA">, NoEncode<"$RAi">; + RegConstraint<"$RAi = $RA">; let isSelect = 1 in def ISEL8 : AForm_4<31, 15, @@ -1313,21 +1312,18 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in def LHAU8 : DForm_1<43, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lhau $RST, $addr", IIC_LdStLHAU, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; // NO LWAU! 
let Interpretation64Bit = 1, isCodeGenOnly = 1 in def LHAUX8 : XForm_1_memOp<31, 375, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lhaux $RST, $addr", IIC_LdStLHAUX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; def LWAUX : XForm_1_memOp<31, 373, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lwaux $RST, $addr", IIC_LdStLHAUX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">, isPPC64; + []>, RegConstraint<"$addr.ptrreg = $ea_result">, isPPC64; } } @@ -1366,34 +1362,28 @@ let mayLoad = 1, hasSideEffects = 0 in { def LBZU8 : DForm_1<35, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lbzu $RST, $addr", IIC_LdStLoadUpd, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; def LHZU8 : DForm_1<41, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lhzu $RST, $addr", IIC_LdStLoadUpd, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; def LWZU8 : DForm_1<33, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lwzu $RST, $addr", IIC_LdStLoadUpd, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; def LBZUX8 : XForm_1_memOp<31, 119, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lbzux $RST, $addr", IIC_LdStLoadUpdX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; def LHZUX8 : XForm_1_memOp<31, 311, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lhzux $RST, $addr", IIC_LdStLoadUpdX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; def 
LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lwzux $RST, $addr", IIC_LdStLoadUpdX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; } } } // Interpretation64Bit @@ -1445,14 +1435,12 @@ let mayLoad = 1, hasSideEffects = 0 in { def LDU : DSForm_1<58, 1, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrix $D, $RA):$addr), "ldu $RST, $addr", IIC_LdStLDU, - []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64; def LDUX : XForm_1_memOp<31, 53, (outs g8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "ldux $RST, $addr", IIC_LdStLDUX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">, isPPC64; + []>, RegConstraint<"$addr.ptrreg = $ea_result">, isPPC64; } let mayLoad = 1, hasNoSchedulingInfo = 1 in { @@ -1718,45 +1706,41 @@ let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr), "stbu $RST, $addr", IIC_LdStSTU, []>, - RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">; + RegConstraint<"$addr.reg = $ea_res">; def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr), "sthu $RST, $addr", IIC_LdStSTU, []>, - RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">; + RegConstraint<"$addr.reg = $ea_res">; def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr), "stwu $RST, $addr", IIC_LdStSTU, []>, - RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">; + RegConstraint<"$addr.reg = $ea_res">; def STBUX8: XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memrr $RA, $RB):$addr), "stbux $RST, $addr", IIC_LdStSTUX, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, 
PPC970_DGroup_Cracked; def STHUX8: XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memrr $RA, $RB):$addr), "sthux $RST, $addr", IIC_LdStSTUX, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, PPC970_DGroup_Cracked; def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memrr $RA, $RB):$addr), "stwux $RST, $addr", IIC_LdStSTUX, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, PPC970_DGroup_Cracked; } // Interpretation64Bit def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memrix $D, $RA):$addr), "stdu $RST, $addr", IIC_LdStSTU, []>, - RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">, + RegConstraint<"$addr.reg = $ea_res">, isPPC64; def STDUX : XForm_8_memOp<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memrr $RA, $RB):$addr), "stdux $RST, $addr", IIC_LdStSTUX, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, PPC970_DGroup_Cracked, isPPC64; } @@ -2000,7 +1984,7 @@ def : Pat<(int_ppc_darnraw), (DARN 2)>; class X_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty, InstrItinClass itin, list<dag> pattern> - : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$RA, ty:$RB, u1imm:$L), + : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$RA, ty:$RB), !strconcat(opc, " $RA, $RB"), itin, pattern>{ let L = 1; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 79fe12e8e4b4..97d5e2896323 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -261,6 +261,13 @@ def immEQOneV : PatLeaf<(build_vector), [{ return C->isOne(); return false; }]>; + +def VSRVSRO : PatFrag<(ops node:$input, node:$shift), + (int_ppc_altivec_vsr + (int_ppc_altivec_vsro node:$input, node:$shift), + node:$shift), + [{ return N->getOperand(1).hasOneUse(); }]>; + //===----------------------------------------------------------------------===// // Helpers for 
defining instructions that directly correspond to intrinsics. @@ -1471,13 +1478,13 @@ def VINSERTB : VXForm_1<781, (outs vrrc:$VD), "vinsertb $VD, $VB, $VA", IIC_VecGeneral, [(set v16i8:$VD, (PPCvecinsert v16i8:$VDi, v16i8:$VB, imm32SExt16:$VA))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; def VINSERTH : VXForm_1<845, (outs vrrc:$VD), (ins vrrc:$VDi, u4imm:$VA, vrrc:$VB), "vinserth $VD, $VB, $VA", IIC_VecGeneral, [(set v8i16:$VD, (PPCvecinsert v8i16:$VDi, v8i16:$VB, imm32SExt16:$VA))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>; def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>; @@ -1569,7 +1576,7 @@ def VRLWMI : VXForm_1<133, (outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, vrrc:$VDi), [(set v4i32:$VD, (int_ppc_altivec_vrlwmi v4i32:$VA, v4i32:$VB, v4i32:$VDi))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; def VRLDNM : VX1_VT5_VA5_VB5<453, "vrldnm", [(set v2i64:$VD, (int_ppc_altivec_vrldnm v2i64:$VA, @@ -1579,7 +1586,7 @@ def VRLDMI : VXForm_1<197, (outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, vrrc:$VDi), [(set v2i64:$VD, (int_ppc_altivec_vrldmi v2i64:$VA, v2i64:$VB, v2i64:$VDi))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; // Vector Shift Left/Right def VSLV : VX1_VT5_VA5_VB5<1860, "vslv", diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td index b4b475b470a5..fba1c6609dba 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -18,7 +18,7 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin> bit PPC64 = 0; // Default value, override with isPPC64 let Namespace = "PPC"; - let Inst{0-5} = opcode; + let Inst{0...5} = opcode; let OutOperandList = OOL; let InOperandList = IOL; let AsmString = asmstr; @@ -34,7 +34,7 @@ class I<bits<6> opcode, dag OOL, 
dag IOL, string asmstr, InstrItinClass itin> let TSFlags{0} = PPC970_First; let TSFlags{1} = PPC970_Single; let TSFlags{2} = PPC970_Cracked; - let TSFlags{5-3} = PPC970_Unit; + let TSFlags{5...3} = PPC970_Unit; // Indicate that this instruction is of type X-Form Load or Store bits<1> XFormMemOp = 0; @@ -99,8 +99,8 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr, bit PPC64 = 0; // Default value, override with isPPC64 let Namespace = "PPC"; - let Inst{0-5} = opcode1; - let Inst{32-37} = opcode2; + let Inst{0...5} = opcode1; + let Inst{32...37} = opcode2; let OutOperandList = OOL; let InOperandList = IOL; let AsmString = asmstr; @@ -116,7 +116,7 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr, let TSFlags{0} = PPC970_First; let TSFlags{1} = PPC970_Single; let TSFlags{2} = PPC970_Cracked; - let TSFlags{5-3} = PPC970_Unit; + let TSFlags{5...3} = PPC970_Unit; // Fields used for relation models. string BaseName = ""; @@ -135,7 +135,7 @@ class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr, let Pattern = pattern; bits<24> LI; - let Inst{6-29} = LI; + let Inst{6...29} = LI; let Inst{30} = aa; let Inst{31} = lk; } @@ -148,12 +148,12 @@ class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr> bits<14> BD; bits<5> BI; - let BI{0-1} = BIBO{5-6}; - let BI{2-4} = CR{0-2}; + let BI{0...1} = BIBO{5...6}; + let BI{2...4} = CR{0...2}; - let Inst{6-10} = BIBO{4-0}; - let Inst{11-15} = BI; - let Inst{16-29} = BD; + let Inst{6...10} = BIBO{4...0}; + let Inst{11...15} = BI; + let Inst{16...29} = BD; let Inst{30} = aa; let Inst{31} = lk; } @@ -161,8 +161,8 @@ class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr> class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL, string asmstr> : BForm<opcode, aa, lk, OOL, IOL, asmstr> { - let BIBO{4-0} = bo; - let BIBO{6-5} = 0; + let BIBO{4...0} = bo; + let BIBO{6...5} = 0; let CR = 0; } @@ -171,9 +171,9 
@@ class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk, : I<opcode, OOL, IOL, asmstr, IIC_BrB> { bits<14> BD; - let Inst{6-10} = bo; - let Inst{11-15} = bi; - let Inst{16-29} = BD; + let Inst{6...10} = bo; + let Inst{11...15} = bi; + let Inst{16...29} = BD; let Inst{30} = aa; let Inst{31} = lk; } @@ -185,9 +185,9 @@ class BForm_3<bits<6> opcode, bit aa, bit lk, bits<5> BI; bits<14> BD; - let Inst{6-10} = BO; - let Inst{11-15} = BI; - let Inst{16-29} = BD; + let Inst{6...10} = BO; + let Inst{11...15} = BI; + let Inst{16...29} = BD; let Inst{30} = aa; let Inst{31} = lk; } @@ -200,10 +200,10 @@ class BForm_3_at<bits<6> opcode, bit aa, bit lk, bits<5> BI; bits<14> BD; - let Inst{6-8} = BO{4-2}; - let Inst{9-10} = at; - let Inst{11-15} = BI; - let Inst{16-29} = BD; + let Inst{6...8} = BO{4...2}; + let Inst{9...10} = at; + let Inst{11...15} = BI; + let Inst{16...29} = BD; let Inst{30} = aa; let Inst{31} = lk; } @@ -215,9 +215,9 @@ BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk, bits<5> BI; bits<14> BD; - let Inst{6-10} = bo; - let Inst{11-15} = BI; - let Inst{16-29} = BD; + let Inst{6...10} = bo; + let Inst{11...15} = BI; + let Inst{16...29} = BD; let Inst{30} = aa; let Inst{31} = lk; } @@ -231,7 +231,7 @@ class SCForm<bits<6> opcode, bits<1> xo1, bits<1> xo2, let Pattern = pattern; - let Inst{20-26} = LEV; + let Inst{20...26} = LEV; let Inst{30} = xo1; let Inst{31} = xo2; } @@ -246,9 +246,9 @@ class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RST; - let Inst{11-15} = RA; - let Inst{16-31} = D; + let Inst{6...10} = RST; + let Inst{11...15} = RA; + let Inst{16...31} = D; } class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr, @@ -273,9 +273,9 @@ class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RST; - let Inst{11-15} = 0; - let Inst{16-31} = D; + let Inst{6...10} = RST; + let Inst{11...15} = 0; + let Inst{16...31} = D; } class 
DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr, @@ -287,9 +287,9 @@ class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RST; - let Inst{11-15} = RA; - let Inst{16-31} = D; + let Inst{6...10} = RST; + let Inst{11...15} = RA; + let Inst{16...31} = D; } class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr, @@ -321,13 +321,13 @@ class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2, let Pattern = pattern; bits<24> LI; - let Inst{6-29} = LI; + let Inst{6...29} = LI; let Inst{30} = aa; let Inst{31} = lk; - let Inst{38-42} = RST; - let Inst{43-47} = RA; - let Inst{48-63} = D; + let Inst{38...42} = RST; + let Inst{43...47} = RA; + let Inst{48...63} = D; } // This is used to emit BL8+NOP. @@ -349,11 +349,11 @@ class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr, bits<5> RA; bits<16> D; - let Inst{6-8} = BF; + let Inst{6...8} = BF; let Inst{9} = 0; let Inst{10} = L; - let Inst{11-15} = RA; - let Inst{16-31} = D; + let Inst{11...15} = RA; + let Inst{16...31} = D; } class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr, @@ -383,10 +383,10 @@ class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RST; - let Inst{11-15} = RA; - let Inst{16-29} = D; - let Inst{30-31} = xo; + let Inst{6...10} = RST; + let Inst{11...15} = RA; + let Inst{16...29} = D; + let Inst{30...31} = xo; } // ISA V3.0B 1.6.6 DX-Form @@ -398,10 +398,10 @@ class DXForm<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = D{5-1}; // d1 - let Inst{16-25} = D{15-6}; // d0 - let Inst{26-30} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = D{5...1}; // d1 + let Inst{16...25} = D{15...6}; // d0 + let Inst{26...30} = xo; let Inst{31} = D{0}; // d2 } @@ -415,11 +415,11 @@ class DQ_RD6_RS5_DQ12<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, let Pattern = pattern; - let 
Inst{6-10} = XT{4-0}; - let Inst{11-15} = RA; - let Inst{16-27} = DQ; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = RA; + let Inst{16...27} = DQ; let Inst{28} = XT{5}; - let Inst{29-31} = xo; + let Inst{29...31} = xo; } class DQForm_RTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, @@ -431,10 +431,10 @@ class DQForm_RTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, bits<12> DQ; let Pattern = pattern; - let Inst{6-10} = RTp{4-0}; - let Inst{11-15} = RA; - let Inst{16-27} = DQ; - let Inst{28-31} = xo; + let Inst{6...10} = RTp{4...0}; + let Inst{11...15} = RA; + let Inst{16...27} = DQ; + let Inst{28...31} = xo; } // 1.7.6 X-Form @@ -449,10 +449,10 @@ class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asms bit RC = 0; // set by isRecordForm - let Inst{6-10} = RST; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...10} = RST; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -475,7 +475,7 @@ class XForm_tlbilx<bits<10> xo, dag OOL, dag IOL, string asmstr, class XForm_attn<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin> : I<opcode, OOL, IOL, asmstr, itin> { - let Inst{21-30} = xo; + let Inst{21...30} = xo; } // This is the same as XForm_base_r3xo, but the first two operands are swapped @@ -490,10 +490,10 @@ class XForm_base_r3xo_swapped bit RC = 0; // set by isRecordForm - let Inst{6-10} = RST; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...10} = RST; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -528,10 +528,10 @@ class XForm_tlbws<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RST; - let Inst{11-15} = RA; + let Inst{6...10} = RST; + let Inst{11...15} = RA; let Inst{20} = WS; - let Inst{21-30} = xo; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ 
-570,12 +570,12 @@ class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RA; bits<5> RB; - let Inst{6-8} = BF; + let Inst{6...8} = BF; let Inst{9} = 0; let Inst{10} = L; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -587,10 +587,10 @@ class XForm_icbt<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RB; let Inst{6} = 0; - let Inst{7-10} = CT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{7...10} = CT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -600,9 +600,9 @@ class XForm_sr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RS; bits<4> SR; - let Inst{6-10} = RS; - let Inst{12-15} = SR; - let Inst{21-30} = xo; + let Inst{6...10} = RS; + let Inst{12...15} = SR; + let Inst{21...30} = xo; } class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, @@ -610,8 +610,8 @@ class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, : I<opcode, OOL, IOL, asmstr, itin> { bits<5> MO; - let Inst{6-10} = MO; - let Inst{21-30} = xo; + let Inst{6...10} = MO; + let Inst{21...30} = xo; } class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, @@ -620,9 +620,9 @@ class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RS; bits<5> RB; - let Inst{6-10} = RS; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...10} = RS; + let Inst{16...20} = RB; + let Inst{21...30} = xo; } class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, @@ -631,9 +631,9 @@ class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RS; bits<1> L; - let Inst{6-10} = RS; + let Inst{6...10} = RS; let Inst{15} = L; - let Inst{21-30} = xo; + let Inst{21...30} = xo; } class 
XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, @@ -649,11 +649,11 @@ class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RA; bits<5> RB; - let Inst{6-8} = BF; - let Inst{9-10} = 0; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...8} = BF; + let Inst{9...10} = 0; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -673,10 +673,10 @@ class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = FRT; - let Inst{11-15} = FRA; - let Inst{16-20} = FRB; - let Inst{21-30} = xo; + let Inst{6...10} = FRT; + let Inst{11...15} = FRA; + let Inst{16...20} = FRB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -696,11 +696,11 @@ class XForm_20<bits<6> opcode, bits<6> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = FRT; - let Inst{11-15} = FRA; - let Inst{16-20} = FRB; - let Inst{21-24} = tttt; - let Inst{25-30} = xo; + let Inst{6...10} = FRT; + let Inst{11...15} = FRA; + let Inst{16...20} = FRB; + let Inst{21...24} = tttt; + let Inst{25...30} = xo; let Inst{31} = 0; } @@ -708,10 +708,10 @@ class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> : I<opcode, OOL, IOL, asmstr, itin> { let Pattern = pattern; - let Inst{6-10} = 31; - let Inst{11-15} = 0; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...10} = 31; + let Inst{11...15} = 0; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -721,11 +721,11 @@ class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, bits<2> L; let Pattern = pattern; - let Inst{6-8} = 0; - let Inst{9-10} = L; - let Inst{11-15} = 0; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...8} = 0; + let Inst{9...10} = L; + let Inst{11...15} = 0; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } 
@@ -736,12 +736,12 @@ class XForm_IMM2_IMM2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, bits<2> PL; let Pattern = pattern; - let Inst{6-8} = 0; - let Inst{9-10} = L; - let Inst{11-13} = 0; - let Inst{14-15} = PL; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...8} = 0; + let Inst{9...10} = L; + let Inst{11...13} = 0; + let Inst{14...15} = PL; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -752,12 +752,12 @@ class XForm_IMM3_IMM2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, bits<2> SC; let Pattern = pattern; - let Inst{6-7} = 0; - let Inst{8-10} = L; - let Inst{11-13} = 0; - let Inst{14-15} = SC; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...7} = 0; + let Inst{8...10} = L; + let Inst{11...13} = 0; + let Inst{14...15} = SC; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -803,9 +803,9 @@ class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = RST; - let Inst{11-20} = 0; - let Inst{21-30} = xo; + let Inst{6...10} = RST; + let Inst{11...20} = 0; + let Inst{21...30} = xo; let Inst{31} = RC; } class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, @@ -816,9 +816,9 @@ class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = FM; - let Inst{11-20} = 0; - let Inst{21-30} = xo; + let Inst{6...10} = FM; + let Inst{11...20} = 0; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -828,11 +828,11 @@ class XForm_44<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RT; bits<3> BFA; - let Inst{6-10} = RT; - let Inst{11-13} = BFA; - let Inst{14-15} = 0; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...10} = RT; + let Inst{11...13} = BFA; + let Inst{14...15} = 0; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -842,11 +842,11 @@ class XForm_45<bits<6> opcode, bits<10> xo, 
dag OOL, dag IOL, string asmstr, bits<5> RT; bits<2> L; - let Inst{6-10} = RT; - let Inst{11-13} = 0; - let Inst{14-15} = L; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...10} = RT; + let Inst{11...13} = 0; + let Inst{14...15} = L; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -856,11 +856,11 @@ class X_FRT5_XO2_XO3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2, bits<10> xo, : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { let Pattern = pattern; - let Inst{6-10} = RST; - let Inst{11-12} = xo1; - let Inst{13-15} = xo2; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...10} = RST; + let Inst{11...12} = xo1; + let Inst{13...15} = xo2; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -871,11 +871,11 @@ class X_FRT5_XO2_XO3_FRB5_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2, let Pattern = pattern; bits<5> FRB; - let Inst{6-10} = RST; - let Inst{11-12} = xo1; - let Inst{13-15} = xo2; - let Inst{16-20} = FRB; - let Inst{21-30} = xo; + let Inst{6...10} = RST; + let Inst{11...12} = xo1; + let Inst{13...15} = xo2; + let Inst{16...20} = FRB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -886,12 +886,12 @@ class X_FRT5_XO2_XO3_DRM3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2, let Pattern = pattern; bits<3> DRM; - let Inst{6-10} = RST; - let Inst{11-12} = xo1; - let Inst{13-15} = xo2; - let Inst{16-17} = 0; - let Inst{18-20} = DRM; - let Inst{21-30} = xo; + let Inst{6...10} = RST; + let Inst{11...12} = xo1; + let Inst{13...15} = xo2; + let Inst{16...17} = 0; + let Inst{18...20} = DRM; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -902,12 +902,12 @@ class X_FRT5_XO2_XO3_RM2_X10<bits<6> opcode, bits<2> xo1, bits<3> xo2, let Pattern = pattern; bits<2> RM; - let Inst{6-10} = RST; - let Inst{11-12} = xo1; - let Inst{13-15} = xo2; - let Inst{16-18} = 0; - let Inst{19-20} = RM; - let Inst{21-30} = xo; + let Inst{6...10} = RST; + let Inst{11...12} = xo1; + let Inst{13...15} = xo2; + 
let Inst{16...18} = 0; + let Inst{19...20} = RM; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -934,10 +934,10 @@ class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, bit RC = 1; - let Inst{6-9} = 0; + let Inst{6...9} = 0; let Inst{10} = R; - let Inst{11-20} = 0; - let Inst{21-30} = xo; + let Inst{11...20} = 0; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -949,8 +949,8 @@ class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, bit RC = 1; let Inst{6} = A; - let Inst{7-20} = 0; - let Inst{21-30} = xo; + let Inst{7...20} = 0; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -961,10 +961,10 @@ class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{7-9} = 0; + let Inst{7...9} = 0; let Inst{10} = L; - let Inst{11-20} = 0; - let Inst{21-30} = xo; + let Inst{11...20} = 0; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -975,9 +975,9 @@ class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; - let Inst{6-8} = BF; - let Inst{9-20} = 0; - let Inst{21-30} = xo; + let Inst{6...8} = BF; + let Inst{9...20} = 0; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -992,12 +992,12 @@ class X_BF3_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = BF; + let Inst{6...8} = BF; let Inst{9} = 0; let Inst{10} = L; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1011,11 +1011,11 @@ class X_BF3_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = BF; - let Inst{9-10} = 0; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...8} = BF; + let Inst{9...10} = 0; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1035,10 +1035,10 @@ class X_BF3_DCMX7_RS5<bits<6> opcode, 
bits<10> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = BF; - let Inst{9-15} = DCMX; - let Inst{16-20} = VB; - let Inst{21-30} = xo; + let Inst{6...8} = BF; + let Inst{9...15} = DCMX; + let Inst{16...20} = VB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1050,10 +1050,10 @@ class X_RD6_IMM8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-12} = 0; - let Inst{13-20} = IMM8; - let Inst{21-30} = xo; + let Inst{6...10} = XT{4...0}; + let Inst{11...12} = 0; + let Inst{13...20} = IMM8; + let Inst{21...30} = xo; let Inst{31} = XT{5}; } @@ -1092,10 +1092,10 @@ class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = XT{5}; } @@ -1117,10 +1117,10 @@ class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = 0; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = 0; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = XT{5}; } @@ -1133,10 +1133,10 @@ class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-8} = CR; - let Inst{9-15} = 0; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let Inst{6...8} = CR; + let Inst{9...15} = 0; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = 0; } @@ -1150,11 +1150,11 @@ class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-13} = 0; - let Inst{14-15} = D; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let 
Inst{6...10} = XT{4...0}; + let Inst{11...13} = 0; + let Inst{14...15} = D; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = XT{5}; } @@ -1168,10 +1168,10 @@ class XX2_RD6_UIM5_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = UIM5; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = UIM5; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = XT{5}; } @@ -1185,10 +1185,10 @@ class XX2_RD5_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = xo2; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = xo2; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = 0; } @@ -1202,10 +1202,10 @@ class XX2_RD6_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = xo2; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = xo2; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = XT{5}; } @@ -1219,10 +1219,10 @@ class XX2_BF3_DCMX7_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = BF; - let Inst{9-15} = DCMX; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let Inst{6...8} = BF; + let Inst{9...15} = DCMX; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = 0; } @@ -1237,12 +1237,12 @@ class XX2_RD6_DCMX7_RS6<bits<6> opcode, bits<4> xo1, bits<3> xo2, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = DCMX{4-0}; - let Inst{16-20} = XB{4-0}; - let Inst{21-24} = xo1; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = 
DCMX{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21...24} = xo1; let Inst{25} = DCMX{6}; - let Inst{26-28} = xo2; + let Inst{26...28} = xo2; let Inst{29} = DCMX{5}; let Inst{30} = XB{5}; let Inst{31} = XT{5}; @@ -1257,10 +1257,10 @@ class XForm_XD6_RA5_RB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = D{4-0}; // D - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...10} = D{4...0}; // D + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = D{5}; // DX } @@ -1273,11 +1273,11 @@ class XForm_BF3_UIM6_FRB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = BF; + let Inst{6...8} = BF; let Inst{9} = 0; - let Inst{10-15} = UIM; - let Inst{16-20} = FRB; - let Inst{21-30} = xo; + let Inst{10...15} = UIM; + let Inst{16...20} = FRB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1292,11 +1292,11 @@ class XForm_SP2_FRTB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asms bit RC = 0; // set by isRecordForm - let Inst{6 - 10} = FRT; - let Inst{11 - 12} = SP; - let Inst{13 - 15} = 0; - let Inst{16 - 20} = FRB; - let Inst{21 - 30} = xo; + let Inst{6...10} = FRT; + let Inst{11...12} = SP; + let Inst{13...15} = 0; + let Inst{16...20} = FRB; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -1311,11 +1311,11 @@ class XForm_S1_FRTB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, bit RC = 0; // set by isRecordForm - let Inst{6 - 10} = FRT; + let Inst{6...10} = FRT; let Inst{11} = S; - let Inst{12 - 15} = 0; - let Inst{16 - 20} = FRB; - let Inst{21 - 30} = xo; + let Inst{12...15} = 0; + let Inst{16...20} = FRB; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -1328,10 +1328,10 @@ class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = XA{4-0}; - let Inst{16-20} = XB{4-0}; - let Inst{21-28} = xo; + let Inst{6...10} = XT{4...0}; + 
let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21...28} = xo; let Inst{29} = XA{5}; let Inst{30} = XB{5}; let Inst{31} = XT{5}; @@ -1353,11 +1353,11 @@ class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-8} = CR; - let Inst{9-10} = 0; - let Inst{11-15} = XA{4-0}; - let Inst{16-20} = XB{4-0}; - let Inst{21-28} = xo; + let Inst{6...8} = CR; + let Inst{9...10} = 0; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21...28} = xo; let Inst{29} = XA{5}; let Inst{30} = XB{5}; let Inst{31} = 0; @@ -1373,12 +1373,12 @@ class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = XA{4-0}; - let Inst{16-20} = XB{4-0}; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; let Inst{21} = 0; - let Inst{22-23} = D; - let Inst{24-28} = xo; + let Inst{22...23} = D; + let Inst{24...28} = xo; let Inst{29} = XA{5}; let Inst{30} = XB{5}; let Inst{31} = XT{5}; @@ -1395,11 +1395,11 @@ class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = XA{4-0}; - let Inst{16-20} = XB{4-0}; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; let Inst{21} = RC; - let Inst{22-28} = xo; + let Inst{22...28} = xo; let Inst{29} = XA{5}; let Inst{30} = XB{5}; let Inst{31} = XT{5}; @@ -1415,11 +1415,11 @@ class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = XA{4-0}; - let Inst{16-20} = XB{4-0}; - let Inst{21-25} = XC{4-0}; - let Inst{26-27} = xo; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21...25} = XC{4...0}; + let Inst{26...27} = xo; let Inst{28} = XC{5}; let 
Inst{29} = XA{5}; let Inst{30} = XB{5}; @@ -1435,10 +1435,10 @@ class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = immfield; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...10} = immfield; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1451,10 +1451,10 @@ class DCB_Form_hint<bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = TH; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{6...10} = TH; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1469,11 +1469,11 @@ class DSS_Form<bits<1> T, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; let Inst{6} = T; - let Inst{7-8} = 0; - let Inst{9-10} = STRM; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{7...8} = 0; + let Inst{9...10} = STRM; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1487,10 +1487,10 @@ class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = CRD; - let Inst{11-15} = CRA; - let Inst{16-20} = CRB; - let Inst{21-30} = xo; + let Inst{6...10} = CRD; + let Inst{11...15} = CRA; + let Inst{16...20} = CRB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1527,10 +1527,10 @@ class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = CRD; - let Inst{11-15} = CRD; - let Inst{16-20} = CRD; - let Inst{21-30} = xo; + let Inst{6...10} = CRD; + let Inst{11...15} = CRD; + let Inst{16...20} = CRD; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1543,11 +1543,11 @@ class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asm let Pattern = pattern; - let Inst{6-10} = BO; - let Inst{11-15} = 
BI; - let Inst{16-18} = 0; - let Inst{19-20} = BH; - let Inst{21-30} = xo; + let Inst{6...10} = BO; + let Inst{11...15} = BI; + let Inst{16...18} = 0; + let Inst{19...20} = BH; + let Inst{21...30} = xo; let Inst{31} = lk; } @@ -1557,9 +1557,9 @@ class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk, bits<7> BIBO; // 2 bits of BI and 5 bits of BO. bits<3> CR; - let BO = BIBO{4-0}; - let BI{0-1} = BIBO{5-6}; - let BI{2-4} = CR{0-2}; + let BO = BIBO{4...0}; + let BI{0...1} = BIBO{5...6}; + let BI{2...4} = CR{0...2}; let BH = 0; } @@ -1584,12 +1584,12 @@ class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<3> BF; bits<3> BFA; - let Inst{6-8} = BF; - let Inst{9-10} = 0; - let Inst{11-13} = BFA; - let Inst{14-15} = 0; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...8} = BF; + let Inst{9...10} = 0; + let Inst{11...13} = BFA; + let Inst{14...15} = 0; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1602,13 +1602,13 @@ class XLForm_4<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; - let Inst{6-8} = BF; - let Inst{9-10} = 0; - let Inst{11-14} = 0; + let Inst{6...8} = BF; + let Inst{9...10} = 0; + let Inst{11...14} = 0; let Inst{15} = W; - let Inst{16-19} = U; + let Inst{16...19} = U; let Inst{20} = 0; - let Inst{21-30} = xo; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -1619,9 +1619,9 @@ class XLForm_S<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-19} = 0; + let Inst{6...19} = 0; let Inst{20} = S; - let Inst{21-30} = xo; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1640,17 +1640,17 @@ class XLForm_2_and_DSForm_1<bits<6> opcode1, bits<10> xo1, bit lk, let Pattern = pattern; - let Inst{6-10} = BO; - let Inst{11-15} = BI; - let Inst{16-18} = 0; - let Inst{19-20} = BH; - let Inst{21-30} = xo1; + let Inst{6...10} = BO; + let Inst{11...15} = BI; + let Inst{16...18} = 0; + let Inst{19...20} = BH; + let Inst{21...30} = xo1; let 
Inst{31} = lk; - let Inst{38-42} = RST; - let Inst{43-47} = RA; - let Inst{48-61} = D; - let Inst{62-63} = xo2; + let Inst{38...42} = RST; + let Inst{43...47} = RA; + let Inst{48...61} = D; + let Inst{62...63} = xo2; } class XLForm_2_ext_and_DSForm_1<bits<6> opcode1, bits<10> xo1, @@ -1677,16 +1677,16 @@ class XLForm_2_ext_and_DForm_1<bits<6> opcode1, bits<10> xo1, bits<5> bo, let Pattern = pattern; - let Inst{6-10} = bo; - let Inst{11-15} = bi; - let Inst{16-18} = 0; - let Inst{19-20} = 0; // Unused (BH) - let Inst{21-30} = xo1; + let Inst{6...10} = bo; + let Inst{11...15} = bi; + let Inst{16...18} = 0; + let Inst{19...20} = 0; // Unused (BH) + let Inst{21...30} = xo1; let Inst{31} = lk; - let Inst{38-42} = RST; - let Inst{43-47} = RA; - let Inst{48-63} = D; + let Inst{38...42} = RST; + let Inst{43...47} = RA; + let Inst{48...63} = D; } // 1.7.8 XFX-Form @@ -1696,7 +1696,7 @@ class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RST; bits<10> SPR; - let Inst{6-10} = RST; + let Inst{6...10} = RST; let Inst{11} = SPR{4}; let Inst{12} = SPR{3}; let Inst{13} = SPR{2}; @@ -1707,7 +1707,7 @@ class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Inst{18} = SPR{7}; let Inst{19} = SPR{6}; let Inst{20} = SPR{5}; - let Inst{21-30} = xo; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1722,9 +1722,9 @@ class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, : I<opcode, OOL, IOL, asmstr, itin> { bits<5> RT; - let Inst{6-10} = RT; - let Inst{11-20} = 0; - let Inst{21-30} = xo; + let Inst{6...10} = RT; + let Inst{11...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1735,9 +1735,9 @@ class XFXForm_3p<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<10> imm; let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-20} = imm; - let Inst{21-30} = xo; + let Inst{6...10} = RT; + let Inst{11...20} = imm; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1747,11 +1747,11 
@@ class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<8> FXM; bits<5> RST; - let Inst{6-10} = RST; + let Inst{6...10} = RST; let Inst{11} = 0; - let Inst{12-19} = FXM; + let Inst{12...19} = FXM; let Inst{20} = 0; - let Inst{21-30} = xo; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1761,11 +1761,11 @@ class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, bits<5> RST; bits<8> FXM; - let Inst{6-10} = RST; + let Inst{6...10} = RST; let Inst{11} = 1; - let Inst{12-19} = FXM; + let Inst{12...19} = FXM; let Inst{20} = 0; - let Inst{21-30} = xo; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -1782,10 +1782,10 @@ class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; let Inst{6} = 0; - let Inst{7-14} = FM; + let Inst{7...14} = FM; let Inst{15} = 0; - let Inst{16-20} = RT; - let Inst{21-30} = xo; + let Inst{16...20} = RT; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -1801,10 +1801,10 @@ class XFLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; let Inst{6} = L; - let Inst{7-14} = FLM; + let Inst{7...14} = FLM; let Inst{15} = W; - let Inst{16-20} = FRB; - let Inst{21-30} = xo; + let Inst{16...20} = FRB; + let Inst{21...30} = xo; let Inst{31} = RC; } @@ -1819,10 +1819,10 @@ class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm let Pattern = pattern; - let Inst{6-10} = RS; - let Inst{11-15} = RA; - let Inst{16-20} = SH{4,3,2,1,0}; - let Inst{21-29} = xo; + let Inst{6...10} = RS; + let Inst{11...15} = RA; + let Inst{16...20} = SH{4,3,2,1,0}; + let Inst{21...29} = xo; let Inst{30} = SH{5}; let Inst{31} = RC; } @@ -1839,11 +1839,11 @@ class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asms bit RC = 0; // set by isRecordForm - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + 
let Inst{16...20} = RB; let Inst{21} = oe; - let Inst{22-30} = xo; + let Inst{22...30} = xo; let Inst{31} = RC; } @@ -1866,11 +1866,11 @@ class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = FRT; - let Inst{11-15} = FRA; - let Inst{16-20} = FRB; - let Inst{21-25} = FRC; - let Inst{26-30} = xo; + let Inst{6...10} = FRT; + let Inst{11...15} = FRA; + let Inst{16...20} = FRB; + let Inst{21...25} = FRC; + let Inst{26...30} = xo; let Inst{31} = RC; } @@ -1896,11 +1896,11 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-25} = COND; - let Inst{26-30} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...25} = COND; + let Inst{26...30} = xo; let Inst{31} = 0; } @@ -1918,11 +1918,11 @@ class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = RS; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-25} = MB; - let Inst{26-30} = ME; + let Inst{6...10} = RS; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...25} = MB; + let Inst{26...30} = ME; let Inst{31} = RC; } @@ -1939,11 +1939,11 @@ class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = RS; - let Inst{11-15} = RA; - let Inst{16-20} = SH; - let Inst{21-25} = MB; - let Inst{26-30} = ME; + let Inst{6...10} = RS; + let Inst{11...15} = RA; + let Inst{16...20} = SH; + let Inst{21...25} = MB; + let Inst{26...30} = ME; let Inst{31} = RC; } @@ -1960,11 +1960,11 @@ class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = RS; - let Inst{11-15} = RA; - let Inst{16-20} = SH{4,3,2,1,0}; - let Inst{21-26} = MBE{4,3,2,1,0,5}; - let Inst{27-29} = xo; + let Inst{6...10} 
= RS; + let Inst{11...15} = RA; + let Inst{16...20} = SH{4,3,2,1,0}; + let Inst{21...26} = MBE{4,3,2,1,0,5}; + let Inst{27...29} = xo; let Inst{30} = SH{5}; let Inst{31} = RC; } @@ -1981,11 +1981,11 @@ class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = RS; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-26} = MBE{4,3,2,1,0,5}; - let Inst{27-30} = xo; + let Inst{6...10} = RS; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...26} = MBE{4,3,2,1,0,5}; + let Inst{27...30} = xo; let Inst{31} = RC; } @@ -2003,11 +2003,11 @@ class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-25} = RC; - let Inst{26-31} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...25} = RC; + let Inst{26...31} = xo; } // VAForm_1a - DABC ordering. @@ -2021,11 +2021,11 @@ class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-25} = RC; - let Inst{26-31} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...25} = RC; + let Inst{26...31} = xo; } class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr, @@ -2038,12 +2038,12 @@ class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; let Inst{21} = 0; - let Inst{22-25} = SH; - let Inst{26-31} = xo; + let Inst{22...25} = SH; + let Inst{26...31} = xo; } // E-2 VX-Form @@ -2056,10 +2056,10 @@ class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = VA; - let Inst{16-20} = VB; - let 
Inst{21-31} = xo; + let Inst{6...10} = VD; + let Inst{11...15} = VA; + let Inst{16...20} = VB; + let Inst{21...31} = xo; } class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr, @@ -2078,10 +2078,10 @@ class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = 0; - let Inst{16-20} = VB; - let Inst{21-31} = xo; + let Inst{6...10} = VD; + let Inst{11...15} = 0; + let Inst{16...20} = VB; + let Inst{21...31} = xo; } class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr, @@ -2092,10 +2092,10 @@ class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = IMM; - let Inst{16-20} = 0; - let Inst{21-31} = xo; + let Inst{6...10} = VD; + let Inst{11...15} = IMM; + let Inst{16...20} = 0; + let Inst{21...31} = xo; } /// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr. @@ -2106,10 +2106,10 @@ class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = 0; - let Inst{16-20} = 0; - let Inst{21-31} = xo; + let Inst{6...10} = VD; + let Inst{11...15} = 0; + let Inst{16...20} = 0; + let Inst{21...31} = xo; } /// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr. @@ -2120,10 +2120,10 @@ class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = 0; - let Inst{11-15} = 0; - let Inst{16-20} = VB; - let Inst{21-31} = xo; + let Inst{6...10} = 0; + let Inst{11...15} = 0; + let Inst{16...20} = VB; + let Inst{21...31} = xo; } // e.g. 
[PO VRT EO VRB XO] @@ -2135,10 +2135,10 @@ class VXForm_RD5_XO5_RS5<bits<11> xo, bits<5> eo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = eo; - let Inst{16-20} = VB; - let Inst{21-31} = xo; + let Inst{6...10} = VD; + let Inst{11...15} = eo; + let Inst{16...20} = VB; + let Inst{21...31} = xo; } /// VXForm_CR - VX crypto instructions with "VRT, VRA, ST, SIX" @@ -2152,11 +2152,11 @@ class VXForm_CR<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = VA; + let Inst{6...10} = VD; + let Inst{11...15} = VA; let Inst{16} = ST; - let Inst{17-20} = SIX; - let Inst{21-31} = xo; + let Inst{17...20} = SIX; + let Inst{21...31} = xo; } /// VXForm_BX - VX crypto instructions with "VRT, VRA, 0 - like vsbox" @@ -2168,10 +2168,10 @@ class VXForm_BX<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = VA; - let Inst{16-20} = 0; - let Inst{21-31} = xo; + let Inst{6...10} = VD; + let Inst{11...15} = VA; + let Inst{16...20} = 0; + let Inst{21...31} = xo; } // E-4 VXR-Form @@ -2185,11 +2185,11 @@ class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = VA; - let Inst{16-20} = VB; + let Inst{6...10} = VD; + let Inst{11...15} = VA; + let Inst{16...20} = VB; let Inst{21} = RC; - let Inst{22-31} = xo; + let Inst{22...31} = xo; } // VX-Form: [PO VRT EO VRB 1 PS XO] @@ -2203,12 +2203,12 @@ class VX_RD5_EO5_RS5_PS1_XO9<bits<5> eo, bits<9> xo, let Pattern = pattern; - let Inst{6-10} = VD; - let Inst{11-15} = eo; - let Inst{16-20} = VB; + let Inst{6...10} = VD; + let Inst{11...15} = eo; + let Inst{16...20} = VB; let Inst{21} = 1; let Inst{22} = PS; - let Inst{23-31} = xo; + let Inst{23...31} = xo; } // VX-Form: [PO VRT VRA VRB 1 PS XO] or [PO VRT VRA VRB 1 / XO] @@ -2222,12 +2222,12 @@ class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr, let Pattern = 
pattern; - let Inst{6-10} = VD; - let Inst{11-15} = VA; - let Inst{16-20} = VB; + let Inst{6...10} = VD; + let Inst{11...15} = VA; + let Inst{16...20} = VB; let Inst{21} = 1; let Inst{22} = PS; - let Inst{23-31} = xo; + let Inst{23...31} = xo; } class Z22Form_BF3_FRA5_DCM6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, @@ -2240,11 +2240,11 @@ class Z22Form_BF3_FRA5_DCM6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = BF; - let Inst{9-10} = 0; - let Inst{11-15} = FRA; - let Inst{16-21} = DCM; - let Inst{22-30} = xo; + let Inst{6...8} = BF; + let Inst{9...10} = 0; + let Inst{11...15} = FRA; + let Inst{16...21} = DCM; + let Inst{22...30} = xo; let Inst{31} = 0; } @@ -2260,10 +2260,10 @@ class Z22Form_FRTA5_SH6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, bit RC = 0; // set by isRecordForm - let Inst{6 - 10} = FRT; - let Inst{11 - 15} = FRA; - let Inst{16 - 21} = SH; - let Inst{22 - 30} = xo; + let Inst{6...10} = FRT; + let Inst{11...15} = FRA; + let Inst{16...21} = SH; + let Inst{22...30} = xo; let Inst{31} = RC; } @@ -2279,12 +2279,12 @@ class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, bit RC = 0; // set by isRecordForm - let Inst{6-10} = VRT; - let Inst{11-14} = 0; + let Inst{6...10} = VRT; + let Inst{11...14} = 0; let Inst{15} = R; - let Inst{16-20} = VRB; - let Inst{21-22} = idx; - let Inst{23-30} = xo; + let Inst{16...20} = VRB; + let Inst{21...22} = idx; + let Inst{23...30} = xo; let Inst{31} = RC; } @@ -2298,11 +2298,11 @@ class Z23Form_RTAB5_CY2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-22} = CY; - let Inst{23-30} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...22} = CY; + let Inst{23...30} = xo; let Inst{31} = 0; } @@ -2318,11 +2318,11 @@ class Z23Form_FRTAB5_RMC2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, bit RC = 0; // set by 
isRecordForm - let Inst{6 - 10} = FRT; - let Inst{11 - 15} = FRA; - let Inst{16 - 20} = FRB; - let Inst{21 - 22} = RMC; - let Inst{23 - 30} = xo; + let Inst{6...10} = FRT; + let Inst{11...15} = FRA; + let Inst{16...20} = FRB; + let Inst{21...22} = RMC; + let Inst{23...30} = xo; let Inst{31} = RC; } @@ -2345,12 +2345,12 @@ class Z23Form_FRTB5_R1_RMC2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, bit RC = 0; // set by isRecordForm - let Inst{6 - 10} = FRT; - let Inst{11 - 14} = 0; + let Inst{6...10} = FRT; + let Inst{11...14} = 0; let Inst{15} = R; - let Inst{16 - 20} = FRB; - let Inst{21 - 22} = RMC; - let Inst{23 - 30} = xo; + let Inst{16...20} = FRB; + let Inst{21...22} = RMC; + let Inst{23...30} = xo; let Inst{31} = RC; } @@ -2362,7 +2362,7 @@ class PPCEmitTimePseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern> let isCodeGenOnly = 1; let PPC64 = 0; let Pattern = pattern; - let Inst{31-0} = 0; + let Inst{31...0} = 0; let hasNoSchedulingInfo = 1; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 80fac18d5737..a12dfae2a0d7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -13,7 +13,7 @@ class XOForm_RTAB5_L1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, NoItinerary> { + : I<opcode, OOL, IOL, asmstr, NoItinerary> { bits<5> RT; bits<5> RA; bits<5> RB; @@ -21,64 +21,174 @@ class XOForm_RTAB5_L1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, let Pattern = pattern; - bit RC = 0; // set by isRecordForm + bit RC = 0; // set by isRecordForm - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21} = L; - let Inst{22-30} = xo; - let Inst{31} = RC; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21} = L; + let Inst{22...30} = xo; + let Inst{31} = RC; } multiclass XOForm_RTAB5_L1r<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, - 
string asmbase, string asmstr, - list<dag> pattern> { + string asmbase, string asmstr, list<dag> pattern> { let BaseName = asmbase in { def NAME : XOForm_RTAB5_L1<opcode, xo, OOL, IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), - pattern>, RecFormRel; - let Defs = [CR0] in - def _rec : XOForm_RTAB5_L1<opcode, xo, OOL, IOL, - !strconcat(asmbase, !strconcat(". ", asmstr)), - []>, isRecordForm, RecFormRel; + pattern>, + RecFormRel; + let Defs = [CR0] in def _rec + : XOForm_RTAB5_L1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), []>, + isRecordForm, RecFormRel; } } +class VXForm_VRTB5<bits<11> xo, bits<5> R, dag OOL, dag IOL, string asmstr, + list<dag> pattern> : I<4, OOL, IOL, asmstr, NoItinerary> { + bits<5> VRT; + bits<5> VRB; + + let Pattern = pattern; + + let Inst{6...10} = VRT; + let Inst{11...15} = R; + let Inst{16...20} = VRB; + let Inst{21...31} = xo; +} + +class VXForm_VRTB5_UIM2<bits<11> xo, bits<3> R, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : I<4, OOL, IOL, asmstr, NoItinerary> { + bits<5> VRT; + bits<5> VRB; + bits<2> UIM; + + let Pattern = pattern; + + let Inst{6...10} = VRT; + let Inst{11...13} = R; + let Inst{14...15} = UIM; + let Inst{16...20} = VRB; + let Inst{21...31} = xo; +} + +class VXForm_VRTB5_UIM1<bits<11> xo, bits<4> R, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : I<4, OOL, IOL, asmstr, NoItinerary> { + bits<5> VRT; + bits<5> VRB; + bits<1> UIM; + + let Pattern = pattern; + + let Inst{6...10} = VRT; + let Inst{11...14} = R; + let Inst{15} = UIM; + let Inst{16...20} = VRB; + let Inst{21...31} = xo; +} + +class VXForm_VRTB5_UIM3<bits<11> xo, bits<2> R, dag OOL, dag IOL, string asmstr, + list<dag> pattern> + : I<4, OOL, IOL, asmstr, NoItinerary> { + bits<5> VRT; + bits<5> VRB; + bits<3> UIM; + + let Pattern = pattern; + + let Inst{6...10} = VRT; + let Inst{11...12} = R; + let Inst{13...15} = UIM; + let Inst{16...20} = VRB; + let Inst{21...31} = xo; +} + +class VXForm_VRTAB5<bits<11> xo, dag 
OOL, dag IOL, string asmstr, + list<dag> pattern> : I<4, OOL, IOL, asmstr, NoItinerary> { + bits<5> VRT; + bits<5> VRA; + bits<5> VRB; + + let Pattern = pattern; + + let Inst{6...10} = VRT; + let Inst{11...15} = VRA; + let Inst{16...20} = VRB; + let Inst{21...31} = xo; +} + let Predicates = [IsISAFuture] in { defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT), - (ins g8rc:$RA, g8rc:$RB, u1imm:$L), - "subfus", "$RT, $L, $RA, $RB", []>; + (ins g8rc:$RA, g8rc:$RB, u1imm:$L), "subfus", + "$RT, $L, $RA, $RB", []>; } let Predicates = [HasVSX, IsISAFuture] in { let mayLoad = 1 in { - def LXVRL - : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), - "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; - def LXVRLL - : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB), - "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; - def LXVPRL - : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB), - "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>; - def LXVPRLL - : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB), - "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>; + def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvrl $XT, $addr, $RB", IIC_LdStLoad, []>; + def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvrll $XT, $addr, $RB", IIC_LdStLoad, []>; + def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvprl $XTp, $addr, $RB", IIC_LdStLFD, []>; + def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { - def STXVRL - : XX1Form_memOp<31, 653, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB), - "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>; - def STXVRLL - : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB), - "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>; + def STXVRL : XX1Form_memOp<31, 653, (outs), 
+ (ins vsrc:$XT, (memr $RA):$addr, g8rc:$RB), + "stxvrl $XT, $addr, $RB", IIC_LdStLoad, []>; + def STXVRLL : XX1Form_memOp<31, 685, (outs), + (ins vsrc:$XT, (memr $RA):$addr, g8rc:$RB), + "stxvrll $XT, $addr, $RB", IIC_LdStLoad, []>; def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs), - (ins vsrprc:$XTp, memr:$RA, g8rc:$RB), - "stxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>; - def STXVPRLL : XForm_XTp5_XAB5<31, 749, (outs), - (ins vsrprc:$XTp, memr:$RA, g8rc:$RB), - "stxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>; + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvprl $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPRLL + : XForm_XTp5_XAB5<31, 749, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; } + + def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), + "vupkhsntob $VRT, $VRB", []>; + def VUPKLSNTOB : VXForm_VRTB5<387, 1, (outs vrrc:$VRT), (ins vrrc:$VRB), + "vupklsntob $VRT, $VRB", []>; + def VUPKINT4TOBF16 + : VXForm_VRTB5_UIM2<387, 2, (outs vrrc:$VRT), (ins vrrc:$VRB, u2imm:$UIM), + "vupkint4tobf16 $VRT, $VRB, $UIM", []>; + def VUPKINT8TOBF16 + : VXForm_VRTB5_UIM1<387, 1, (outs vrrc:$VRT), (ins vrrc:$VRB, u1imm:$UIM), + "vupkint8tobf16 $VRT, $VRB, $UIM", []>; + def VUPKINT8TOFP32 + : VXForm_VRTB5_UIM2<387, 3, (outs vrrc:$VRT), (ins vrrc:$VRB, u2imm:$UIM), + "vupkint8tofp32 $VRT, $VRB, $UIM", []>; + def VUPKINT4TOFP32 + : VXForm_VRTB5_UIM3<387, 2, (outs vrrc:$VRT), (ins vrrc:$VRB, u3imm:$UIM), + "vupkint4tofp32 $VRT, $VRB, $UIM", []>; + + def VUCMPRHN : VXForm_VRTAB5<3, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), + "vucmprhn $VRT, $VRA, $VRB", []>; + def VUCMPRLN : VXForm_VRTAB5<67, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), + "vucmprln $VRT, $VRA, $VRB", []>; + def VUCMPRHB + : VXForm_VRTAB5<131, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), + "vucmprhb $VRT, $VRA, $VRB", []>; + def VUCMPRLB + : VXForm_VRTAB5<195, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), + "vucmprlb $VRT, $VRA, 
$VRB", []>; + def VUCMPRHH + : VXForm_VRTAB5<259, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), + "vucmprhh $VRT, $VRA, $VRB", []>; + def VUCMPRLH + : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), + "vucmprlh $VRT, $VRA, $VRB", []>; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td index ef8b27f9b8d3..884895793752 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td @@ -8,12 +8,13 @@ //===----------------------------------------------------------------------===// // // This file describes the instructions introduced for the Future CPU for MMA. +// Please reference "PPCInstrVSX.td" for file structure. // //===----------------------------------------------------------------------===// class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, NoItinerary> { + : I<opcode, OOL, IOL, asmstr, NoItinerary> { bits<3> AT; bits<5> XAp; bits<5> XBp; @@ -21,13 +22,13 @@ class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = AT{2-0}; - let Inst{9-10} = 0; - let Inst{11-14} = XAp{3-0}; + let Inst{6...8} = AT{2...0}; + let Inst{9...10} = 0; + let Inst{11...14} = XAp{3...0}; let Inst{15} = P; - let Inst{16-19} = XBp{3-0}; + let Inst{16...19} = XBp{3...0}; let Inst{20} = 0; - let Inst{21-28} = xo; + let Inst{21...28} = xo; let Inst{29} = XAp{4}; let Inst{30} = XBp{4}; let Inst{31} = 0; @@ -35,65 +36,64 @@ class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, class XX2Form_AT3_XBp5_P2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, NoItinerary> { + : I<opcode, OOL, IOL, asmstr, NoItinerary> { bits<3> AT; bits<5> XBp; bits<2> P; let Pattern = pattern; - let Inst{6-8} = AT{2-0}; - let Inst{9-14} = 0; + let Inst{6...8} = AT{2...0}; + let 
Inst{9...14} = 0; let Inst{15} = P{0}; - let Inst{16-19} = XBp{3-0}; + let Inst{16...19} = XBp{3...0}; let Inst{20} = P{1}; - let Inst{21-29} = xo; + let Inst{21...29} = xo; let Inst{30} = XBp{4}; let Inst{31} = 0; } class XForm_ATB3<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> - : I <opcode, OOL, IOL, asmstr, NoItinerary> { + : I<opcode, OOL, IOL, asmstr, NoItinerary> { bits<3> AT; bits<3> AB; let Pattern = pattern; - let Inst{6-8} = AT{2-0}; - let Inst{9-10} = 0; - let Inst{11-15} = o; - let Inst{16-18} = AB{2-0}; - let Inst{19-20} = 0; - let Inst{21-30} = xo; + let Inst{6...8} = AT{2...0}; + let Inst{9...10} = 0; + let Inst{11...15} = o; + let Inst{16...18} = AB{2...0}; + let Inst{19...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } class XX3Form_AT3_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, - string asmstr, InstrItinClass itin, - list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, itin> { + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { bits<3> AT; bits<5> XAp; bits<6> XB; let Pattern = pattern; - let Inst{6-8} = AT; - let Inst{9-10} = 0; - let Inst{11-14} = XAp{3-0}; + let Inst{6...8} = AT; + let Inst{9...10} = 0; + let Inst{11...14} = XAp{3...0}; let Inst{15} = 0; - let Inst{16-20} = XB{4-0}; - let Inst{21-28} = xo; - let Inst{29} = XAp{4}; - let Inst{30} = XB{5}; + let Inst{16...20} = XB{4...0}; + let Inst{21...28} = xo; + let Inst{29} = XAp{4}; + let Inst{30} = XB{5}; let Inst{31} = 0; } class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> - : PI<1, opcode, OOL, IOL, asmstr, itin> { + : PI<1, opcode, OOL, IOL, asmstr, itin> { bits<3> AT; bits<5> XAp; bits<6> XB; @@ -104,29 +104,29 @@ class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. 
- let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-15} = 0; - let Inst{16-19} = PMSK; - let Inst{20-27} = XMSK; - let Inst{28-31} = YMSK; + let Inst{6...7} = 3; + let Inst{8...11} = 9; + let Inst{12...15} = 0; + let Inst{16...19} = PMSK; + let Inst{20...27} = XMSK; + let Inst{28...31} = YMSK; // The instruction. - let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-46} = XAp{3-0}; + let Inst{38...40} = AT; + let Inst{41...42} = 0; + let Inst{43...46} = XAp{3...0}; let Inst{47} = 0; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; + let Inst{48...52} = XB{4...0}; + let Inst{53...60} = xo; let Inst{61} = XAp{4}; let Inst{62} = XB{5}; let Inst{63} = 0; } class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, - string asmstr, InstrItinClass itin, - list<dag> pattern> - : PI<1, opcode, OOL, IOL, asmstr, itin> { + string asmstr, InstrItinClass itin, + list<dag> pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { bits<3> AT; bits<5> XAp; bits<6> XB; @@ -137,21 +137,21 @@ class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-15} = 0; - let Inst{16-17} = PMSK; - let Inst{18-19} = 0; - let Inst{20-27} = XMSK; - let Inst{28-31} = YMSK; + let Inst{6...7} = 3; + let Inst{8...11} = 9; + let Inst{12...15} = 0; + let Inst{16...17} = PMSK; + let Inst{18...19} = 0; + let Inst{20...27} = XMSK; + let Inst{28...31} = YMSK; // The instruction. 
- let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-46} = XAp{3-0}; + let Inst{38...40} = AT; + let Inst{41...42} = 0; + let Inst{43...46} = XAp{3...0}; let Inst{47} = 0; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; + let Inst{48...52} = XB{4...0}; + let Inst{53...60} = xo; let Inst{61} = XAp{4}; let Inst{62} = XB{5}; let Inst{63} = 0; @@ -160,14 +160,15 @@ class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, multiclass DMR_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { let Predicates = [MMA, IsISAFuture] in { - def NAME : - XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x01), (outs dmr:$AT), IOL, - !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>, - RegConstraint<"@earlyclobber $AT">; - def PP : - XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL), - !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def NAME + : XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x01), (outs dmr:$AT), IOL, + !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PP + : XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), + !con((ins dmr:$ATi), IOL), + !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } } @@ -175,202 +176,217 @@ multiclass DMR_UM_M448_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, string asmstr> { defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>; let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { - def PM#NAME : - MMIRR_XX3Form_X8YP4_XAp5B6< - opcode, !or(xo, 0x01), (outs dmr:$AT), - !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)), - !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"@earlyclobber $AT">; - def PM#NAME#PP : - MMIRR_XX3Form_X8YP4_XAp5B6< - opcode, xo, (outs dmr:$AT), - !con((ins dmr:$ATi), - !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))), - !strconcat("pm"#asmbase#"pp ", asmstr#", 
$XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME + : MMIRR_XX3Form_X8YP4_XAp5B6< + opcode, !or(xo, 0x01), (outs dmr:$AT), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP + : MMIRR_XX3Form_X8YP4_XAp5B6< + opcode, xo, (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } } multiclass DMR_BF16_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, - string asmstr> { + string asmstr> { let Predicates = [MMA, IsISAFuture] in { - def NAME : - XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x11), (outs dmr:$AT), IOL, - !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>, - RegConstraint<"@earlyclobber $AT">; - def PP : - XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL), - !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def NAME + : XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x11), (outs dmr:$AT), IOL, + !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PP + : XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), + !con((ins dmr:$ATi), IOL), + !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } } -multiclass DMR_BF16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, - string asmstr> { +multiclass DMR_BF16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, + string asmbase, string asmstr> { defm NAME : DMR_BF16_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>; let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { - def PM#NAME : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, !or(xo, 0x11), (outs dmr:$AT), - !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), - !strconcat("pm"#asmbase#" 
", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"@earlyclobber $AT">; - def PM#NAME#PP : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, xo, (outs dmr:$AT), - !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, !or(xo, 0x11), (outs dmr:$AT), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, xo, (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } } -multiclass DMR_F16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, - string asmstr> { +multiclass DMR_F16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, + string asmbase, string asmstr> { defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>; let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { - def PM#NAME : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, !or(xo, 0x01), (outs dmr:$AT), - !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), - !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"@earlyclobber $AT">; - def PM#NAME#PP : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, xo, (outs dmr:$AT), - !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, !or(xo, 0x01), (outs dmr:$AT), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), + !strconcat("pm"#asmbase#" ", 
asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, xo, (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } } multiclass DMR_NEG_UM_M284_XOXORf939a0<bits<6> opcode, bits<8> xo, dag IOL, - string asmbase, string asmstr> { + string asmbase, string asmstr> { defm NAME : DMR_BF16_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>; let Predicates = [MMA, IsISAFuture] in { - def PN : XX3Form_AT3_XAp5B6< - opcode, !xor(xo, 0xF9), (outs dmr:$AT), !con((ins dmr:$ATi), IOL), - !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def NP : XX3Form_AT3_XAp5B6< - opcode, !xor(xo, 0x39), (outs dmr:$AT), !con((ins dmr:$ATi), IOL), - !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def NN : XX3Form_AT3_XAp5B6< - opcode, !xor(xo, 0xA0), (outs dmr:$AT), !con((ins dmr:$ATi), IOL), - !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PN + : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xF9), (outs dmr:$AT), + !con((ins dmr:$ATi), IOL), + !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; + def NP + : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x39), (outs dmr:$AT), + !con((ins dmr:$ATi), IOL), + !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; + def NN + : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xA0), (outs dmr:$AT), + !con((ins dmr:$ATi), IOL), + !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { - def PM#NAME#PN : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, !xor(xo, 0xF9), (outs dmr:$AT), - !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, 
u2imm:$PMSK))), - !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NP : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, !xor(xo, 0x39), (outs dmr:$AT), - !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NN : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, !xor(xo, 0xA0), (outs dmr:$AT), - !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#PN + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, !xor(xo, 0xF9), (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; + def PM#NAME#NP + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, !xor(xo, 0x39), (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; + def PM#NAME#NN + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, !xor(xo, 0xA0), (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } } multiclass DMR_NEG_UM_M284_XOXORd11188<bits<6> opcode, bits<8> xo, dag IOL, - string asmbase, string asmstr> { + string asmbase, string asmstr> { defm NAME : DMR_F16_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>; let Predicates = [MMA, IsISAFuture] in { - def PN : XX3Form_AT3_XAp5B6< - opcode, !xor(xo, 0xD1), (outs dmr:$AT), !con((ins dmr:$ATi), IOL), - 
!strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def NP : XX3Form_AT3_XAp5B6< - opcode, !xor(xo, 0x11), (outs dmr:$AT), !con((ins dmr:$ATi), IOL), - !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def NN : XX3Form_AT3_XAp5B6< - opcode, !xor(xo, 0x88), (outs dmr:$AT), !con((ins dmr:$ATi), IOL), - !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PN + : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xD1), (outs dmr:$AT), + !con((ins dmr:$ATi), IOL), + !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; + def NP + : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x11), (outs dmr:$AT), + !con((ins dmr:$ATi), IOL), + !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; + def NN + : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x88), (outs dmr:$AT), + !con((ins dmr:$ATi), IOL), + !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { - def PM#NAME#PN : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, !xor(xo, 0xD1), (outs dmr:$AT), - !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NP : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, !xor(xo, 0x11), (outs dmr:$AT), - !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - def PM#NAME#NN : - MMIRR_XX3Form_X8Y4P2_XAp5B6< - opcode, !xor(xo, 0x88), (outs dmr:$AT), - !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), - !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), - IIC_VecFP, []>, - 
RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#PN + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, !xor(xo, 0xD1), (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; + def PM#NAME#NP + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, !xor(xo, 0x11), (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; + def PM#NAME#NN + : MMIRR_XX3Form_X8Y4P2_XAp5B6< + opcode, !xor(xo, 0x88), (outs dmr:$AT), + !con((ins dmr:$ATi), + !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">; } } class XForm_AT3_T1_AB3<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> - : I <opcode, OOL, IOL, asmstr, NoItinerary> { + : I<opcode, OOL, IOL, asmstr, NoItinerary> { bits<3> AT; bits<3> AB; bits<1> T; let Pattern = pattern; - let Inst{6-8} = AT{2-0}; + let Inst{6...8} = AT{2...0}; let Inst{9} = 0; let Inst{10} = T; - let Inst{11-15} = o; - let Inst{16-18} = AB{2-0}; - let Inst{19-20} = 0; - let Inst{21-30} = xo; + let Inst{11...15} = o; + let Inst{16...18} = AB{2...0}; + let Inst{19...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } class XForm_ATp2_SR5<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> - : I <opcode, OOL, IOL, asmstr, NoItinerary> { + : I<opcode, OOL, IOL, asmstr, NoItinerary> { bits<2> ATp; bits<5> SR; let Pattern = pattern; - let Inst{6-7} = ATp{1-0}; - let Inst{8-10} = 0; - let Inst{11-15} = o; - let Inst{16-20} = SR{4-0}; - let Inst{21-30} = xo; + let Inst{6...7} = ATp{1...0}; + let Inst{8...10} = 0; + let Inst{11...15} = o; + let Inst{16...20} = 
SR{4...0}; + let Inst{21...30} = xo; let Inst{31} = 0; } class XX2Form_AT3_XB6_ID2_E1_BL2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, - string asmstr, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, NoItinerary> { + string asmstr, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, NoItinerary> { bits<3> AT; bits<6> XB; bits<2> ID; @@ -379,41 +395,48 @@ class XX2Form_AT3_XB6_ID2_E1_BL2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = AT{2-0}; - let Inst{9-10} = 0; - let Inst{11-12} = ID{1-0}; + let Inst{6...8} = AT{2...0}; + let Inst{9...10} = 0; + let Inst{11...12} = ID{1...0}; let Inst{13} = E; - let Inst{14-15} = BL{1-0}; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let Inst{14...15} = BL{1...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = 0; } -let Predicates = [IsISAFuture] in { - def DMXXEXTFDMR512 : XX3Form_AT3_XABp5_P1<60, 226, - (outs vsrprc:$XAp, vsrprc:$XBp), - (ins wacc:$AT), - "dmxxextfdmr512 $XAp, $XBp, $AT, 0", []> { +//-------------------------- Instruction definitions -------------------------// +// Predicate combinations available: +// [MMA, IsISAFuture] +// [MMA, PrefixInstrs, IsISAFuture] + +let Predicates = [MMA, IsISAFuture] in { + def DMXXEXTFDMR512 + : XX3Form_AT3_XABp5_P1<60, 226, (outs vsrprc:$XAp, vsrprc:$XBp), + (ins wacc:$AT), + "dmxxextfdmr512 $XAp, $XBp, $AT, 0", []> { let P = 0; } - def DMXXEXTFDMR512_HI : XX3Form_AT3_XABp5_P1<60, 226, - (outs vsrprc:$XAp, vsrprc:$XBp), - (ins wacc_hi:$AT), - "dmxxextfdmr512 $XAp, $XBp, $AT, 1", []> { + def DMXXEXTFDMR512_HI + : XX3Form_AT3_XABp5_P1<60, 226, (outs vsrprc:$XAp, vsrprc:$XBp), + (ins wacc_hi:$AT), + "dmxxextfdmr512 $XAp, $XBp, $AT, 1", []> { let P = 1; } - def DMXXINSTDMR512 : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc:$AT), - (ins vsrprc:$XAp, vsrprc:$XBp), - "dmxxinstdmr512 $AT, $XAp, $XBp, 0", []> { + def DMXXINSTDMR512 + : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc:$AT), + (ins 
vsrprc:$XAp, vsrprc:$XBp), + "dmxxinstdmr512 $AT, $XAp, $XBp, 0", []> { let P = 0; } - def DMXXINSTDMR512_HI : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc_hi:$AT), - (ins vsrprc:$XAp, vsrprc:$XBp), - "dmxxinstdmr512 $AT, $XAp, $XBp, 1", []> { + def DMXXINSTDMR512_HI + : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc_hi:$AT), + (ins vsrprc:$XAp, vsrprc:$XBp), + "dmxxinstdmr512 $AT, $XAp, $XBp, 1", []> { let P = 1; } @@ -422,236 +445,220 @@ let Predicates = [IsISAFuture] in { "dmxxextfdmr256 $XBp, $AT, $P", []>; def DMXXINSTDMR256 : XX2Form_AT3_XBp5_P2<60, 485, (outs dmrrowp:$AT), - (ins vsrprc:$XBp, u2imm:$P), - "dmxxinstdmr256 $AT, $XBp, $P", []>; + (ins vsrprc:$XBp, u2imm:$P), + "dmxxinstdmr256 $AT, $XBp, $P", []>; - def DMMR : XForm_ATB3<31, 6, 177, (outs dmr:$AT), (ins dmr:$AB), - "dmmr $AT, $AB", - [(set v1024i1:$AT, (int_ppc_mma_dmmr v1024i1:$AB))]>; + def DMMR + : XForm_ATB3<31, 6, 177, (outs dmr:$AT), (ins dmr:$AB), "dmmr $AT, $AB", + [(set v1024i1:$AT, (int_ppc_mma_dmmr v1024i1:$AB))]>; def DMXOR : XForm_ATB3<31, 7, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB), "dmxor $AT, $AB", - [(set v1024i1:$AT, (int_ppc_mma_dmxor v1024i1:$ATi, v1024i1:$AB))]>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - - def DMSETDMRZ : XForm_AT3<31, 2, 177, (outs dmr:$AT), (ins), - "dmsetdmrz $AT", NoItinerary, - [(set v1024i1:$AT, (int_ppc_mma_dmsetdmrz))]>; -} - -// MMA+ accumulating/non-accumulating instructions. 
- -// DMXVI8GERX4, DMXVI8GERX4PP, PMDMXVI8GERX4, PMDMXVI8GERX4PP -defm DMXVI8GERX4 : DMR_UM_M448_XOEO<59, 10, (ins vsrprc:$XAp, vsrc:$XB), - "dmxvi8gerx4", "$AT, $XAp, $XB">; - -let Predicates = [MMA, IsISAFuture] in { - def DMXVI8GERX4SPP : - XX3Form_AT3_XAp5B6<59, 98, (outs dmr:$AT), (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB), - "dmxvi8gerx4spp $AT, $XAp, $XB", IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; -} + [(set v1024i1:$AT, (int_ppc_mma_dmxor v1024i1:$ATi, + v1024i1:$AB))]>, + RegConstraint<"$ATi = $AT">; + + def DMSETDMRZ + : XForm_AT3<31, 2, 177, (outs dmr:$AT), (ins), "dmsetdmrz $AT", + NoItinerary, [(set v1024i1:$AT, (int_ppc_mma_dmsetdmrz))]>; + + // DMXVI8GERX4, DMXVI8GERX4PP, PMDMXVI8GERX4, PMDMXVI8GERX4PP + defm DMXVI8GERX4 : DMR_UM_M448_XOEO<59, 10, (ins vsrprc:$XAp, vsrc:$XB), + "dmxvi8gerx4", "$AT, $XAp, $XB">; + + // DMXVBF16GERX2, DMXVBF16GERX2PP, DMXVBF16GERX2PN, dMXVBF16GERX2NP, + // DMXVBF16GERX2NN PMDMXVBF16GERX2, PMDMXVBF16GERX2PP, PMDMXVBF16GERX2PN, + // PMDMXVBF16GERX2NP, PMDMXVBF16GERX2NN + defm DMXVBF16GERX2 + : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB), + "dmxvbf16gerx2", "$AT, $XAp, $XB">; + + // DMXVF16GERX2, DMXVF16GERX2PP, DMXVF16GERX2PN, dMXVF16GERX2NP, + // DMXVF16GERX2NN PMDMXVF16GERX2, PMDMXVF16GERX2PP, PMDMXVF16GERX2PN, + // PMDMXVF16GERX2NP, PMDMXVF16GERX2NN + defm DMXVF16GERX2 + : DMR_NEG_UM_M284_XOXORd11188<59, 66, (ins vsrprc:$XAp, vsrc:$XB), + "dmxvf16gerx2", "$AT, $XAp, $XB">; + + // DMF cryptography [support] Instructions + def DMSHA2HASH + : XForm_AT3_T1_AB3< + 31, 14, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB, u1imm:$T), + "dmsha2hash $AT, $AB, $T", + [(set v1024i1:$AT, (int_ppc_mma_dmsha2hash v1024i1:$ATi, + v1024i1:$AB, timm:$T))]>, + RegConstraint<"$ATi = $AT">; + def DMSHA3HASH + : XForm_ATp2_SR5<31, 15, 177, (outs dmrp:$ATp), + (ins dmrp:$ATpi, u5imm:$SR), "dmsha3hash $ATp, $SR", + [(set v2048i1:$ATp, + (int_ppc_mma_dmsha3hash v2048i1:$ATpi, timm:$SR))]>, + 
RegConstraint<"$ATpi = $ATp">; + def DMXXSHAPAD + : XX2Form_AT3_XB6_ID2_E1_BL2<60, 421, (outs dmr:$AT), + (ins dmr:$ATi, vsrc:$XB, u2imm:$ID, u1imm:$E, + u2imm:$BL), + "dmxxshapad $AT, $XB, $ID, $E, $BL", []>, + RegConstraint<"$ATi = $AT">; + + // MMA+ accumulating/non-accumulating instructions. + def DMXVI8GERX4SPP + : XX3Form_AT3_XAp5B6<59, 98, (outs dmr:$AT), + (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB), + "dmxvi8gerx4spp $AT, $XAp, $XB", IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">; + +} // End of [MMA, IsISAFuture] let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { - def PMDMXVI8GERX4SPP : - MMIRR_XX3Form_X8YP4_XAp5B6<59, 98, (outs dmr:$AT), - (ins dmr:$ATi, vsrprc:$XAp,vsrc:$XB, u8imm:$XMSK, - u4imm:$YMSK, u4imm:$PMSK), - "pmdmxvi8gerx4spp $AT, $XAp, $XB, $XMSK, $YMSK, $PMSK", - IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PMDMXVI8GERX4SPP + : MMIRR_XX3Form_X8YP4_XAp5B6< + 59, 98, (outs dmr:$AT), + (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB, u8imm:$XMSK, u4imm:$YMSK, + u4imm:$PMSK), + "pmdmxvi8gerx4spp $AT, $XAp, $XB, $XMSK, $YMSK, $PMSK", + IIC_VecGeneral, []>, + RegConstraint<"$ATi = $AT">; } -// DMXVBF16GERX2, DMXVBF16GERX2PP, DMXVBF16GERX2PN, dMXVBF16GERX2NP, DMXVBF16GERX2NN -// PMDMXVBF16GERX2, PMDMXVBF16GERX2PP, PMDMXVBF16GERX2PN, PMDMXVBF16GERX2NP, PMDMXVBF16GERX2NN -defm DMXVBF16GERX2 : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB), - "dmxvbf16gerx2", "$AT, $XAp, $XB">; - -// DMXVF16GERX2, DMXVF16GERX2PP, DMXVF16GERX2PN, dMXVF16GERX2NP, DMXVF16GERX2NN -// PMDMXVF16GERX2, PMDMXVF16GERX2PP, PMDMXVF16GERX2PN, PMDMXVF16GERX2NP, PMDMXVF16GERX2NN -defm DMXVF16GERX2 : DMR_NEG_UM_M284_XOXORd11188<59, 66, (ins vsrprc:$XAp, vsrc:$XB), - "dmxvf16gerx2", "$AT, $XAp, $XB">; - -// DMF cryptography [support] Instructions -let Predicates = [IsISAFuture] in { - def DMSHA2HASH : - XForm_AT3_T1_AB3<31, 14, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB, u1imm:$T), - "dmsha2hash $AT, $AB, $T", - [(set v1024i1:$AT, 
(int_ppc_mma_dmsha2hash v1024i1:$ATi, v1024i1:$AB, timm:$T))]>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; - - def DMSHA3HASH : - XForm_ATp2_SR5<31, 15, 177, (outs dmrp:$ATp), (ins dmrp:$ATpi , u5imm:$SR), - "dmsha3hash $ATp, $SR", - [(set v2048i1:$ATp, (int_ppc_mma_dmsha3hash v2048i1:$ATpi, timm:$SR))]>, - RegConstraint<"$ATpi = $ATp">, NoEncode<"$ATpi">; - - def DMXXSHAPAD : - XX2Form_AT3_XB6_ID2_E1_BL2<60, 421, (outs dmr:$AT), - (ins dmr:$ATi, vsrc:$XB, u2imm:$ID, u1imm:$E, u2imm:$BL), - "dmxxshapad $AT, $XB, $ID, $E, $BL", []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; -} +//---------------------------- Anonymous Patterns ----------------------------// +// Predicate combinations available: +// [MMA, IsISAFuture] +// [MMA, PrefixInstrs, IsISAFuture] -// MMA+ Intrinsics let Predicates = [MMA, IsISAFuture] in { + // MMA+ Intrinsics def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4 v256i1:$XAp, v16i8:$XB)), (DMXVI8GERX4 $XAp, RCCp.BToVSRC)>; - def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVI8GERX4PP $ATi, $XAp, RCCp.BToVSRC)>; - - def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC)>; - def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2 v256i1:$XAp, v16i8:$XB)), (DMXVBF16GERX2 $XAp, RCCp.BToVSRC)>; - - def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>; - - def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>; - - def : 
Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>; - - def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>; - def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2 v256i1:$XAp, v16i8:$XB)), (DMXVF16GERX2 $XAp, RCCp.BToVSRC)>; - - def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>; - - def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>; - - def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>; - - def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)), + def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB)), (DMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>; + + // Cryptography Intrinsic + def : Pat<(v1024i1 (int_ppc_mma_dmxxshapad v1024i1:$ATi, v16i8:$XB, timm:$ID, + timm:$E, timm:$BL)), + (DMXXSHAPAD $ATi, RCCp.BToVSRC, $ID, $E, $BL)>; } let Predicates = [MMA, PrefixInstrs, IsISAFuture] in { - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk4Imm:$PMSK)), - (PMDMXVI8GERX4 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; - - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - 
Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk4Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4 v256i1:$XAp, v16i8:$XB, + Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)), + (PMDMXVI8GERX4 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK, + Msk4Imm:$PMSK)>; + + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)), (PMDMXVI8GERX4PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk4Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)), (PMDMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk4Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)), - (PMDMXVBF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2 v256i1:$XAp, v16i8:$XB, + Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMDMXVBF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), (PMDMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, 
Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), (PMDMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), (PMDMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), (PMDMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)), - (PMDMXVF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2 v256i1:$XAp, v16i8:$XB, + Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), + (PMDMXVF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK, + Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), (PMDMXVF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), + def : Pat<(v1024i1 
(int_ppc_mma_pmdmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), (PMDMXVF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), (PMDMXVF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; - def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB, - Msk8Imm:$XMSK, Msk4Imm:$YMSK, - Msk2Imm:$PMSK)), + def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, + v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)), (PMDMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, - Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; -} - -// Cryptography Intrinsic -let Predicates = [IsISAFuture] in { - def : Pat<(v1024i1 (int_ppc_mma_dmxxshapad v1024i1:$ATi, v16i8:$XB, timm:$ID, - timm:$E, timm:$BL)), (DMXXSHAPAD $ATi, RCCp.BToVSRC, $ID, $E, $BL)>; + Msk4Imm:$YMSK, Msk2Imm:$PMSK)>; } -// MMA+ Instruction aliases -let Predicates = [IsISAFuture] in { - def : InstAlias<"dmsha256hash $AT, $AB", - (DMSHA2HASH dmr:$AT, dmr:$AB, 0)>; +//---------------------------- Instruction aliases ---------------------------// - def : InstAlias<"dmsha512hash $AT, $AB", - (DMSHA2HASH dmr:$AT, dmr:$AB, 1)>; - - def : InstAlias<"dmsha3dw $ATp", - (DMSHA3HASH dmrp:$ATp, 0)>; - - def : InstAlias<"dmcryshash $ATp", - (DMSHA3HASH dmrp:$ATp, 12)>; - - def : InstAlias<"dmxxsha3512pad $AT, $XB, $E", - (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 0)>; - - def : InstAlias<"dmxxsha3384pad $AT, $XB, $E", - (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 1)>; - - def : InstAlias<"dmxxsha3256pad $AT, $XB, $E", - (DMXXSHAPAD dmr:$AT, 
vsrc:$XB, 0, u1imm:$E, 2)>; - - def : InstAlias<"dmxxsha3224pad $AT, $XB, $E", - (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 3)>; - - def : InstAlias<"dmxxshake256pad $AT, $XB, $E", - (DMXXSHAPAD dmr:$AT, vsrc:$XB, 1, u1imm:$E, 0)>; - - def : InstAlias<"dmxxshake128pad $AT, $XB, $E", - (DMXXSHAPAD dmr:$AT, vsrc:$XB, 1, u1imm:$E, 1)>; - - def : InstAlias<"dmxxsha384512pad $AT, $XB", - (DMXXSHAPAD dmr:$AT, vsrc:$XB, 2, 0, 0)>; - - def : InstAlias<"dmxxsha224256pad $AT, $XB", - (DMXXSHAPAD dmr:$AT, vsrc:$XB, 3, 0, 0)>; +let Predicates = [MMA, IsISAFuture] in { + def : InstAlias<"dmsha256hash $AT, $AB", (DMSHA2HASH dmr:$AT, dmr:$AB, 0)>; + def : InstAlias<"dmsha512hash $AT, $AB", (DMSHA2HASH dmr:$AT, dmr:$AB, 1)>; + def : InstAlias<"dmsha3dw $ATp", (DMSHA3HASH dmrp:$ATp, 0)>; + def : InstAlias<"dmcryshash $ATp", (DMSHA3HASH dmrp:$ATp, 12)>; + def : InstAlias<"dmxxsha3512pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB, + 0, u1imm:$E, 0)>; + def : InstAlias<"dmxxsha3384pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB, + 0, u1imm:$E, 1)>; + def : InstAlias<"dmxxsha3256pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB, + 0, u1imm:$E, 2)>; + def : InstAlias<"dmxxsha3224pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB, + 0, u1imm:$E, 3)>; + def : InstAlias<"dmxxshake256pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB, + 1, u1imm:$E, 0)>; + def : InstAlias<"dmxxshake128pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB, + 1, u1imm:$E, 1)>; + def : InstAlias<"dmxxsha384512pad $AT, $XB", (DMXXSHAPAD dmr:$AT, vsrc:$XB, 2, + 0, 0)>; + def : InstAlias<"dmxxsha224256pad $AT, $XB", (DMXXSHAPAD dmr:$AT, vsrc:$XB, 3, + 0, 0)>; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 7c1550e99bae..db066bc4b7bd 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include 
"llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackMaps.h" @@ -87,8 +88,8 @@ static cl::opt<bool> EnableFMARegPressureReduction( // Pin the vtable to this file. void PPCInstrInfo::anchor() {} -PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI) - : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, +PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI) + : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, /* CatchRetOpcode */ -1, STI.isPPC64() ? PPC::BLR8 : PPC::BLR), Subtarget(STI), RI(STI.getTargetMachine()) {} @@ -1863,6 +1864,48 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcRegSub1) .addReg(SrcRegSub1, getKillRegState(KillSrc)); return; + } else if ((PPC::WACCRCRegClass.contains(DestReg) || + PPC::WACC_HIRCRegClass.contains(DestReg)) && + (PPC::WACCRCRegClass.contains(SrcReg) || + PPC::WACC_HIRCRegClass.contains(SrcReg))) { + + Opc = PPC::WACCRCRegClass.contains(SrcReg) ? PPC::DMXXEXTFDMR512 + : PPC::DMXXEXTFDMR512_HI; + + RegScavenger RS; + RS.enterBasicBlockEnd(MBB); + RS.backward(std::next(I)); + + Register TmpReg1 = RS.scavengeRegisterBackwards(PPC::VSRpRCRegClass, I, + /* RestoreAfter */ false, 0, + /* AllowSpill */ false); + + RS.setRegUsed(TmpReg1); + Register TmpReg2 = RS.scavengeRegisterBackwards(PPC::VSRpRCRegClass, I, + /* RestoreAfter */ false, 0, + /* AllowSpill */ false); + + BuildMI(MBB, I, DL, get(Opc)) + .addReg(TmpReg1, RegState::Define) + .addReg(TmpReg2, RegState::Define) + .addReg(SrcReg, getKillRegState(KillSrc)); + + Opc = PPC::WACCRCRegClass.contains(DestReg) ? 
PPC::DMXXINSTDMR512 + : PPC::DMXXINSTDMR512_HI; + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(TmpReg1, RegState::Kill) + .addReg(TmpReg2, RegState::Kill); + + return; + } else if (PPC::DMRRCRegClass.contains(DestReg) && + PPC::DMRRCRegClass.contains(SrcReg)) { + + BuildMI(MBB, I, DL, get(PPC::DMMR), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + + return; + } else llvm_unreachable("Impossible reg-to-reg copy"); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 7931a9e3ae13..63ebd6591057 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -279,7 +279,7 @@ enum PPCMachineCombinerPattern : unsigned { class PPCSubtarget; class PPCInstrInfo : public PPCGenInstrInfo { - PPCSubtarget &Subtarget; + const PPCSubtarget &Subtarget; const PPCRegisterInfo RI; const unsigned StoreSpillOpcodesArray[4][SOK_LastOpcodeSpill] = StoreOpcodesForSpill; @@ -369,7 +369,7 @@ protected: unsigned OpIdx2) const override; public: - explicit PPCInstrInfo(PPCSubtarget &STI); + explicit PPCInstrInfo(const PPCSubtarget &STI); bool isLoadFromConstantPool(MachineInstr *I) const; const Constant *getConstantFromConstantPool(MachineInstr *I) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index c2f91ce8e6b9..c12cf8511312 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -58,6 +58,10 @@ def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3> ]>; +def SDT_PPCVecShiftQuad : SDTypeProfile<1, 2, [ + SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2> +]>; + def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> ]>; @@ -157,6 +161,8 @@ def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>; def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>; def PPCfctiwuz: 
SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>; +def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>; + def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID", SDTFPUnaryOp, [SDNPHasChain]>; def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU", @@ -665,9 +671,6 @@ class isRecordForm { bit RC = 1; } class RegConstraint<string C> { string Constraints = C; } -class NoEncode<string E> { - string DisableEncoding = E; -} // Define PowerPC specific addressing mode. @@ -1989,29 +1992,24 @@ def LBZU : DForm_1<35, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, def LHAU : DForm_1<43, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lhau $RST, $addr", IIC_LdStLHAU, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; def LHZU : DForm_1<41, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lhzu $RST, $addr", IIC_LdStLoadUpd, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; def LWZU : DForm_1<33, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lwzu $RST, $addr", IIC_LdStLoadUpd, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; let Predicates = [HasFPU] in { def LFSU : DForm_1<49, (outs f4rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lfsu $RST, $addr", IIC_LdStLFDU, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; def LFDU : DForm_1<51, (outs f8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr), "lfdu $RST, $addr", IIC_LdStLFDU, - []>, RegConstraint<"$addr.reg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.reg = $ea_result">; } @@ -2019,39 +2017,33 @@ def LFDU : DForm_1<51, (outs f8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, def LBZUX : XForm_1_memOp<31, 119, 
(outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lbzux $RST, $addr", IIC_LdStLoadUpdX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; def LHAUX : XForm_1_memOp<31, 375, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lhaux $RST, $addr", IIC_LdStLHAUX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; def LHZUX : XForm_1_memOp<31, 311, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lhzux $RST, $addr", IIC_LdStLoadUpdX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; def LWZUX : XForm_1_memOp<31, 55, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lwzux $RST, $addr", IIC_LdStLoadUpdX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; let Predicates = [HasFPU] in { def LFSUX : XForm_1_memOp<31, 567, (outs f4rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lfsux $RST, $addr", IIC_LdStLFDUX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; def LFDUX : XForm_1_memOp<31, 631, (outs f8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memrr $RA, $RB):$addr), "lfdux $RST, $addr", IIC_LdStLFDUX, - []>, RegConstraint<"$addr.ptrreg = $ea_result">, - NoEncode<"$ea_result">; + []>, RegConstraint<"$addr.ptrreg = $ea_result">; } } } @@ -2132,20 +2124,20 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$RST, (memri $D, $RA):$dst), let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst), "stbu $RST, $dst", IIC_LdStSTU, []>, - RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; + RegConstraint<"$dst.reg = 
$ea_res">; def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst), "sthu $RST, $dst", IIC_LdStSTU, []>, - RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; + RegConstraint<"$dst.reg = $ea_res">; def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst), "stwu $RST, $dst", IIC_LdStSTU, []>, - RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; + RegConstraint<"$dst.reg = $ea_res">; let Predicates = [HasFPU] in { def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$RST, (memri $D, $RA):$dst), "stfsu $RST, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; + RegConstraint<"$dst.reg = $ea_res">; def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$RST, (memri $D, $RA):$dst), "stfdu $RST, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; + RegConstraint<"$dst.reg = $ea_res">; } } @@ -2207,32 +2199,27 @@ def STBUX : XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memrr $RA, $RB):$addr), "stbux $RST, $addr", IIC_LdStSTUX, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, PPC970_DGroup_Cracked; def STHUX : XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memrr $RA, $RB):$addr), "sthux $RST, $addr", IIC_LdStSTUX, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, PPC970_DGroup_Cracked; def STWUX : XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memrr $RA, $RB):$addr), "stwux $RST, $addr", IIC_LdStSTUX, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, PPC970_DGroup_Cracked; let Predicates = [HasFPU] in { def STFSUX: XForm_8_memOp<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$RST, (memrr $RA, $RB):$addr), "stfsux $RST, $addr", IIC_LdStSTFDU, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, PPC970_DGroup_Cracked; def STFDUX: XForm_8_memOp<31, 759, (outs 
ptr_rc_nor0:$ea_res), (ins f8rc:$RST, (memrr $RA, $RB):$addr), "stfdux $RST, $addr", IIC_LdStSTFDU, []>, RegConstraint<"$addr.ptrreg = $ea_res">, - NoEncode<"$ea_res">, PPC970_DGroup_Cracked; } } @@ -3099,7 +3086,7 @@ defm RLWIMI : MForm_2r<20, (outs gprc:$RA), (ins gprc:$RAi, gprc:$RS, u5imm:$SH, u5imm:$MB, u5imm:$ME), "rlwimi", "$RA, $RS, $SH, $MB, $ME", IIC_IntRotate, []>, PPC970_DGroup_Cracked, - RegConstraint<"$RAi = $RA">, NoEncode<"$RAi">; + RegConstraint<"$RAi = $RA">; } let BaseName = "rlwinm" in { def RLWINM : MForm_2<21, @@ -3235,9 +3222,10 @@ def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT", // Get the _GLOBAL_OFFSET_TABLE_ in PIC mode. // This uses two output registers, the first as the real output, the second as a -// temporary register, used internally in code generation. +// temporary register, used internally in code generation. A "bl" also clobbers LR. +let Defs = [LR] in def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", - []>, NoEncode<"$rT">; + []>; def LDgotTprelL32: PPCEmitTimePseudo<(outs gprc_nor0:$rD), (ins s16imm:$disp, gprc_nor0:$reg), "#LDgotTprelL32", @@ -4287,7 +4275,7 @@ def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>, bits<1> E; let Inst{16} = E; - let Inst{21-30} = 163; + let Inst{21...30} = 163; } def DCCCI : XForm_tlb<454, (outs), (ins gprc:$RA, gprc:$RB), @@ -4967,44 +4955,44 @@ defm : BranchSimpleMnemonic1<"dzf", "", 2>; multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> { def : InstAlias<"b"#name#pm#" $cc, $dst", - (BCC bibo, crrc:$cc, condbrtarget:$dst)>; + (BCC (pred bibo, crrc:$cc), condbrtarget:$dst)>; def : InstAlias<"b"#name#pm#" $dst", - (BCC bibo, CR0, condbrtarget:$dst)>; + (BCC (pred bibo, CR0), condbrtarget:$dst)>; def : InstAlias<"b"#name#"a"#pm#" $cc, $dst", - (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>; + (BCCA (pred bibo, crrc:$cc), abscondbrtarget:$dst)>; def : InstAlias<"b"#name#"a"#pm#" $dst", - (BCCA bibo, CR0, 
abscondbrtarget:$dst)>; + (BCCA (pred bibo, CR0), abscondbrtarget:$dst)>; def : InstAlias<"b"#name#"lr"#pm#" $cc", - (BCCLR bibo, crrc:$cc)>; + (BCCLR (pred bibo, crrc:$cc))>; def : InstAlias<"b"#name#"lr"#pm, - (BCCLR bibo, CR0)>; + (BCCLR (pred bibo, CR0))>; def : InstAlias<"b"#name#"ctr"#pm#" $cc", - (BCCCTR bibo, crrc:$cc)>; + (BCCCTR (pred bibo, crrc:$cc))>; def : InstAlias<"b"#name#"ctr"#pm, - (BCCCTR bibo, CR0)>; + (BCCCTR (pred bibo, CR0))>; def : InstAlias<"b"#name#"l"#pm#" $cc, $dst", - (BCCL bibo, crrc:$cc, condbrtarget:$dst)>; + (BCCL (pred bibo, crrc:$cc), condbrtarget:$dst)>; def : InstAlias<"b"#name#"l"#pm#" $dst", - (BCCL bibo, CR0, condbrtarget:$dst)>; + (BCCL (pred bibo, CR0), condbrtarget:$dst)>; def : InstAlias<"b"#name#"la"#pm#" $cc, $dst", - (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>; + (BCCLA (pred bibo, crrc:$cc), abscondbrtarget:$dst)>; def : InstAlias<"b"#name#"la"#pm#" $dst", - (BCCLA bibo, CR0, abscondbrtarget:$dst)>; + (BCCLA (pred bibo, CR0), abscondbrtarget:$dst)>; def : InstAlias<"b"#name#"lrl"#pm#" $cc", - (BCCLRL bibo, crrc:$cc)>; + (BCCLRL (pred bibo, crrc:$cc))>; def : InstAlias<"b"#name#"lrl"#pm, - (BCCLRL bibo, CR0)>; + (BCCLRL (pred bibo, CR0))>; def : InstAlias<"b"#name#"ctrl"#pm#" $cc", - (BCCCTRL bibo, crrc:$cc)>; + (BCCCTRL (pred bibo, crrc:$cc))>; def : InstAlias<"b"#name#"ctrl"#pm, - (BCCCTRL bibo, CR0)>; + (BCCCTRL (pred bibo, CR0))>; } multiclass BranchExtendedMnemonic<string name, int bibo> { defm : BranchExtendedMnemonicPM<name, "", bibo>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index 436715a0e4ab..b38dd4ae948c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -14,7 +14,7 @@ multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, def PP : XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), !con((ins acc:$ATi), IOL), !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, 
NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { def NAME#W : @@ -24,7 +24,7 @@ multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, def WPP : XX3Form_AT3_XAB6<opcode, xo, (outs wacc:$AT), !con((ins wacc:$ATi), IOL), !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -48,7 +48,7 @@ multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { def PM#NAME#W : @@ -65,7 +65,7 @@ multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -89,7 +89,7 @@ multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { def PM#NAME#W : @@ -106,7 +106,7 @@ multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -129,7 +129,7 @@ multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, 
!con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { def PM#NAME#W : @@ -145,7 +145,7 @@ multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -162,7 +162,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, XX3Form_AT3_XAB6< opcode, !or(xo, 0x20), (outs acc:$AT), !con((ins acc:$ATi), IOL), !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME : @@ -179,7 +179,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { def NAME#W : @@ -190,7 +190,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, XX3Form_AT3_XAB6< opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { def PM#NAME#W : @@ -207,7 +207,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, !con(IOL, (ins u4imm:$XMSK, 
u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -220,29 +220,29 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, def PN : XX3Form_AT3_XAB6< opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL), !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def NP : XX3Form_AT3_XAB6< opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL), !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def NN : XX3Form_AT3_XAB6< opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL), !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { def WPN : XX3Form_AT3_XAB6< opcode, !or(xo, 0x80), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def WNP : XX3Form_AT3_XAB6< opcode, !or(xo, 0x40), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def WNN : XX3Form_AT3_XAB6< opcode, !or(xo, 0xC0), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { def PM#NAME#PN : @@ -251,21 +251,21 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, 
$YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#NP : MMIRR_XX3Form_XY4P2_XAB6< opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#NN : MMIRR_XX3Form_XY4P2_XAB6< opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { def PM#NAME#WPN : @@ -274,21 +274,21 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#WNP : MMIRR_XX3Form_XY4P2_XAB6< opcode, !or(xo, 0x40), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#WNN : MMIRR_XX3Form_XY4P2_XAB6< opcode, !or(xo, 0xC0), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -301,29 +301,29 @@ multiclass ACC_NEG_UM_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, def PN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins 
acc:$ATi), IOL), !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def NP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL), !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def NN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL), !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { def WPN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def WNP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def WNN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs wacc:$AT), !con((ins wacc:$ATi), IOL), !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -346,28 +346,28 @@ multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#PN : MMIRR_XX3Form_XY4_XAB6< opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#NP : MMIRR_XX3Form_XY4_XAB6< opcode, 
!or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#NN : MMIRR_XX3Form_XY4_XAB6< opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { def PM#NAME#W : @@ -383,28 +383,28 @@ multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#WPN : MMIRR_XX3Form_XY4_XAB6< opcode, !or(xo, 0x80), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#WNP : MMIRR_XX3Form_XY4_XAB6< opcode, !or(xo, 0x40), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#WNN : MMIRR_XX3Form_XY4_XAB6< opcode, !or(xo, 0xC0), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))), !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -427,28 +427,28 @@ multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, 
u2imm:$YMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#PN : MMIRR_XX3Form_X4Y2_XAB6< opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#NP : MMIRR_XX3Form_X4Y2_XAB6< opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#NN : MMIRR_XX3Form_X4Y2_XAB6< opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { def PM#NAME#W : @@ -464,28 +464,28 @@ multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#WPN : MMIRR_XX3Form_X4Y2_XAB6< opcode, !or(xo, 0x80), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#WNP : MMIRR_XX3Form_X4Y2_XAB6< opcode, !or(xo, 0x40), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - 
RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def PM#NAME#WNN : MMIRR_XX3Form_X4Y2_XAB6< opcode, !or(xo, 0xC0), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))), !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"), IIC_VecFP, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } } @@ -497,12 +497,12 @@ let Predicates = [MMA, IsNotISAFuture] in { XForm_AT3<31, 0, 177, (outs acc:$ATo), (ins acc:$AT), "xxmfacc $AT", IIC_VecGeneral, [(set v512i1:$ATo, (int_ppc_mma_xxmfacc v512i1:$AT))]>, - RegConstraint<"$ATo = $AT">, NoEncode<"$ATo">; + RegConstraint<"$ATo = $AT">; def XXMTACC : XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi), "xxmtacc $AT", IIC_VecGeneral, [(set v512i1:$AT, (int_ppc_mma_xxmtacc v512i1:$ATi))]>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp), "#KILL_PAIR", []>, RegConstraint<"$XTp = $XSp">; @@ -519,7 +519,7 @@ let Predicates = [MMA, IsNotISAFuture] in { def XVI8GER4SPP : XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB), "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; let mayStore = 1 in { def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst), "#SPILL_ACC", []>; @@ -544,11 +544,11 @@ let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in { def XXMFACCW : XForm_AT3<31, 0, 177, (outs wacc:$ATo), (ins wacc:$AT), "xxmfacc $AT", IIC_VecGeneral, []>, - RegConstraint<"$ATo = $AT">, NoEncode<"$ATo">; + RegConstraint<"$ATo = $AT">; def XXMTACCW : XForm_AT3<31, 1, 177, (outs wacc:$AT), (ins wacc:$ATi), "xxmtacc $AT", IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; let isAsCheapAsAMove = 1, isReMaterializable = 1 in { def DMXXSETACCZ : @@ -560,7 +560,7 @@ let Predicates = 
[MMA, IsISAFuture], isCodeGenOnly = 1 in { XX3Form_AT3_XAB6<59, 99, (outs wacc:$AT), (ins wacc:$ATi, vsrc:$XA, vsrc:$XB), "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; let mayStore = 1 in { def SPILL_WACC: PPCEmitTimePseudo<(outs), (ins wacc:$AT, memrix16:$dst), @@ -593,7 +593,7 @@ let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in { u4imm:$YMSK, u4imm:$PMSK), "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK", IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { @@ -603,7 +603,7 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in { u4imm:$YMSK, u4imm:$PMSK), "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK", IIC_VecGeneral, []>, - RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + RegConstraint<"$ATi = $AT">; } // MMA accumulating/non-accumulating instructions. diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index c4a027d65b66..149a44ddfc10 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -125,8 +125,8 @@ class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr, let InOperandList = IOL; let AsmString = asmstr; let Itinerary = itin; - let Inst{0-5} = pref; - let Inst{32-37} = opcode; + let Inst{0...5} = pref; + let Inst{32...37} = opcode; bits<1> PPC970_First = 0; bits<1> PPC970_Single = 0; @@ -138,7 +138,7 @@ class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr, let TSFlags{0} = PPC970_First; let TSFlags{1} = PPC970_Single; let TSFlags{2} = PPC970_Cracked; - let TSFlags{5-3} = PPC970_Unit; + let TSFlags{5...3} = PPC970_Unit; bits<1> Prefixed = 1; // This is a prefixed instruction. 
let TSFlags{7} = Prefixed; @@ -167,11 +167,11 @@ class VXForm_VTB5_RC<bits<10> xo, bits<5> R, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VT; - let Inst{11-15} = R; - let Inst{16-20} = VB; + let Inst{6...10} = VT; + let Inst{11...15} = R; + let Inst{16...20} = VB; let Inst{21} = RC; - let Inst{22-31} = xo; + let Inst{22...31} = xo; } // Multiclass definition to account for record and non-record form @@ -200,16 +200,16 @@ class MLS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Pattern = pattern; // The prefix. - let Inst{6-7} = 2; - let Inst{8-10} = 0; + let Inst{6...7} = 2; + let Inst{8...10} = 0; let Inst{11} = PCRel; - let Inst{12-13} = 0; - let Inst{14-31} = D{33-16}; // d0 + let Inst{12...13} = 0; + let Inst{14...31} = D{33...16}; // d0 // The instruction. - let Inst{38-42} = RST{4-0}; - let Inst{43-47} = RA; - let Inst{48-63} = D{15-0}; // d1 + let Inst{38...42} = RST{4...0}; + let Inst{43...47} = RA; + let Inst{48...63} = D{15...0}; // d1 } class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr, @@ -222,16 +222,16 @@ class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Pattern = pattern; // The prefix. - let Inst{6-7} = 2; - let Inst{8-10} = 0; + let Inst{6...7} = 2; + let Inst{8...10} = 0; let Inst{11} = PCRel; - let Inst{12-13} = 0; - let Inst{14-31} = SI{33-16}; + let Inst{12...13} = 0; + let Inst{14...31} = SI{33...16}; // The instruction. - let Inst{38-42} = RT; - let Inst{43-47} = RA; - let Inst{48-63} = SI{15-0}; + let Inst{38...42} = RT; + let Inst{43...47} = RA; + let Inst{48...63} = SI{15...0}; } class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr, @@ -243,16 +243,16 @@ class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Pattern = pattern; // The prefix. 
- let Inst{6-7} = 2; - let Inst{8-10} = 0; + let Inst{6...7} = 2; + let Inst{8...10} = 0; let Inst{11} = 0; - let Inst{12-13} = 0; - let Inst{14-31} = SI{33-16}; + let Inst{12...13} = 0; + let Inst{14...31} = SI{33...16}; // The instruction. - let Inst{38-42} = RT; - let Inst{43-47} = 0; - let Inst{48-63} = SI{15-0}; + let Inst{38...42} = RT; + let Inst{43...47} = 0; + let Inst{48...63} = SI{15...0}; } multiclass MLS_DForm_R_SI34_RTA5_p<bits<6> opcode, dag OOL, dag IOL, @@ -274,15 +274,15 @@ class 8LS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Pattern = pattern; // The prefix. - let Inst{6-10} = 0; + let Inst{6...10} = 0; let Inst{11} = PCRel; - let Inst{12-13} = 0; - let Inst{14-31} = D{33-16}; // d0 + let Inst{12...13} = 0; + let Inst{14...31} = D{33...16}; // d0 // The instruction. - let Inst{38-42} = RST{4-0}; - let Inst{43-47} = RA; - let Inst{48-63} = D{15-0}; // d1 + let Inst{38...42} = RST{4...0}; + let Inst{43...47} = RA; + let Inst{48...63} = D{15...0}; // d1 } // 8LS:D-Form: [ 1 0 0 // R // d0 @@ -298,18 +298,18 @@ class 8LS_DForm_R_SI34_XT6_RA5_MEM<bits<5> opcode, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 0; + let Inst{6...7} = 0; let Inst{8} = 0; - let Inst{9-10} = 0; // reserved + let Inst{9...10} = 0; // reserved let Inst{11} = PCRel; - let Inst{12-13} = 0; // reserved - let Inst{14-31} = D{33-16}; // d0 + let Inst{12...13} = 0; // reserved + let Inst{14...31} = D{33...16}; // d0 // The instruction. 
let Inst{37} = XST{5}; - let Inst{38-42} = XST{4-0}; - let Inst{43-47} = RA; - let Inst{48-63} = D{15-0}; // d1 + let Inst{38...42} = XST{4...0}; + let Inst{43...47} = RA; + let Inst{48...63} = D{15...0}; // d1 } // X-Form: [PO T IMM VRB XO TX] @@ -321,10 +321,10 @@ class XForm_XT6_IMM5_VB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, bits<5> IMM; let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = IMM; - let Inst{16-20} = VRB; - let Inst{21-30} = xo; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = IMM; + let Inst{16...20} = VRB; + let Inst{21...30} = xo; let Inst{31} = XT{5}; } @@ -341,19 +341,19 @@ class 8RR_XX4Form_IMM8_XTAB6<bits<6> opcode, bits<2> xo, let Pattern = pattern; // The prefix. - let Inst{6-7} = 1; + let Inst{6...7} = 1; let Inst{8} = 0; - let Inst{9-11} = 0; - let Inst{12-13} = 0; - let Inst{14-23} = 0; - let Inst{24-31} = IMM; + let Inst{9...11} = 0; + let Inst{12...13} = 0; + let Inst{14...23} = 0; + let Inst{24...31} = IMM; // The instruction. 
- let Inst{38-42} = XT{4-0}; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-57} = XC{4-0}; - let Inst{58-59} = xo; + let Inst{38...42} = XT{4...0}; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...57} = XC{4...0}; + let Inst{58...59} = xo; let Inst{60} = XC{5}; let Inst{61} = XA{5}; let Inst{62} = XB{5}; @@ -369,11 +369,11 @@ class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RD; - let Inst{11-12} = 0; - let Inst{13-15} = N; - let Inst{16-20} = VB; - let Inst{21-31} = xo; + let Inst{6...10} = RD; + let Inst{11...12} = 0; + let Inst{13...15} = N; + let Inst{16...20} = VB; + let Inst{21...31} = xo; } @@ -382,14 +382,14 @@ class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr, class VXForm_VTB5_RA5_ins<bits<11> xo, string opc, list<dag> pattern> : VXForm_1<xo, (outs vrrc:$VD), (ins vrrc:$VDi, gprc:$VA, vrrc:$VB), !strconcat(opc, " $VD, $VA, $VB"), IIC_VecGeneral, pattern>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; // VX-Form: [PO VRT RA RB XO]. // Destructive (insert) forms are suffixed with _ins. 
class VXForm_VRT5_RAB5_ins<bits<11> xo, string opc, list<dag> pattern> : VXForm_1<xo, (outs vrrc:$VD), (ins vrrc:$VDi, gprc:$VA, gprc:$VB), !strconcat(opc, " $VD, $VA, $VB"), IIC_VecGeneral, pattern>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; // VX-Form: [ PO BF // VRA VRB XO ] class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr, @@ -401,11 +401,11 @@ class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-8} = BF; - let Inst{9-10} = 0; - let Inst{11-15} = VA; - let Inst{16-20} = VB; - let Inst{21-31} = xo; + let Inst{6...8} = BF; + let Inst{9...10} = 0; + let Inst{11...15} = VA; + let Inst{16...20} = VB; + let Inst{21...31} = xo; } // VN-Form: [PO VRT VRA VRB PS SD XO] @@ -420,12 +420,12 @@ class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = VRT; - let Inst{11-15} = VRA; - let Inst{16-20} = VRB; - let Inst{21-22} = ps; - let Inst{23-25} = SD; - let Inst{26-31} = xo; + let Inst{6...10} = VRT; + let Inst{11...15} = VRA; + let Inst{16...20} = VRB; + let Inst{21...22} = ps; + let Inst{23...25} = SD; + let Inst{26...31} = xo; } class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL, @@ -437,11 +437,11 @@ class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = RD; - let Inst{11-14} = eo; + let Inst{6...10} = RD; + let Inst{11...14} = eo; let Inst{15} = MP; - let Inst{16-20} = VB; - let Inst{21-31} = xo; + let Inst{16...20} = VB; + let Inst{21...31} = xo; } // 8RR:D-Form: [ 1 1 0 // // imm0 @@ -456,17 +456,17 @@ class 8RR_DForm_IMM32_XT6<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. 
- let Inst{6-7} = 1; - let Inst{8-11} = 0; - let Inst{12-13} = 0; // reserved - let Inst{14-15} = 0; // reserved - let Inst{16-31} = IMM32{31-16}; + let Inst{6...7} = 1; + let Inst{8...11} = 0; + let Inst{12...13} = 0; // reserved + let Inst{14...15} = 0; // reserved + let Inst{16...31} = IMM32{31...16}; // The instruction. - let Inst{38-42} = XT{4-0}; - let Inst{43-46} = xo; + let Inst{38...42} = XT{4...0}; + let Inst{43...46} = xo; let Inst{47} = XT{5}; - let Inst{48-63} = IMM32{15-0}; + let Inst{48...63} = IMM32{15...0}; } // 8RR:D-Form: [ 1 1 0 // // imm0 @@ -482,18 +482,18 @@ class 8RR_DForm_IMM32_XT6_IX<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 1; - let Inst{8-11} = 0; - let Inst{12-13} = 0; // reserved - let Inst{14-15} = 0; // reserved - let Inst{16-31} = IMM32{31-16}; + let Inst{6...7} = 1; + let Inst{8...11} = 0; + let Inst{12...13} = 0; // reserved + let Inst{14...15} = 0; // reserved + let Inst{16...31} = IMM32{31...16}; // The instruction. - let Inst{38-42} = XT{4-0}; - let Inst{43-45} = xo; + let Inst{38...42} = XT{4...0}; + let Inst{43...45} = xo; let Inst{46} = IX; let Inst{47} = XT{5}; - let Inst{48-63} = IMM32{15-0}; + let Inst{48...63} = IMM32{15...0}; } class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, @@ -507,17 +507,17 @@ class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 1; - let Inst{8-11} = 0; - let Inst{12-13} = 0; - let Inst{14-31} = 0; + let Inst{6...7} = 1; + let Inst{8...11} = 0; + let Inst{12...13} = 0; + let Inst{14...31} = 0; // The instruction. 
- let Inst{38-42} = XT{4-0}; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-57} = XC{4-0}; - let Inst{58-59} = xo; + let Inst{38...42} = XT{4...0}; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...57} = XC{4...0}; + let Inst{58...59} = xo; let Inst{60} = XC{5}; let Inst{61} = XA{5}; let Inst{62} = XB{5}; @@ -537,18 +537,18 @@ class 8RR_XX4Form_IMM3_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 1; - let Inst{8-11} = 0; - let Inst{12-13} = 0; - let Inst{14-28} = 0; - let Inst{29-31} = IMM; + let Inst{6...7} = 1; + let Inst{8...11} = 0; + let Inst{12...13} = 0; + let Inst{14...28} = 0; + let Inst{29...31} = IMM; // The instruction. - let Inst{38-42} = XT{4-0}; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-57} = XC{4-0}; - let Inst{58-59} = xo; + let Inst{38...42} = XT{4...0}; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...57} = XC{4...0}; + let Inst{58...59} = xo; let Inst{60} = XC{5}; let Inst{61} = XA{5}; let Inst{62} = XB{5}; @@ -565,11 +565,11 @@ class XX2_BF3_XO5_XB6_XO9<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, let Pattern = pattern; - let Inst{6-8} = BF; - let Inst{9-10} = 0; - let Inst{11-15} = xo2; - let Inst{16-20} = XB{4-0}; - let Inst{21-29} = xo; + let Inst{6...8} = BF; + let Inst{9...10} = 0; + let Inst{11...15} = xo2; + let Inst{16...20} = XB{4...0}; + let Inst{21...29} = xo; let Inst{30} = XB{5}; let Inst{31} = 0; } @@ -863,11 +863,11 @@ class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-9} = XTp{3-0}; + let Inst{6...9} = XTp{3...0}; let Inst{10} = XTp{4}; - let Inst{11-15} = RA; - let Inst{16-27} = DQ; - let Inst{28-31} = xo; + let Inst{11...15} = RA; + let Inst{16...27} = DQ; + let Inst{28...31} = xo; } class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, @@ -878,11 +878,11 @@ class 
XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, bits<5> RB; let Pattern = pattern; - let Inst{6-9} = XTp{3-0}; + let Inst{6...9} = XTp{3...0}; let Inst{10} = XTp{4}; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-30} = xo; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -896,16 +896,16 @@ class 8LS_DForm_R_XTp5_SI34_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Pattern = pattern; // The prefix. - let Inst{6-10} = 0; + let Inst{6...10} = 0; let Inst{11} = PCRel; - let Inst{12-13} = 0; - let Inst{14-31} = D{33-16}; // Imm18 + let Inst{12...13} = 0; + let Inst{14...31} = D{33...16}; // Imm18 // The instruction. - let Inst{38-41} = XTp{3-0}; + let Inst{38...41} = XTp{3...0}; let Inst{42} = XTp{4}; - let Inst{43-47} = RA; - let Inst{48-63} = D{15-0}; + let Inst{43...47} = RA; + let Inst{48...63} = D{15...0}; } multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> opcode, dag OOL, @@ -935,11 +935,11 @@ class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = AT; - let Inst{9-10} = 0; - let Inst{11-15} = xo2; - let Inst{16-20} = 0; - let Inst{21-30} = xo; + let Inst{6...8} = AT; + let Inst{9...10} = 0; + let Inst{11...15} = xo2; + let Inst{16...20} = 0; + let Inst{21...30} = xo; let Inst{31} = 0; } @@ -952,10 +952,10 @@ class XForm_XT6_IMM5<bits<6> opcode, bits<5> eo, bits<10> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-10} = XT{4-0}; - let Inst{11-15} = eo; - let Inst{16-20} = UIM; - let Inst{21-30} = xo; + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = eo; + let Inst{16...20} = UIM; + let Inst{21...30} = xo; let Inst{31} = XT{5}; } @@ -969,11 +969,11 @@ class XX3Form_AT3_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; - let Inst{6-8} = AT; - let Inst{9-10} = 0; - let Inst{11-15} = XA{4-0}; - let Inst{16-20} = XB{4-0}; - let Inst{21-28} = xo; + let Inst{6...8} = AT; + let 
Inst{9...10} = 0; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21...28} = xo; let Inst{29} = XA{5}; let Inst{30} = XB{5}; let Inst{31} = 0; @@ -993,20 +993,20 @@ class MMIRR_XX3Form_XY4P2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-15} = 0; - let Inst{16-17} = PMSK; - let Inst{18-23} = 0; - let Inst{24-27} = XMSK; - let Inst{28-31} = YMSK; + let Inst{6...7} = 3; + let Inst{8...11} = 9; + let Inst{12...15} = 0; + let Inst{16...17} = PMSK; + let Inst{18...23} = 0; + let Inst{24...27} = XMSK; + let Inst{28...31} = YMSK; // The instruction. - let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; + let Inst{38...40} = AT; + let Inst{41...42} = 0; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...60} = xo; let Inst{61} = XA{5}; let Inst{62} = XB{5}; let Inst{63} = 0; @@ -1025,18 +1025,18 @@ class MMIRR_XX3Form_XY4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-23} = 0; - let Inst{24-27} = XMSK; - let Inst{28-31} = YMSK; + let Inst{6...7} = 3; + let Inst{8...11} = 9; + let Inst{12...23} = 0; + let Inst{24...27} = XMSK; + let Inst{28...31} = YMSK; // The instruction. - let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; + let Inst{38...40} = AT; + let Inst{41...42} = 0; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...60} = xo; let Inst{61} = XA{5}; let Inst{62} = XB{5}; let Inst{63} = 0; @@ -1055,19 +1055,19 @@ class MMIRR_XX3Form_X4Y2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. 
- let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-23} = 0; - let Inst{24-27} = XMSK; - let Inst{28-29} = YMSK; - let Inst{30-31} = 0; + let Inst{6...7} = 3; + let Inst{8...11} = 9; + let Inst{12...23} = 0; + let Inst{24...27} = XMSK; + let Inst{28...29} = YMSK; + let Inst{30...31} = 0; // The instruction. - let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; + let Inst{38...40} = AT; + let Inst{41...42} = 0; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...60} = xo; let Inst{61} = XA{5}; let Inst{62} = XB{5}; let Inst{63} = 0; @@ -1087,19 +1087,19 @@ class MMIRR_XX3Form_XY4P8_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-15} = 0; - let Inst{16-23} = PMSK; - let Inst{24-27} = XMSK; - let Inst{28-31} = YMSK; + let Inst{6...7} = 3; + let Inst{8...11} = 9; + let Inst{12...15} = 0; + let Inst{16...23} = PMSK; + let Inst{24...27} = XMSK; + let Inst{28...31} = YMSK; // The instruction. - let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; + let Inst{38...40} = AT; + let Inst{41...42} = 0; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...60} = xo; let Inst{61} = XA{5}; let Inst{62} = XB{5}; let Inst{63} = 0; @@ -1119,20 +1119,20 @@ class MMIRR_XX3Form_XYP4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, let Pattern = pattern; // The prefix. - let Inst{6-7} = 3; - let Inst{8-11} = 9; - let Inst{12-15} = 0; - let Inst{16-19} = PMSK; - let Inst{20-23} = 0; - let Inst{24-27} = XMSK; - let Inst{28-31} = YMSK; + let Inst{6...7} = 3; + let Inst{8...11} = 9; + let Inst{12...15} = 0; + let Inst{16...19} = PMSK; + let Inst{20...23} = 0; + let Inst{24...27} = XMSK; + let Inst{28...31} = YMSK; // The instruction. 
- let Inst{38-40} = AT; - let Inst{41-42} = 0; - let Inst{43-47} = XA{4-0}; - let Inst{48-52} = XB{4-0}; - let Inst{53-60} = xo; + let Inst{38...40} = AT; + let Inst{41...42} = 0; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...60} = xo; let Inst{61} = XA{5}; let Inst{62} = XB{5}; let Inst{63} = 0; @@ -1395,7 +1395,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, Predicates = [P [(set v2i64:$XT, (PPCxxsplti32dx v2i64:$XTi, i32:$IX, i32:$IMM32))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; + RegConstraint<"$XTi = $XT">; } let Predicates = [IsISA3_1] in { @@ -1466,13 +1466,13 @@ let Predicates = [IsISA3_1] in { "vinsw $VD, $VB, $VA", IIC_VecGeneral, [(set v4i32:$VD, (int_ppc_altivec_vinsw v4i32:$VDi, i32:$VB, timm:$VA))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; def VINSD : VXForm_1<463, (outs vrrc:$VD), (ins vrrc:$VDi, u4imm:$VA, g8rc:$VB), "vinsd $VD, $VB, $VA", IIC_VecGeneral, [(set v2i64:$VD, (int_ppc_altivec_vinsd v2i64:$VDi, i64:$VB, timm:$VA))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; def VINSBVLX : VXForm_VTB5_RA5_ins<15, "vinsbvlx", [(set v16i8:$VD, @@ -1538,13 +1538,13 @@ let Predicates = [IsISA3_1] in { "vinsdlx $VD, $VA, $VB", IIC_VecGeneral, [(set v2i64:$VD, (int_ppc_altivec_vinsdlx v2i64:$VDi, i64:$VA, i64:$VB))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; def VINSDRX : VXForm_1<975, (outs vrrc:$VD), (ins vrrc:$VDi, g8rc:$VA, g8rc:$VB), "vinsdrx $VD, $VA, $VB", IIC_VecGeneral, [(set v2i64:$VD, (int_ppc_altivec_vinsdrx v2i64:$VDi, i64:$VA, i64:$VB))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; def VEXTRACTBM : VXForm_RD5_XO5_RS5<1602, 8, (outs gprc:$VD), (ins vrrc:$VB), "vextractbm $VD, $VB", IIC_VecGeneral, [(set i32:$VD, @@ -1915,10 +1915,11 @@ let Predicates = [IsISA3_1] in { [(set v1i128:$VD, (int_ppc_altivec_vrlqmi v1i128:$VA, 
v1i128:$VB, v1i128:$VDi))]>, - RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">; + RegConstraint<"$VDi = $VD">; def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>; def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>; - def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>; + def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", + [(set v4i32:$VD, (PPCvsrq v4i32:$VA, v4i32:$VB))]>; def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>; def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>; def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>; @@ -2053,6 +2054,9 @@ let Predicates = [IsISA3_1, HasFPU] in { //---------------------------- Anonymous Patterns ----------------------------// let Predicates = [IsISA3_1] in { + // Exploit vsrq instruction to optimize VSR(VSRO (input, vsro_byte_shift), vsr_bit_shift) + // to VSRQ(input, vsrq_bit_shift) + def : Pat<(VSRVSRO v4i32:$vA, v4i32:$vB), (VSRQ $vA, $vB)>; // Exploit the vector multiply high instructions using intrinsics. def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)), (v4i32 (VMULHSW $vA, $vB))>; @@ -2230,6 +2234,13 @@ def VEqv (v4i32(bitconvert node:$a)), (v4i32(bitconvert node:$b)))))]>; +// Vector NAND operation (not(and)) +def VNand + : PatFrags<(ops node:$a, node:$b), [(vnot(and node:$a, node:$b)), + (bitconvert(vnot(and + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + // ============================================================================= // XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd // This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C) @@ -2265,6 +2276,56 @@ multiclass XXEvalTernarySelectAnd<ValueType Vt> { Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>; } +// ============================================================================= +// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectB +// This class matches the equivalent Ternary Operation: A ? f(B,C) : B +// and emit the corresponding xxeval instruction with the imm value. 
+// +// The patterns implement xxeval vector select operations where: +// - A is the selector vector +// - f(B,C) is the "true" case op on vectors B and C (AND, NOR, EQV, NAND) +// - B is the "false" case operand (vector B) +// +// Note: Patterns (A? C : B) and (A? not(C) : B) are not considered +// for XXEVAL instruction (4 Cycle) as XXSEL (3 cycle) instruction performs +// better. +// ============================================================================= +multiclass XXEvalTernarySelectB<ValueType Vt>{ + // Pattern: (A ? AND(B,C) : B) XXEVAL immediate value: 49 + def : XXEvalPattern<Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), Vt:$vB), 49>; + // Pattern: (A ? NOR(B,C) : B) XXEVAL immediate value: 56 + def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), Vt:$vB), 56>; + // Pattern: (A ? EQV(B,C) : B) XXEVAL immediate value: 57 + def : XXEvalPattern<Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), Vt:$vB), 57>; + // Pattern: (A ? NAND(B,C) : B) XXEVAL immediate value: 62 + def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), Vt:$vB), 62>; +} + +// ============================================================================= +// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectC +// This class matches the equivalent Ternary Operation: A ? f(B,C) : C +// and emit the corresponding xxeval instruction with the imm value. +// +// The patterns implement xxeval vector select operations where: +// - A is the selector vector +// - f(B,C) is the "true" case op on vectors B and C (AND, NOR, EQV, NAND) +// - C is the "false" case operand (vector C) +// +// Note: Patterns (A? B : C) and (A? not(B) : C) are not considered +// for XXEVAL instruction (4 Cycle) as XXSEL (3 cycle) instruction performs +// better. +// ============================================================================= +multiclass XXEvalTernarySelectC<ValueType Vt>{ + // Pattern: (A ? 
AND(B,C) : C) XXEVAL immediate value: 81 + def : XXEvalPattern<Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), Vt:$vC), 81>; + // Pattern: (A ? NOR(B,C) : C) XXEVAL immediate value: 88 + def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), Vt:$vC), 88>; + // Pattern: (A ? EQV(B,C) : C) XXEVAL immediate value: 89 + def : XXEvalPattern<Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), Vt:$vC), 89>; + // Pattern: (A ? NAND(B,C) : C) XXEVAL immediate value: 94 + def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), Vt:$vC), 94>; +} + let Predicates = [PrefixInstrs, HasP10Vector] in { let AddedComplexity = 400 in { def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A, @@ -2376,6 +2437,8 @@ let Predicates = [PrefixInstrs, HasP10Vector] in { // XXEval Patterns for ternary Operations. foreach Ty = [v4i32, v2i64, v8i16, v16i8] in { defm : XXEvalTernarySelectAnd<Ty>; + defm : XXEvalTernarySelectB<Ty>; + defm : XXEvalTernarySelectC<Ty>; } // Anonymous patterns to select prefixed VSX loads and stores. 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/llvm/lib/Target/PowerPC/PPCInstrSPE.td index e91cae349e08..5104cc6f5607 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrSPE.td +++ b/llvm/lib/Target/PowerPC/PPCInstrSPE.td @@ -20,10 +20,10 @@ class EFXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-31} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...31} = xo; } class EFXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr, @@ -45,11 +45,11 @@ class EFXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr, bits<5> RA; bits<5> RB; - let Inst{6-8} = crD; - let Inst{9-10} = 0; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-31} = xo; + let Inst{6...8} = crD; + let Inst{9...10} = 0; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...31} = xo; } class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr, @@ -61,10 +61,10 @@ class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-31} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...31} = xo; } class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr, @@ -88,11 +88,11 @@ class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-8} = crD; - let Inst{9-10} = 0; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-31} = xo; + let Inst{6...8} = crD; + let Inst{9...10} = 0; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...31} = xo; } class EVXForm_4<bits<8> xo, dag OOL, dag IOL, string asmstr, @@ -105,11 +105,11 @@ class EVXForm_4<bits<8> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = RB; - let Inst{21-28} = xo; - let 
Inst{29-31} = crD; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = RB; + let Inst{21...28} = xo; + let Inst{29...31} = crD; } class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr, @@ -121,10 +121,10 @@ class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr, let Pattern = pattern; - let Inst{6-10} = RT; - let Inst{11-15} = RA; - let Inst{16-20} = D; - let Inst{21-31} = xo; + let Inst{6...10} = RT; + let Inst{11...15} = RA; + let Inst{16...20} = D; + let Inst{21...31} = xo; } let DecoderNamespace = "SPE", Predicates = [HasSPE] in { diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 19448210f5db..4e5165bfcda5 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -236,7 +236,7 @@ class X_VT5_VA5_VB5_FMA<bits<6> opcode, bits<10> xo, string opc, list<dag> pattern> : XForm_1<opcode, xo, (outs vrrc:$RST), (ins vrrc:$RSTi, vrrc:$RA, vrrc:$RB), !strconcat(opc, " $RST, $RA, $RB"), IIC_VecFP, pattern>, - RegConstraint<"$RSTi = $RST">, NoEncode<"$RSTi">; + RegConstraint<"$RSTi = $RST">; // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5_FMA_Ro<bits<6> opcode, bits<10> xo, string opc, @@ -402,13 +402,13 @@ let hasSideEffects = 0 in { (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsmaddadp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (any_fma f64:$XA, f64:$XB, f64:$XTi))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSMADDMDP : XX3Form<60, 41, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -418,13 +418,13 @@ let hasSideEffects = 0 in { (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsmsubadp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (any_fma f64:$XA, f64:$XB, 
(fneg f64:$XTi)))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSMSUBMDP : XX3Form<60, 57, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -434,13 +434,13 @@ let hasSideEffects = 0 in { (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsnmaddadp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fneg (any_fma f64:$XA, f64:$XB, f64:$XTi)))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSNMADDMDP : XX3Form<60, 169, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -450,13 +450,13 @@ let hasSideEffects = 0 in { (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsnmsubadp $XT, $XA, $XB", IIC_VecFP, [(set f64:$XT, (fneg (any_fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XSNMSUBMDP : XX3Form<60, 185, (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -466,13 +466,13 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmaddadp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (any_fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVMADDMDP : XX3Form<60, 105, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + 
RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -482,13 +482,13 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmaddasp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (any_fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVMADDMSP : XX3Form<60, 73, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -498,13 +498,13 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmsubadp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (any_fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVMSUBMDP : XX3Form<60, 121, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -514,13 +514,13 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmsubasp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (any_fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVMSUBMSP : XX3Form<60, 89, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -530,13 +530,13 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmaddadp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fneg (any_fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, 
AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVNMADDMDP : XX3Form<60, 233, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -546,13 +546,13 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmaddasp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVNMADDMSP : XX3Form<60, 201, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -562,13 +562,13 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmsubadp $XT, $XA, $XB", IIC_VecFP, [(set v2f64:$XT, (fneg (any_fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVNMSUBMDP : XX3Form<60, 249, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -578,13 +578,13 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmsubasp $XT, $XA, $XB", IIC_VecFP, [(set v4f32:$XT, (fneg (any_fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; let IsVSXFMAAlt = 1 in def XVNMSUBMSP : XX3Form<60, 217, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -1199,7 +1199,7 @@ let Predicates = [HasVSX, 
HasP8Vector] in { (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsmaddasp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (any_fma f32:$XA, f32:$XB, f32:$XTi))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; // FIXME: Setting the hasSideEffects flag here to match current behaviour. let IsVSXFMAAlt = 1, hasSideEffects = 1 in @@ -1207,7 +1207,7 @@ let Predicates = [HasVSX, HasP8Vector] in { (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -1219,7 +1219,7 @@ let Predicates = [HasVSX, HasP8Vector] in { "xsmsubasp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (any_fma f32:$XA, f32:$XB, (fneg f32:$XTi)))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; // FIXME: Setting the hasSideEffects flag here to match current behaviour. let IsVSXFMAAlt = 1, hasSideEffects = 1 in @@ -1227,7 +1227,7 @@ let Predicates = [HasVSX, HasP8Vector] in { (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -1239,7 +1239,7 @@ let Predicates = [HasVSX, HasP8Vector] in { "xsnmaddasp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fneg (any_fma f32:$XA, f32:$XB, f32:$XTi)))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; // FIXME: Setting the hasSideEffects flag here to match current behaviour. 
let IsVSXFMAAlt = 1, hasSideEffects = 1 in @@ -1247,7 +1247,7 @@ let Predicates = [HasVSX, HasP8Vector] in { (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -1259,7 +1259,7 @@ let Predicates = [HasVSX, HasP8Vector] in { "xsnmsubasp $XT, $XA, $XB", IIC_VecFP, [(set f32:$XT, (fneg (any_fma f32:$XA, f32:$XB, (fneg f32:$XTi))))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; // FIXME: Setting the hasSideEffects flag here to match current behaviour. let IsVSXFMAAlt = 1, hasSideEffects = 1 in @@ -1267,7 +1267,7 @@ let Predicates = [HasVSX, HasP8Vector] in { (outs vssrc:$XT), (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB), "xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, AltVSXFMARel; } @@ -1563,7 +1563,7 @@ let Predicates = [HasVSX, HasP9Vector] in { "xxinsertw $XT, $XB, $UIM5", IIC_VecFP, [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB, imm32SExt16:$UIM5))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; + RegConstraint<"$XTi = $XT">; // Vector Extract Unsigned Word // FIXME: Setting the hasSideEffects flag here to match current behaviour. 
@@ -1652,11 +1652,11 @@ let Predicates = [HasVSX, HasP9Vector] in { def XXPERM : XX3Form<60, 26, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XTi, vsrc:$XB), "xxperm $XT, $XA, $XB", IIC_VecPerm, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; + RegConstraint<"$XTi = $XT">; def XXPERMR : XX3Form<60, 58, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XTi, vsrc:$XB), "xxpermr $XT, $XA, $XB", IIC_VecPerm, []>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; + RegConstraint<"$XTi = $XT">; // Vector Splat Immediate Byte def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8), diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 996b6efb320d..736ba1edcaea 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -52,12 +52,11 @@ PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } -PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, - const std::string &TuneCPU, const std::string &FS, - const PPCTargetMachine &TM) - : PPCGenSubtargetInfo(TT, CPU, TuneCPU, FS), TargetTriple(TT), - IsPPC64(TargetTriple.getArch() == Triple::ppc64 || - TargetTriple.getArch() == Triple::ppc64le), +PPCSubtarget::PPCSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, + StringRef FS, const PPCTargetMachine &TM) + : PPCGenSubtargetInfo(TT, CPU, TuneCPU, FS), + IsPPC64(getTargetTriple().getArch() == Triple::ppc64 || + getTargetTriple().getArch() == Triple::ppc64le), TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, TuneCPU, FS)), InstrInfo(*this), TLInfo(TM, *this) { TSInfo = std::make_unique<PPCSelectionDAGInfo>(); @@ -87,10 +86,10 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, // Determine default and user specified characteristics std::string CPUName = std::string(CPU); if (CPUName.empty() || CPU == "generic") { - if (TargetTriple.getSubArch() == Triple::PPCSubArch_spe) + if 
(getTargetTriple().getSubArch() == Triple::PPCSubArch_spe) CPUName = "e500"; else - CPUName = std::string(PPC::getNormalizedPPCTargetCPU(TargetTriple)); + CPUName = std::string(PPC::getNormalizedPPCTargetCPU(getTargetTriple())); } // Determine the CPU to schedule for. @@ -107,7 +106,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, if (IsPPC64 && has64BitSupport()) Use64BitRegs = true; - if (TargetTriple.isPPC32SecurePlt()) + if (getTargetTriple().isPPC32SecurePlt()) IsSecurePlt = true; if (HasSPE && IsPPC64) @@ -126,7 +125,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, IsLittleEndian = TM.isLittleEndian(); if (HasAIXSmallLocalExecTLS || HasAIXSmallLocalDynamicTLS) { - if (!TargetTriple.isOSAIX() || !IsPPC64) + if (!getTargetTriple().isOSAIX() || !IsPPC64) report_fatal_error("The aix-small-local-[exec|dynamic]-tls attribute is " "only supported on AIX in " "64-bit mode.\n", @@ -143,7 +142,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, false); } - if (HasAIXShLibTLSModelOpt && (!TargetTriple.isOSAIX() || !IsPPC64)) + if (HasAIXShLibTLSModelOpt && (!getTargetTriple().isOSAIX() || !IsPPC64)) report_fatal_error("The aix-shared-lib-tls-model-opt attribute " "is only supported on AIX in 64-bit mode.\n", false); diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 3c59a475c7eb..c17fca7f70a3 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -78,9 +78,6 @@ public: }; protected: - /// TargetTriple - What processor and OS we're targeting. - Triple TargetTriple; - /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. Align StackAlignment; @@ -119,8 +116,7 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. 
/// - PPCSubtarget(const Triple &TT, const std::string &CPU, - const std::string &TuneCPU, const std::string &FS, + PPCSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, const PPCTargetMachine &TM); ~PPCSubtarget() override; @@ -210,13 +206,11 @@ public: POPCNTDKind hasPOPCNTD() const { return HasPOPCNTD; } - const Triple &getTargetTriple() const { return TargetTriple; } - - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetELF() const { return getTargetTriple().isOSBinFormatELF(); } + bool isTargetMachO() const { return getTargetTriple().isOSBinFormatMachO(); } + bool isTargetLinux() const { return getTargetTriple().isOSLinux(); } - bool isAIXABI() const { return TargetTriple.isOSAIX(); } + bool isAIXABI() const { return getTargetTriple().isOSAIX(); } bool isSVR4ABI() const { return !isAIXABI(); } bool isELFv2ABI() const; diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index b5c6ac111dff..ae92d5eab20c 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -129,7 +129,7 @@ LLVMInitializePowerPCTarget() { initializePPCLoopInstrFormPrepPass(PR); initializePPCTOCRegDepsPass(PR); initializePPCEarlyReturnPass(PR); - initializePPCVSXCopyPass(PR); + initializePPCVSXWACCCopyPass(PR); initializePPCVSXFMAMutatePass(PR); initializePPCVSXSwapRemovalPass(PR); initializePPCReduceCRLogicalsPass(PR); @@ -528,7 +528,7 @@ bool PPCPassConfig::addInstSelector() { addPass(createPPCCTRLoopsVerify()); #endif - addPass(createPPCVSXCopyPass()); + addPass(createPPCVSXWACCCopyPass()); return false; } diff --git a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp deleted file mode 100644 index 794095cd4376..000000000000 --- 
a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp +++ /dev/null @@ -1,159 +0,0 @@ -//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A pass which deals with the complexity of generating legal VSX register -// copies to/from register classes which partially overlap with the VSX -// register file. -// -//===----------------------------------------------------------------------===// - -#include "PPC.h" -#include "PPCInstrInfo.h" -#include "PPCTargetMachine.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -#define DEBUG_TYPE "ppc-vsx-copy" - -namespace { - // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers - // (Altivec and scalar floating-point registers), we need to transform the - // copies into subregister copies with other restrictions. 
- struct PPCVSXCopy : public MachineFunctionPass { - static char ID; - PPCVSXCopy() : MachineFunctionPass(ID) {} - - const TargetInstrInfo *TII; - - bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC, - MachineRegisterInfo &MRI) { - if (Register::isVirtualRegister(Reg)) { - return RC->hasSubClassEq(MRI.getRegClass(Reg)); - } else if (RC->contains(Reg)) { - return true; - } - - return false; - } - - bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) { - return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI); - } - - bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) { - return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI); - } - - bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) { - return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI); - } - - bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) { - return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI); - } - - bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) { - return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI); - } - -protected: - bool processBlock(MachineBasicBlock &MBB) { - bool Changed = false; - - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - for (MachineInstr &MI : MBB) { - if (!MI.isFullCopy()) - continue; - - MachineOperand &DstMO = MI.getOperand(0); - MachineOperand &SrcMO = MI.getOperand(1); - - if ( IsVSReg(DstMO.getReg(), MRI) && - !IsVSReg(SrcMO.getReg(), MRI)) { - // This is a copy *to* a VSX register from a non-VSX register. - Changed = true; - - const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass; - assert((IsF8Reg(SrcMO.getReg(), MRI) || - IsVSSReg(SrcMO.getReg(), MRI) || - IsVSFReg(SrcMO.getReg(), MRI)) && - "Unknown source for a VSX copy"); - - Register NewVReg = MRI.createVirtualRegister(SrcRC); - BuildMI(MBB, MI, MI.getDebugLoc(), - TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg) - .addImm(1) // add 1, not 0, because there is no implicit clearing - // of the high bits. 
- .add(SrcMO) - .addImm(PPC::sub_64); - - // The source of the original copy is now the new virtual register. - SrcMO.setReg(NewVReg); - } else if (!IsVSReg(DstMO.getReg(), MRI) && - IsVSReg(SrcMO.getReg(), MRI)) { - // This is a copy *from* a VSX register to a non-VSX register. - Changed = true; - - const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass; - assert((IsF8Reg(DstMO.getReg(), MRI) || - IsVSFReg(DstMO.getReg(), MRI) || - IsVSSReg(DstMO.getReg(), MRI)) && - "Unknown destination for a VSX copy"); - - // Copy the VSX value into a new VSX register of the correct subclass. - Register NewVReg = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), - NewVReg) - .add(SrcMO); - - // Transform the original copy into a subregister extraction copy. - SrcMO.setReg(NewVReg); - SrcMO.setSubReg(PPC::sub_64); - } - } - - return Changed; - } - -public: - bool runOnMachineFunction(MachineFunction &MF) override { - // If we don't have VSX on the subtarget, don't do anything. 
- const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>(); - if (!STI.hasVSX()) - return false; - TII = STI.getInstrInfo(); - - bool Changed = false; - - for (MachineBasicBlock &B : llvm::make_early_inc_range(MF)) - if (processBlock(B)) - Changed = true; - - return Changed; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - MachineFunctionPass::getAnalysisUsage(AU); - } - }; - } // end anonymous namespace - -INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE, - "PowerPC VSX Copy Legalization", false, false) - -char PPCVSXCopy::ID = 0; -FunctionPass* -llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); } diff --git a/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp b/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp new file mode 100644 index 000000000000..2ec566ddb0b8 --- /dev/null +++ b/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp @@ -0,0 +1,182 @@ +//===--------- PPCVSXWACCCopy.cpp - VSX and WACC Copy Legalization --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A pass which deals with the complexity of generating legal VSX register +// copies to/from register classes which partially overlap with the VSX +// register file and combines the wacc/wacc_hi copies when needed. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-copy"
+
+namespace {
+// PPCVSXWACCCopy pass - Rewrites full COPY instructions in two situations:
+//
+//  * Copies between VSX registers and non-VSX registers (Altivec and scalar
+//    floating-point registers) are transformed into subregister copies with
+//    other restrictions.
+//  * A copy into a wacc_hirc register from a waccrc register that was itself
+//    copied from the sub_wacc_hi subregister of a dmrrc register is rewritten
+//    to read that subregister directly.
+struct PPCVSXWACCCopy : public MachineFunctionPass {
+  static char ID;
+  PPCVSXWACCCopy() : MachineFunctionPass(ID) {}
+
+  // Assigned in runOnMachineFunction before any block is processed.
+  const TargetInstrInfo *TII = nullptr;
+
+  // Returns true if Reg belongs to RC. A virtual register matches when its
+  // register class is RC or a subclass of RC; any other register matches when
+  // RC contains it.
+  bool IsRegInClass(Register Reg, const TargetRegisterClass *RC,
+                    MachineRegisterInfo &MRI) {
+    if (Reg.isVirtual())
+      return RC->hasSubClassEq(MRI.getRegClass(Reg));
+    return RC->contains(Reg);
+  }
+
+  // Convenience wrappers around IsRegInClass for the PPC register classes
+  // this pass distinguishes.
+  bool IsVSReg(Register Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
+  }
+
+  bool IsVRReg(Register Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
+  }
+
+  bool IsF8Reg(Register Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
+  }
+
+  bool IsVSFReg(Register Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
+  }
+
+  bool IsVSSReg(Register Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI);
+  }
+
+protected:
+  // Rewrites every full copy in MBB that matches one of the patterns handled
+  // by this pass. Returns true if any instruction was changed.
+  bool processBlock(MachineBasicBlock &MBB) {
+    bool Changed = false;
+
+    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    for (MachineInstr &MI : MBB) {
+      if (!MI.isFullCopy())
+        continue;
+
+      MachineOperand &DstMO = MI.getOperand(0);
+      MachineOperand &SrcMO = MI.getOperand(1);
+      // Snapshot the registers up front; SrcMO is only rewritten after all of
+      // the classification checks below have been made.
+      const Register DstReg = DstMO.getReg();
+      const Register SrcReg = SrcMO.getReg();
+
+      if (IsVSReg(DstReg, MRI) && !IsVSReg(SrcReg, MRI)) {
+        // This is a copy *to* a VSX register from a non-VSX register.
+        Changed = true;
+
+        const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass;
+        assert((IsF8Reg(SrcReg, MRI) || IsVSSReg(SrcReg, MRI) ||
+                IsVSFReg(SrcReg, MRI)) &&
+               "Unknown source for a VSX copy");
+
+        Register NewVReg = MRI.createVirtualRegister(SrcRC);
+        BuildMI(MBB, MI, MI.getDebugLoc(),
+                TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
+            .addImm(1) // add 1, not 0, because there is no implicit clearing
+                       // of the high bits.
+            .add(SrcMO)
+            .addImm(PPC::sub_64);
+
+        // The source of the original copy is now the new virtual register.
+        SrcMO.setReg(NewVReg);
+      } else if (!IsVSReg(DstReg, MRI) && IsVSReg(SrcReg, MRI)) {
+        // This is a copy *from* a VSX register to a non-VSX register.
+        Changed = true;
+
+        const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass;
+        assert((IsF8Reg(DstReg, MRI) || IsVSFReg(DstReg, MRI) ||
+                IsVSSReg(DstReg, MRI)) &&
+               "Unknown destination for a VSX copy");
+
+        // Copy the VSX value into a new VSX register of the correct subclass.
+        Register NewVReg = MRI.createVirtualRegister(DstRC);
+        BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+                NewVReg)
+            .add(SrcMO);
+
+        // Transform the original copy into a subregister extraction copy.
+        SrcMO.setReg(NewVReg);
+        SrcMO.setSubReg(PPC::sub_64);
+      } else if (IsRegInClass(DstReg, &PPC::WACC_HIRCRegClass, MRI) &&
+                 IsRegInClass(SrcReg, &PPC::WACCRCRegClass, MRI)) {
+        // Matches the pattern:
+        //   %a:waccrc = COPY %b.sub_wacc_hi:dmrrc
+        //   %c:wacc_hirc = COPY %a:waccrc
+        // And replaces it with:
+        //   %c:wacc_hirc = COPY %b.sub_wacc_hi:dmrrc
+        MachineInstr *DefMI = MRI.getUniqueVRegDef(SrcReg);
+        if (!DefMI || !DefMI->isCopy())
+          continue;
+
+        MachineOperand &OrigSrc = DefMI->getOperand(1);
+
+        if (!IsRegInClass(OrigSrc.getReg(), &PPC::DMRRCRegClass, MRI))
+          continue;
+
+        if (OrigSrc.getSubReg() != PPC::sub_wacc_hi)
+          continue;
+
+        // Rewrite the second copy to use the original register's subreg
+        SrcMO.setReg(OrigSrc.getReg());
+        SrcMO.setSubReg(PPC::sub_wacc_hi);
+        Changed = true;
+
+        // Remove the intermediate copy if safe.
+        // NOTE(review): only non-debug uses are consulted here, so debug
+        // users of the erased def (if any) are not updated - confirm that is
+        // acceptable.
+        if (MRI.use_nodbg_empty(DefMI->getOperand(0).getReg()))
+          DefMI->eraseFromParent();
+      }
+    }
+
+    return Changed;
+  }
+
+public:
+  // Runs processBlock over every block of MF. Does nothing (and returns
+  // false) when the subtarget has no VSX support.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // If we don't have VSX on the subtarget, don't do anything.
+    const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+    if (!STI.hasVSX())
+      return false;
+    TII = STI.getInstrInfo();
+
+    bool Changed = false;
+
+    // processBlock never adds or removes basic blocks, so a plain range-for
+    // over the function is sufficient.
+    for (MachineBasicBlock &B : MF)
+      Changed |= processBlock(B);
+
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // end anonymous namespace
+
+INITIALIZE_PASS(PPCVSXWACCCopy, DEBUG_TYPE, "PowerPC VSX Copy Legalization",
+                false, false)
+
+char PPCVSXWACCCopy::ID = 0;
+FunctionPass *llvm::createPPCVSXWACCCopyPass() { return new PPCVSXWACCCopy(); }
diff --git a/llvm/lib/Target/PowerPC/README_P9.txt b/llvm/lib/Target/PowerPC/README_P9.txt index ee1ea735acad..208c8abfdc5f 100644 --- a/llvm/lib/Target/PowerPC/README_P9.txt +++ b/llvm/lib/Target/PowerPC/README_P9.txt @@ -224,22 +224,22 @@ VSX: . isCommutable = 1 // xsmaddqp [(set f128:$vT, (fma f128:$vA, f128:$vB, f128:$vTi))]>, - RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">, + RegConstraint<"$vTi = $vT">, AltVSXFMARel; // xsmsubqp [(set f128:$vT, (fma f128:$vA, f128:$vB, (fneg f128:$vTi)))]>, - RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">, + RegConstraint<"$vTi = $vT">, AltVSXFMARel; // xsnmaddqp [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, f128:$vTi)))]>, - RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">, + RegConstraint<"$vTi = $vT">, AltVSXFMARel; // xsnmsubqp [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, (fneg f128:$vTi))))]>, - RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">, + RegConstraint<"$vTi = $vT">, AltVSXFMARel; - Round to Odd of QP (Negative) Multiply-{Add/Subtract}: @@ -276,22 +276,22 @@ VSX: .
isCommutable = 1 // xsmaddqpo [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, f128:$vTi))]>, - RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">, + RegConstraint<"$vTi = $vT">, AltVSXFMARel; // xsmsubqpo [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi)))]>, - RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">, + RegConstraint<"$vTi = $vT">, AltVSXFMARel; // xsnmaddqpo [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, f128:$vTi)))]>, - RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">, + RegConstraint<"$vTi = $vT">, AltVSXFMARel; // xsnmsubqpo [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi))))]>, - RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">, + RegConstraint<"$vTi = $vT">, AltVSXFMARel; - QP Compare Ordered/Unordered: xscmpoqp xscmpuqp @@ -405,7 +405,7 @@ Fixed Point Facility: But how to map to it?? [(set v1f128:$XT, (insertelement v1f128:$XTi, f128:$XB, i4:$UIMM))]>, - RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + RegConstraint<"$XTi = $XT">, . Or use intrinsic? 
(set v1f128:$XT, (int_ppc_vsx_xxinsertw v1f128:$XTi, f128:$XB, i4:$UIMM)) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 9ce44d0ff7fd..cd8392849ac4 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -121,7 +121,7 @@ class RISCVAsmParser : public MCTargetAsmParser { bool parseVTypeToken(const AsmToken &Tok, VTypeState &State, unsigned &Sew, unsigned &Lmul, bool &Fractional, bool &TailAgnostic, - bool &MaskAgnostic); + bool &MaskAgnostic, bool &AltFmt); bool generateVTypeError(SMLoc ErrorLoc); bool generateXSfmmVTypeError(SMLoc ErrorLoc); @@ -2261,14 +2261,23 @@ ParseStatus RISCVAsmParser::parseJALOffset(OperandVector &Operands) { bool RISCVAsmParser::parseVTypeToken(const AsmToken &Tok, VTypeState &State, unsigned &Sew, unsigned &Lmul, bool &Fractional, bool &TailAgnostic, - bool &MaskAgnostic) { + bool &MaskAgnostic, bool &AltFmt) { if (Tok.isNot(AsmToken::Identifier)) return true; StringRef Identifier = Tok.getIdentifier(); if (State < VTypeState::SeenSew && Identifier.consume_front("e")) { - if (Identifier.getAsInteger(10, Sew)) - return true; + if (Identifier.getAsInteger(10, Sew)) { + if (Identifier == "16alt") { + AltFmt = true; + Sew = 16; + } else if (Identifier == "8alt") { + AltFmt = true; + Sew = 8; + } else { + return true; + } + } if (!RISCVVType::isValidSEW(Sew)) return true; @@ -2340,11 +2349,12 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { bool Fractional = false; bool TailAgnostic = false; bool MaskAgnostic = false; + bool AltFmt = false; VTypeState State = VTypeState::SeenNothingYet; do { if (parseVTypeToken(getTok(), State, Sew, Lmul, Fractional, TailAgnostic, - MaskAgnostic)) { + MaskAgnostic, AltFmt)) { // The first time, errors return NoMatch rather than Failure if (State == VTypeState::SeenNothingYet) return ParseStatus::NoMatch; @@ -2370,12 +2380,17 @@ ParseStatus 
RISCVAsmParser::parseVTypeI(OperandVector &Operands) { } unsigned VTypeI = - RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic); + RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic, AltFmt); Operands.push_back(RISCVOperand::createVType(VTypeI, S)); return ParseStatus::Success; } bool RISCVAsmParser::generateVTypeError(SMLoc ErrorLoc) { + if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa)) + return Error( + ErrorLoc, + "operand must be " + "e[8|8alt|16|16alt|32|64],m[1|2|4|8|f2|f4|f8],[ta|tu],[ma|mu]"); return Error( ErrorLoc, "operand must be " diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 47329b2c2f4d..0ff178e1f195 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -7,7 +7,8 @@ tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter) tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred) tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler + --specialize-decoders-per-bitwidth) tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info) tablegen(LLVM RISCVGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM RISCVGenMCPseudoLowering.inc -gen-pseudo-lowering) @@ -87,6 +88,7 @@ add_llvm_target(RISCVCodeGen GlobalISel IPO MC + Passes RISCVDesc RISCVInfo Scalar diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index dbb16fce8390..89df9d82f878 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -46,8 +46,6 @@ public: raw_ostream &CStream) const override; private: - void addSPOperands(MCInst &MI) const; - DecodeStatus getInstruction48(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, 
raw_ostream &CStream) const; @@ -196,6 +194,12 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodeSPRegisterClass(MCInst &Inst, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createReg(RISCV::X2)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { @@ -558,7 +562,7 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, return decodeZcmpRlist(Inst, Imm, Address, Decoder); } -static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, +static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint16_t Insn, uint64_t Address, const MCDisassembler *Decoder) { uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5); @@ -600,15 +604,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, #include "RISCVGenDisassemblerTables.inc" -// Add implied SP operand for C.*SP compressed instructions. The SP operand -// isn't explicitly encoded in the instruction. 
-void RISCVDisassembler::addSPOperands(MCInst &MI) const { - const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); - for (unsigned i = 0; i < MCID.getNumOperands(); i++) - if (MCID.operands()[i].RegClass == RISCV::SPRegClassID) - MI.insert(MI.begin() + i, MCOperand::createReg(RISCV::X2)); -} - namespace { struct DecoderListEntry { @@ -656,6 +651,13 @@ static constexpr FeatureBitset XSfSystemGroup = { RISCV::FeatureVendorXSiFivecflushdlone, }; +static constexpr FeatureBitset XMIPSGroup = { + RISCV::FeatureVendorXMIPSLSP, + RISCV::FeatureVendorXMIPSCMov, + RISCV::FeatureVendorXMIPSCBOP, + RISCV::FeatureVendorXMIPSEXECTL, +}; + static constexpr FeatureBitset XTHeadGroup = { RISCV::FeatureVendorXTHeadBa, RISCV::FeatureVendorXTHeadBb, RISCV::FeatureVendorXTHeadBs, RISCV::FeatureVendorXTHeadCondMov, @@ -684,13 +686,7 @@ static constexpr DecoderListEntry DecoderList32[]{ {DecoderTableXSfvector32, XSfVectorGroup, "SiFive vector extensions"}, {DecoderTableXSfsystem32, XSfSystemGroup, "SiFive system extensions"}, {DecoderTableXSfcease32, {RISCV::FeatureVendorXSfcease}, "SiFive sf.cease"}, - {DecoderTableXmipslsp32, {RISCV::FeatureVendorXMIPSLSP}, "MIPS mips.lsp"}, - {DecoderTableXmipscmov32, - {RISCV::FeatureVendorXMIPSCMov}, - "MIPS mips.ccmov"}, - {DecoderTableXmipscbop32, - {RISCV::FeatureVendorXMIPSCBOP}, - "MIPS mips.pref"}, + {DecoderTableXMIPS32, XMIPSGroup, "Mips extensions"}, {DecoderTableXAndes32, XAndesGroup, "Andes extensions"}, {DecoderTableXSMT32, XSMTGroup, "SpacemiT extensions"}, // Standard Extensions @@ -700,6 +696,14 @@ static constexpr DecoderListEntry DecoderList32[]{ {DecoderTableZdinxRV32Only32, {}, "RV32-only Zdinx (Double in Integer)"}, }; +namespace { +// Define bitwidths for various types used to instantiate the decoder. +template <> constexpr uint32_t InsnBitWidth<uint16_t> = 16; +template <> constexpr uint32_t InsnBitWidth<uint32_t> = 32; +// Use uint64_t to represent 48 bit instructions. 
+template <> constexpr uint32_t InsnBitWidth<uint64_t> = 48; +} // namespace + DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, @@ -710,9 +714,7 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, } Size = 4; - // Use uint64_t to match getInstruction48. decodeInstruction is templated - // on the Insn type. - uint64_t Insn = support::endian::read32le(Bytes.data()); + uint32_t Insn = support::endian::read32le(Bytes.data()); for (const DecoderListEntry &Entry : DecoderList32) { if (!Entry.haveContainedFeatures(STI.getFeatureBits())) @@ -758,9 +760,7 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size, } Size = 2; - // Use uint64_t to match getInstruction48. decodeInstruction is templated - // on the Insn type. - uint64_t Insn = support::endian::read16le(Bytes.data()); + uint16_t Insn = support::endian::read16le(Bytes.data()); for (const DecoderListEntry &Entry : DecoderList16) { if (!Entry.haveContainedFeatures(STI.getFeatureBits())) @@ -769,12 +769,8 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size, LLVM_DEBUG(dbgs() << "Trying " << Entry.Desc << " table:\n"); DecodeStatus Result = decodeInstruction(Entry.Table, MI, Insn, Address, this, STI); - if (Result == MCDisassembler::Fail) - continue; - - addSPOperands(MI); - - return Result; + if (Result != MCDisassembler::Fail) + return Result; } return MCDisassembler::Fail; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 51ea3fc5f677..7df1b7e58000 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -1158,8 +1158,8 @@ bool RISCVInstructionSelector::selectAddr(MachineInstr &MI, switch (TM.getCodeModel()) { default: { - reportGISelFailure(const_cast<MachineFunction &>(*MF), *TPC, *MORE, - getName(), 
"Unsupported code model for lowering", MI); + reportGISelFailure(*MF, *TPC, *MORE, getName(), + "Unsupported code model for lowering", MI); return false; } case CodeModel::Small: { diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index e88f33d6859e..564657ac65fd 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -26,6 +26,8 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/IR/Type.h" using namespace llvm; @@ -152,7 +154,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder({G_SADDO, G_SSUBO}).minScalar(0, sXLen).lower(); // TODO: Use Vector Single-Width Saturating Instructions for vector types. - getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}) + getActionDefinitionsBuilder( + {G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT, G_SSHLSAT, G_USHLSAT}) .lower(); getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) @@ -485,6 +488,10 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) .minScalar(ST.hasStdExtZbb(), 0, sXLen) .lower(); + getActionDefinitionsBuilder({G_ABDS, G_ABDU}) + .minScalar(ST.hasStdExtZbb(), 0, sXLen) + .lower(); + getActionDefinitionsBuilder({G_UMAX, G_UMIN, G_SMAX, G_SMIN}) .legalFor(ST.hasStdExtZbb(), {sXLen}) .minScalar(ST.hasStdExtZbb(), 0, sXLen) @@ -692,6 +699,16 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) .customIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST), typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST))); + getActionDefinitionsBuilder(G_ATOMICRMW_ADD) + .legalFor(ST.hasStdExtA(), {{sXLen, p0}}) + .libcallFor(!ST.hasStdExtA(), {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}}) + .clampScalar(0, sXLen, sXLen); + + getActionDefinitionsBuilder(G_ATOMICRMW_SUB) 
+ .libcallFor(!ST.hasStdExtA(), {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}}) + .clampScalar(0, sXLen, sXLen) + .lower(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } @@ -729,6 +746,9 @@ bool RISCVLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.eraseFromParent(); return true; } + case Intrinsic::riscv_masked_atomicrmw_add: + case Intrinsic::riscv_masked_atomicrmw_sub: + return true; } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index 543c4c5ddfc9..37fe32531800 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -36,6 +36,12 @@ RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S, setFlagsFromFeatures(STI); } +RISCVELFStreamer::RISCVELFStreamer(MCContext &C, + std::unique_ptr<MCAsmBackend> MAB, + std::unique_ptr<MCObjectWriter> MOW, + std::unique_ptr<MCCodeEmitter> MCE) + : MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {} + RISCVELFStreamer &RISCVTargetELFStreamer::getStreamer() { return static_cast<RISCVELFStreamer &>(Streamer); } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h index 98948cd3e949..26da2441d4ae 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h @@ -28,8 +28,7 @@ class RISCVELFStreamer : public MCELFStreamer { public: RISCVELFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> MAB, std::unique_ptr<MCObjectWriter> MOW, - std::unique_ptr<MCCodeEmitter> MCE) - : MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {} + std::unique_ptr<MCCodeEmitter> MCE); void changeSection(MCSection *Section, uint32_t Subsection) override; void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; diff --git 
a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index b0c27ce6010f..50f5a5d09a69 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -216,9 +216,12 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); // Print the raw immediate for reserved values: vlmul[2:0]=4, vsew[2:0]=0b1xx, - // or non-zero in bits 8 and above. + // altfmt=1 without zvfbfa extension, or non-zero in bits 9 and above. if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED || - RISCVVType::getSEW(Imm) > 64 || (Imm >> 8) != 0) { + RISCVVType::getSEW(Imm) > 64 || + (RISCVVType::isAltFmt(Imm) && + !STI.hasFeature(RISCV::FeatureStdExtZvfbfa)) || + (Imm >> 9) != 0) { O << formatImm(Imm); return; } diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 83566b1c5778..66ca43604670 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -126,7 +126,7 @@ private: void LowerPATCHABLE_TAIL_CALL(const MachineInstr *MI); void emitSled(const MachineInstr *MI, SledKind Kind); - bool lowerToMCInst(const MachineInstr *MI, MCInst &OutMI); + void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI); }; } @@ -329,12 +329,17 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) { case TargetOpcode::STATEPOINT: return LowerSTATEPOINT(*OutStreamer, SM, *MI); case TargetOpcode::PATCHABLE_FUNCTION_ENTER: { - // patchable-function-entry is handled in lowerToMCInst - // Therefore, we break out of the switch statement if we encounter it here. 
const Function &F = MI->getParent()->getParent()->getFunction(); - if (F.hasFnAttribute("patchable-function-entry")) - break; - + if (F.hasFnAttribute("patchable-function-entry")) { + unsigned Num; + [[maybe_unused]] bool Result = + F.getFnAttribute("patchable-function-entry") + .getValueAsString() + .getAsInteger(10, Num); + assert(!Result && "Enforced by the verifier"); + emitNops(Num); + return; + } LowerPATCHABLE_FUNCTION_ENTER(MI); return; } @@ -347,8 +352,8 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) { } MCInst OutInst; - if (!lowerToMCInst(MI, OutInst)) - EmitToStreamer(*OutStreamer, OutInst); + lowerToMCInst(MI, OutInst); + EmitToStreamer(*OutStreamer, OutInst); } bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, @@ -1174,9 +1179,9 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI, return true; } -bool RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { +void RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { if (lowerRISCVVMachineInstrToMCInst(MI, OutMI, STI)) - return false; + return; OutMI.setOpcode(MI->getOpcode()); @@ -1185,23 +1190,6 @@ bool RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { if (lowerOperand(MO, MCOp)) OutMI.addOperand(MCOp); } - - switch (OutMI.getOpcode()) { - case TargetOpcode::PATCHABLE_FUNCTION_ENTER: { - const Function &F = MI->getParent()->getParent()->getFunction(); - if (F.hasFnAttribute("patchable-function-entry")) { - unsigned Num; - if (F.getFnAttribute("patchable-function-entry") - .getValueAsString() - .getAsInteger(10, Num)) - return false; - emitNops(Num); - return true; - } - break; - } - } - return false; } void RISCVAsmPrinter::emitMachineConstantPoolValue( diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 3b738e4cc11a..063963d4ec36 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -680,6 
+680,13 @@ def FeatureStdExtV [FeatureStdExtZvl128b, FeatureStdExtZve64d]>, RISCVExtensionBitmask<0, 21>; +def FeatureStdExtZvfbfa + : RISCVExperimentalExtension<0, 1, "Additional BF16 vector compute support", + [FeatureStdExtZve32f, FeatureStdExtZfbfmin]>; +def HasStdExtZvfbfa : Predicate<"Subtarget->hasStdExtZvfbfa()">, + AssemblerPredicate<(all_of FeatureStdExtZvfbfa), + "'Zvfbfa' (Additional BF16 vector compute support)">; + def FeatureStdExtZvfbfmin : RISCVExtension<1, 0, "Vector BF16 Converts", [FeatureStdExtZve32f]>; def HasStdExtZvfbfmin : Predicate<"Subtarget->hasStdExtZvfbfmin()">, @@ -1396,20 +1403,27 @@ def HasVendorXMIPSCMov AssemblerPredicate<(all_of FeatureVendorXMIPSCMov), "'Xmipscmov' ('mips.ccmov' instruction)">; def UseCCMovInsn : Predicate<"Subtarget->useCCMovInsn()">; + def FeatureVendorXMIPSLSP : RISCVExtension<1, 0, "MIPS optimization for hardware load-store bonding">; def HasVendorXMIPSLSP : Predicate<"Subtarget->hasVendorXMIPSLSP()">, AssemblerPredicate<(all_of FeatureVendorXMIPSLSP), "'Xmipslsp' (load and store pair instructions)">; -def FeatureVendorXMIPSCBOP - : RISCVExtension<1, 0, "MIPS Software Prefetch">; + +def FeatureVendorXMIPSCBOP : RISCVExtension<1, 0, "MIPS Software Prefetch">; def HasVendorXMIPSCBOP : Predicate<"Subtarget->hasVendorXMIPSCBOP()">, AssemblerPredicate<(all_of FeatureVendorXMIPSCBOP), "'Xmipscbop' (MIPS hardware prefetch)">; def NoVendorXMIPSCBOP : Predicate<"!Subtarget->hasVendorXMIPSCBOP()">;
+def FeatureVendorXMIPSEXECTL : RISCVExtension<1, 0, "MIPS execution control">;
+// The predicate queries the subtarget getter named after the feature, in the
+// same way as the other Xmips* predicates in this file.
+def HasVendorXMIPSEXECTL
+    : Predicate<"Subtarget->hasVendorXMIPSEXECTL()">,
+      AssemblerPredicate<(all_of FeatureVendorXMIPSEXECTL),
+                         "'Xmipsexectl' (MIPS execution control)">;
+
 // WCH / Nanjing Qinheng Microelectronics Extension(s) def FeatureVendorXwchc @@ -1668,7 +1682,7 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">, "RV32I Base Instruction Set">; defvar RV32 = DefaultMode; -def RV64 : HwMode<"+64bit", [IsRV64]>; +def RV64 :
HwMode<[IsRV64]>; def FeatureRelax : SubtargetFeature<"relax", "EnableLinkerRelax", "true", diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 9fc0d815ceee..06ce91771c9e 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -106,8 +106,14 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL) { const auto &STI = MF.getSubtarget<RISCVSubtarget>(); + // We check Zimop instead of (Zimop || Zcmop) to determine whether HW shadow + // stack is available despite the fact that sspush/sspopchk both have a + // compressed form, because if only Zcmop is available, we would need to + // reserve X5 due to c.sspopchk only takes X5 and we currently do not support + // using X5 as the return address register. + // However, we can still aggressively use c.sspush x1 if zcmop is available. bool HasHWShadowStack = MF.getFunction().hasFnAttribute("hw-shadow-stack") && - STI.hasStdExtZicfiss(); + STI.hasStdExtZimop(); bool HasSWShadowStack = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack); if (!HasHWShadowStack && !HasSWShadowStack) @@ -124,7 +130,12 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, const RISCVInstrInfo *TII = STI.getInstrInfo(); if (HasHWShadowStack) { - BuildMI(MBB, MI, DL, TII->get(RISCV::SSPUSH)).addReg(RAReg); + if (STI.hasStdExtZcmop()) { + static_assert(RAReg == RISCV::X1, "C.SSPUSH only accepts X1"); + BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_C_SSPUSH)); + } else { + BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_SSPUSH)).addReg(RAReg); + } return; } @@ -172,7 +183,7 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, const DebugLoc &DL) { const auto &STI = MF.getSubtarget<RISCVSubtarget>(); bool HasHWShadowStack = MF.getFunction().hasFnAttribute("hw-shadow-stack") && - STI.hasStdExtZicfiss(); + 
STI.hasStdExtZimop(); bool HasSWShadowStack = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack); if (!HasHWShadowStack && !HasSWShadowStack) @@ -186,7 +197,7 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, const RISCVInstrInfo *TII = STI.getInstrInfo(); if (HasHWShadowStack) { - BuildMI(MBB, MI, DL, TII->get(RISCV::SSPOPCHK)).addReg(RAReg); + BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_SSPOPCHK)).addReg(RAReg); return; } diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 80a48c5ec11f..52dc53e4545e 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -561,7 +561,7 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) { EVL = Builder.CreateElementCount( Builder.getInt32Ty(), cast<VectorType>(DataType)->getElementCount()); - CallInst *Call; + Value *Call; if (!StoreVal) { Call = Builder.CreateIntrinsic( @@ -571,8 +571,7 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) { // Merge llvm.masked.gather's passthru if (II->getIntrinsicID() == Intrinsic::masked_gather) - Call = Builder.CreateIntrinsic(Intrinsic::vp_select, {DataType}, - {Mask, Call, II->getArgOperand(3), EVL}); + Call = Builder.CreateSelect(Mask, Call, II->getArgOperand(3)); } else Call = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_store, diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index f9f35f66319b..c7f15415ebb9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -819,49 +819,6 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { return false; } -// (xor X, (and (xor X, C1), C2)) -// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt) -// where C2 is a shifted mask with width=Width and shift=ShAmt -bool 
RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) { - - if (!Subtarget->hasVendorXqcibm()) - return false; - - using namespace SDPatternMatch; - - SDValue X; - APInt CImm, CMask; - if (!sd_match( - Node, - m_Xor(m_Value(X), - m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))), - m_ConstInt(CMask)))))) - return false; - - unsigned Width, ShAmt; - if (!CMask.isShiftedMask(ShAmt, Width)) - return false; - - int64_t Imm = CImm.getSExtValue(); - Imm >>= ShAmt; - - SDLoc DL(Node); - SDValue ImmNode; - auto Opc = RISCV::QC_INSB; - - if (isInt<5>(Imm)) { - Opc = RISCV::QC_INSBI; - ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32); - } else { - ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget); - } - SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32), - CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)}; - ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops)); - - return true; -} - bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, @@ -1095,7 +1052,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { SDLoc DL(Node); MVT VT = Node->getSimpleValueType(0); - bool HasBitTest = Subtarget->hasStdExtZbs() || Subtarget->hasVendorXTHeadBs(); + bool HasBitTest = Subtarget->hasBEXTILike(); switch (Opcode) { case ISD::Constant: { @@ -1442,9 +1399,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (tryShrinkShlLogicImm(Node)) return; - if (tryBitfieldInsertOpFromXor(Node)) - return; - break; case ISD::AND: { auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); @@ -2951,6 +2905,65 @@ static bool isWorthFoldingAdd(SDValue Add) { return true; } +bool isRegImmLoadOrStore(SDNode *User, SDValue Add) { + switch (User->getOpcode()) { + default: + return false; + case ISD::LOAD: + case RISCVISD::LD_RV32: + case ISD::ATOMIC_LOAD: + break; + case ISD::STORE: + // Don't allow stores of Add. It must only be used as the address. 
+ if (cast<StoreSDNode>(User)->getValue() == Add) + return false; + break; + case RISCVISD::SD_RV32: + // Don't allow stores of Add. It must only be used as the address. + if (User->getOperand(0) == Add || User->getOperand(1) == Add) + return false; + break; + case ISD::ATOMIC_STORE: + // Don't allow stores of Add. It must only be used as the address. + if (cast<AtomicSDNode>(User)->getVal() == Add) + return false; + break; + } + + return true; +} + +// To prevent SelectAddrRegImm from folding offsets that conflict with the +// fusion of PseudoMovAddr, check if the offset of every use of a given address +// is within the alignment. +bool RISCVDAGToDAGISel::areOffsetsWithinAlignment(SDValue Addr, + Align Alignment) { + assert(Addr->getOpcode() == RISCVISD::ADD_LO); + for (auto *User : Addr->users()) { + // If the user is a load or store, then the offset is 0 which is always + // within alignment. + if (isRegImmLoadOrStore(User, Addr)) + continue; + + if (CurDAG->isBaseWithConstantOffset(SDValue(User, 0))) { + int64_t CVal = cast<ConstantSDNode>(User->getOperand(1))->getSExtValue(); + if (!isInt<12>(CVal) || Alignment <= CVal) + return false; + + // Make sure all uses are foldable load/stores. + for (auto *AddUser : User->users()) + if (!isRegImmLoadOrStore(AddUser, SDValue(User, 0))) + return false; + + continue; + } + + return false; + } + + return true; +} + bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset) { if (SelectAddrFrameIndex(Addr, Base, Offset)) @@ -2960,9 +2973,21 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, MVT VT = Addr.getSimpleValueType(); if (Addr.getOpcode() == RISCVISD::ADD_LO) { - Base = Addr.getOperand(0); - Offset = Addr.getOperand(1); - return true; + bool CanFold = true; + // Unconditionally fold if operand 1 is not a global address (e.g. 
+ // externsymbol) + if (auto *GA = dyn_cast<GlobalAddressSDNode>(Addr.getOperand(1))) { + const DataLayout &DL = CurDAG->getDataLayout(); + Align Alignment = commonAlignment( + GA->getGlobal()->getPointerAlignment(DL), GA->getOffset()); + if (!areOffsetsWithinAlignment(Addr, Alignment)) + CanFold = false; + } + if (CanFold) { + Base = Addr.getOperand(0); + Offset = Addr.getOperand(1); + return true; + } } if (CurDAG->isBaseWithConstantOffset(Addr)) { @@ -2980,7 +3005,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, const DataLayout &DL = CurDAG->getDataLayout(); Align Alignment = commonAlignment( GA->getGlobal()->getPointerAlignment(DL), GA->getOffset()); - if ((CVal == 0 || Alignment > CVal)) { + if ((CVal == 0 || Alignment > CVal) && + areOffsetsWithinAlignment(Base, Alignment)) { int64_t CombinedOffset = CVal + GA->getOffset(); Base = Base.getOperand(0); Offset = CurDAG->getTargetGlobalAddress( @@ -3983,6 +4009,15 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits, if (Use.getOperandNo() == 0 && Bits >= 32) break; return false; + case RISCV::TH_EXT: + case RISCV::TH_EXTU: { + unsigned Msb = User->getConstantOperandVal(1); + unsigned Lsb = User->getConstantOperandVal(2); + // Behavior of Msb < Lsb is not well documented. 
+ if (Msb >= Lsb && Bits > Msb) + break; + return false; + } } } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index c329a4c6ec62..cf2f763abc06 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -45,6 +45,8 @@ public: InlineAsm::ConstraintCode ConstraintID, std::vector<SDValue> &OutOps) override; + bool areOffsetsWithinAlignment(SDValue Addr, Align Alignment); + bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -75,7 +77,6 @@ public: bool trySignedBitfieldExtract(SDNode *Node); bool trySignedBitfieldInsertInSign(SDNode *Node); bool trySignedBitfieldInsertInMask(SDNode *Node); - bool tryBitfieldInsertOpFromXor(SDNode *Node); bool tryBitfieldInsertOpFromOrAndImm(SDNode *Node); bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, unsigned Lsb); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a33224845e2b..a68a3c14dc41 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2173,7 +2173,7 @@ bool RISCVTargetLowering::isMaskAndCmp0FoldingBeneficial( // on the basis that it's possible the sinking+duplication of the AND in // CodeGenPrepare triggered by this hook wouldn't decrease the instruction // count and would increase code size (e.g. ANDI+BNEZ => BEXTI+BNEZ). - if (!Subtarget.hasStdExtZbs() && !Subtarget.hasVendorXTHeadBs()) + if (!Subtarget.hasBEXTILike()) return false; ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); if (!Mask) @@ -3744,9 +3744,11 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, // different // FIXME: Support i1 vectors, maybe by promoting to i8? 
MVT EltTy = VT.getVectorElementType(); + if (EltTy == MVT::i1 || + !DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType())) + return SDValue(); MVT SrcVT = Src.getSimpleValueType(); - if (EltTy == MVT::i1 || EltTy != SrcVT.getVectorElementType() || - !DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) + if (EltTy != SrcVT.getVectorElementType()) return SDValue(); SDValue Idx = SplatVal.getOperand(1); // The index must be a legal type. @@ -4518,41 +4520,104 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC; + // General case: splat the first operand and slide other operands down one + // by one to form a vector. Alternatively, if every operand is an + // extraction from element 0 of a vector, we use that vector from the last + // extraction as the start value and slide up instead of slide down. Such that + // (1) we can avoid the initial splat (2) we can turn those vslide1up into + // vslideup of 1 later and eliminate the vector to scalar movement, which is + // something we cannot do with vslide1down/vslidedown. + // Of course, using vslide1up/vslideup might increase the register pressure, + // and that's why we conservatively limit to cases where every operand is an + // extraction from the first element. + SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end()); + SDValue EVec; + bool SlideUp = false; + auto getVSlide = [&](EVT ContainerVT, SDValue Passthru, SDValue Vec, + SDValue Offset, SDValue Mask, SDValue VL) -> SDValue { + if (SlideUp) + return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset, + Mask, VL, Policy); + return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset, + Mask, VL, Policy); + }; + + // The reason we don't use all_of here is because we're also capturing EVec + // from the last non-undef operand. 
If the std::execution_policy of the + // underlying std::all_of is anything but std::sequenced_policy we might + // capture the wrong EVec. + for (SDValue V : Operands) { + using namespace SDPatternMatch; + SlideUp = V.isUndef() || sd_match(V, m_ExtractElt(m_Value(EVec), m_Zero())); + if (!SlideUp) + break; + } + + if (SlideUp) { + MVT EVecContainerVT = EVec.getSimpleValueType(); + // Make sure the original vector has scalable vector type. + if (EVecContainerVT.isFixedLengthVector()) { + EVecContainerVT = + getContainerForFixedLengthVector(DAG, EVecContainerVT, Subtarget); + EVec = convertToScalableVector(EVecContainerVT, EVec, DAG, Subtarget); + } + + // Adapt EVec's type into ContainerVT. + if (EVecContainerVT.getVectorMinNumElements() < + ContainerVT.getVectorMinNumElements()) + EVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), EVec, 0); + else + EVec = DAG.getExtractSubvector(DL, ContainerVT, EVec, 0); + + // Reverse the elements as we're going to slide up from the last element. + std::reverse(Operands.begin(), Operands.end()); + } + SDValue Vec; UndefCount = 0; - for (SDValue V : Op->ops()) { + for (SDValue V : Operands) { if (V.isUndef()) { UndefCount++; continue; } - // Start our sequence with a TA splat in the hopes that hardware is able to - // recognize there's no dependency on the prior value of our temporary - // register. + // Start our sequence with either a TA splat or extract source in the + // hopes that hardware is able to recognize there's no dependency on the + // prior value of our temporary register. 
if (!Vec) { - Vec = DAG.getSplatVector(VT, DL, V); - Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget); + if (SlideUp) { + Vec = EVec; + } else { + Vec = DAG.getSplatVector(VT, DL, V); + Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget); + } + UndefCount = 0; continue; } if (UndefCount) { const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT()); - Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT), - Vec, Offset, Mask, VL, Policy); + Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask, + VL); UndefCount = 0; } - auto OpCode = - VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL; + + unsigned Opcode; + if (VT.isFloatingPoint()) + Opcode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL; + else + Opcode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL; + if (!VT.isFloatingPoint()) V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V); - Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec, + Vec = DAG.getNode(Opcode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec, V, Mask, VL); } if (UndefCount) { const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT()); - Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT), - Vec, Offset, Mask, VL, Policy); + Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask, + VL); } return convertFromScalableVector(VT, Vec, DAG, Subtarget); } @@ -8193,6 +8258,13 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, DL, VT, LHS, DAG.getSignedConstant(Imm + 1, DL, OpVT), CCVal); return DAG.getLogicalNOT(DL, SetCC, VT); } + // Lower (setugt X, 2047) as (setne (srl X, 11), 0). 
+ if (CCVal == ISD::SETUGT && Imm == 2047) { + SDValue Shift = DAG.getNode(ISD::SRL, DL, OpVT, LHS, + DAG.getShiftAmountConstant(11, OpVT, DL)); + return DAG.getSetCC(DL, VT, Shift, DAG.getConstant(0, DL, OpVT), + ISD::SETNE); + } } // Not a constant we could handle, swap the operands and condition code to @@ -8815,7 +8887,15 @@ SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, reportFatalUsageError("Unsupported code model for lowering"); case CodeModel::Small: { // Generate a sequence for accessing addresses within the first 2 GiB of - // address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)). + // address space. + if (Subtarget.hasVendorXqcili()) { + // Use QC.E.LI to generate the address, as this is easier to relax than + // LUI/ADDI. + SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); + return DAG.getNode(RISCVISD::QC_E_LI, DL, Ty, Addr); + } + + // This generates the pattern (addi (lui %hi(sym)) %lo(sym)). SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI); SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO); SDValue MNHi = DAG.getNode(RISCVISD::HI, DL, Ty, AddrHi); @@ -9036,8 +9116,12 @@ static std::optional<bool> matchSetCC(SDValue LHS, SDValue RHS, return std::nullopt; } -static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { +static bool isSimm12Constant(SDValue V) { + return isa<ConstantSDNode>(V) && V->getAsAPIntVal().isSignedIntN(12); +} + +static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { SDValue CondV = N->getOperand(0); SDValue TrueV = N->getOperand(1); SDValue FalseV = N->getOperand(2); @@ -9057,14 +9141,16 @@ static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::OR, DL, VT, Neg, DAG.getFreeze(TrueV)); } + const bool HasCZero = VT.isScalarInteger() && Subtarget.hasCZEROLike(); + // (select c, 0, y) -> (c-1) & y - if (isNullConstant(TrueV)) { - SDValue Neg = 
DAG.getNode(ISD::ADD, DL, VT, CondV, - DAG.getAllOnesConstant(DL, VT)); + if (isNullConstant(TrueV) && (!HasCZero || isSimm12Constant(FalseV))) { + SDValue Neg = + DAG.getNode(ISD::ADD, DL, VT, CondV, DAG.getAllOnesConstant(DL, VT)); return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(FalseV)); } // (select c, y, 0) -> -c & y - if (isNullConstant(FalseV)) { + if (isNullConstant(FalseV) && (!HasCZero || isSimm12Constant(TrueV))) { SDValue Neg = DAG.getNegative(CondV, DL, VT); return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(TrueV)); } @@ -9185,12 +9271,16 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::VSELECT, DL, VT, CondSplat, TrueV, FalseV); } + // Try some other optimizations before falling back to generic lowering. + if (SDValue V = lowerSelectToBinOp(Op.getNode(), DAG, Subtarget)) + return V; + // When Zicond or XVentanaCondOps is present, emit CZERO_EQZ and CZERO_NEZ // nodes to implement the SELECT. Performing the lowering here allows for // greater control over when CZERO_{EQZ/NEZ} are used vs another branchless // sequence or RISCVISD::SELECT_CC node (branch-based select). - if ((Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps()) && - VT.isScalarInteger()) { + if (Subtarget.hasCZEROLike() && VT.isScalarInteger()) { + // (select c, t, 0) -> (czero_eqz t, c) if (isNullConstant(FalseV)) return DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV); @@ -9244,10 +9334,6 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV)); } - // Try some other optimizations before falling back to generic lowering. 
- if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget)) - return V; - // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1) // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2) if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV)) { @@ -9280,19 +9366,38 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - const int TrueValCost = RISCVMatInt::getIntMatCost( - TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); - const int FalseValCost = RISCVMatInt::getIntMatCost( - FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); - bool IsCZERO_NEZ = TrueValCost <= FalseValCost; + // Use SHL/ADDI (and possible XORI) to avoid having to materialize + // a constant in register + if ((TrueVal - FalseVal).isPowerOf2() && FalseVal.isSignedIntN(12)) { + SDValue Log2 = DAG.getConstant((TrueVal - FalseVal).logBase2(), DL, VT); + SDValue BitDiff = DAG.getNode(ISD::SHL, DL, VT, CondV, Log2); + return DAG.getNode(ISD::ADD, DL, VT, FalseV, BitDiff); + } + if ((FalseVal - TrueVal).isPowerOf2() && TrueVal.isSignedIntN(12)) { + SDValue Log2 = DAG.getConstant((FalseVal - TrueVal).logBase2(), DL, VT); + CondV = DAG.getLogicalNOT(DL, CondV, CondV->getValueType(0)); + SDValue BitDiff = DAG.getNode(ISD::SHL, DL, VT, CondV, Log2); + return DAG.getNode(ISD::ADD, DL, VT, TrueV, BitDiff); + } + + auto getCost = [&](const APInt &Delta, const APInt &Addend) { + const int DeltaCost = RISCVMatInt::getIntMatCost( + Delta, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + // Does the addend fold into an ADDI + if (Addend.isSignedIntN(12)) + return DeltaCost; + const int AddendCost = RISCVMatInt::getIntMatCost( + Addend, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + return AddendCost + DeltaCost; + }; + bool IsCZERO_NEZ = getCost(FalseVal - TrueVal, TrueVal) <= + getCost(TrueVal - FalseVal, FalseVal); SDValue LHSVal = DAG.getConstant( IsCZERO_NEZ ? 
FalseVal - TrueVal : TrueVal - FalseVal, DL, VT); - SDValue RHSVal = - DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT); SDValue CMOV = DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ, DL, VT, LHSVal, CondV); - return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal); + return DAG.getNode(ISD::ADD, DL, VT, CMOV, IsCZERO_NEZ ? TrueV : FalseV); } // (select c, c1, t) -> (add (czero_nez t - c1, c), c1) @@ -9327,12 +9432,10 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode( ISD::OR, DL, VT, DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV), - DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV, CondV)); + DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV, CondV), + SDNodeFlags::Disjoint); } - if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget)) - return V; - if (Op.hasOneUse()) { unsigned UseOpc = Op->user_begin()->getOpcode(); if (isBinOp(UseOpc) && DAG.isSafeToSpeculativelyExecute(UseOpc)) { @@ -10738,11 +10841,11 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1)); } case Intrinsic::riscv_mopr: - return DAG.getNode(RISCVISD::MOPR, DL, XLenVT, Op.getOperand(1), + return DAG.getNode(RISCVISD::MOP_R, DL, XLenVT, Op.getOperand(1), Op.getOperand(2)); case Intrinsic::riscv_moprr: { - return DAG.getNode(RISCVISD::MOPRR, DL, XLenVT, Op.getOperand(1), + return DAG.getNode(RISCVISD::MOP_RR, DL, XLenVT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } case Intrinsic::riscv_clmul: @@ -14877,7 +14980,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue NewOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); SDValue Res = DAG.getNode( - RISCVISD::MOPR, DL, MVT::i64, NewOp, + RISCVISD::MOP_R, DL, MVT::i64, NewOp, DAG.getTargetConstant(N->getConstantOperandVal(2), DL, MVT::i64)); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); return; @@ -14890,7 +14993,7 @@ void 
RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2)); SDValue Res = DAG.getNode( - RISCVISD::MOPRR, DL, MVT::i64, NewOp0, NewOp1, + RISCVISD::MOP_RR, DL, MVT::i64, NewOp0, NewOp1, DAG.getTargetConstant(N->getConstantOperandVal(3), DL, MVT::i64)); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); return; @@ -15381,9 +15484,7 @@ static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, if (!Subtarget.hasConditionalMoveFusion()) { // (select cond, x, (and x, c)) has custom lowering with Zicond. - if ((!Subtarget.hasStdExtZicond() && - !Subtarget.hasVendorXVentanaCondOps()) || - N->getOpcode() != ISD::AND) + if (!Subtarget.hasCZEROLike() || N->getOpcode() != ISD::AND) return SDValue(); // Maybe harmful when condition code has multiple use. @@ -16059,12 +16160,55 @@ static SDValue combineOrOfCZERO(SDNode *N, SDValue N0, SDValue N1, SDValue NewN0 = DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV.getOperand(0), Cond); - SDValue NewN1 = DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV.getOperand(0), - Cond); - SDValue NewOr = DAG.getNode(ISD::OR, DL, VT, NewN0, NewN1); + SDValue NewN1 = + DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV.getOperand(0), Cond); + SDValue NewOr = + DAG.getNode(ISD::OR, DL, VT, NewN0, NewN1, SDNodeFlags::Disjoint); return DAG.getNode(ISD::XOR, DL, VT, NewOr, TrueV.getOperand(1)); } +// (xor X, (xor (and X, C2), Y)) +// ->(qc_insb X, (sra Y, ShAmt), Width, ShAmt) +// where C2 is a shifted mask with width = Width and shift = ShAmt +// qc_insb might become qc.insb or qc.insbi depending on the operands. 
+static SDValue combineXorToBitfieldInsert(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (!Subtarget.hasVendorXqcibm()) + return SDValue(); + + using namespace SDPatternMatch; + + SDValue Base, Inserted; + APInt CMask; + if (!sd_match(N, m_Xor(m_Value(Base), + m_OneUse(m_Xor(m_OneUse(m_And(m_Deferred(Base), + m_ConstInt(CMask))), + m_Value(Inserted)))))) + return SDValue(); + + if (N->getValueType(0) != MVT::i32) + return SDValue(); + + unsigned Width, ShAmt; + if (!CMask.isShiftedMask(ShAmt, Width)) + return SDValue(); + + // Check if all zero bits in CMask are also zero in Inserted + if (!DAG.MaskedValueIsZero(Inserted, ~CMask)) + return SDValue(); + + SDLoc DL(N); + + // `Inserted` needs to be right shifted before it is put into the + // instruction. + Inserted = DAG.getNode(ISD::SRA, DL, MVT::i32, Inserted, + DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL)); + + SDValue Ops[] = {Base, Inserted, DAG.getConstant(Width, DL, MVT::i32), + DAG.getConstant(ShAmt, DL, MVT::i32)}; + return DAG.getNode(RISCVISD::QC_INSB, DL, MVT::i32, Ops); +} + static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { SelectionDAG &DAG = DCI.DAG; @@ -16108,8 +16252,8 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG, SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N0.getOperand(0)); SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N0.getOperand(1)); SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i64, Op0, Op1); - SDValue And = DAG.getNOT(DL, Shl, MVT::i64); - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, And); + SDValue Not = DAG.getNOT(DL, Shl, MVT::i64); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Not); } // fold (xor (sllw 1, x), -1) -> (rolw ~1, x) @@ -16137,6 +16281,9 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG, } } + if (SDValue V = combineXorToBitfieldInsert(N, DAG, Subtarget)) + return V; + if (SDValue V = combineBinOpToReduce(N, DAG, 
Subtarget)) return V; if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) @@ -16590,10 +16737,6 @@ combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, DAG.getConstant(0, DL, XLenVT), CC); } -// Replace (seteq (i64 (and X, 0xffffffff)), C1) with -// (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from -// bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg -// can become a sext.w instead of a shift pair. static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { @@ -16613,20 +16756,44 @@ static SDValue performSETCCCombine(SDNode *N, combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG, Subtarget)) return V; - // (X & -4096) == 0 -> (X >> 12) == 0 if the AND constant can't use ANDI. - if (DCI.isAfterLegalizeDAG() && isNullConstant(N1) && + if (DCI.isAfterLegalizeDAG() && isa<ConstantSDNode>(N1) && N0.getOpcode() == ISD::AND && N0.hasOneUse() && isa<ConstantSDNode>(N0.getOperand(1))) { - const APInt &AndRHSC = - cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); - if (!isInt<12>(AndRHSC.getSExtValue()) && AndRHSC.isNegatedPowerOf2()) { + const APInt &AndRHSC = N0.getConstantOperandAPInt(1); + // (X & -(1 << C)) == 0 -> (X >> C) == 0 if the AND constant can't use ANDI. + if (isNullConstant(N1) && !isInt<12>(AndRHSC.getSExtValue()) && + AndRHSC.isNegatedPowerOf2()) { unsigned ShiftBits = AndRHSC.countr_zero(); - SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, N0.getOperand(0), - DAG.getConstant(ShiftBits, dl, VT)); + SDValue Shift = DAG.getNode(ISD::SRL, dl, OpVT, N0.getOperand(0), + DAG.getConstant(ShiftBits, dl, OpVT)); return DAG.getSetCC(dl, VT, Shift, N1, Cond); } + + // Similar to above but handling the lower 32 bits by using sraiw. Allow + // comparing with constants other than 0 if the constant can be folded into + // addi or xori after shifting. 
+ uint64_t N1Int = cast<ConstantSDNode>(N1)->getZExtValue(); + uint64_t AndRHSInt = AndRHSC.getZExtValue(); + if (OpVT == MVT::i64 && AndRHSInt <= 0xffffffff && + isPowerOf2_32(-uint32_t(AndRHSInt)) && (N1Int & AndRHSInt) == N1Int) { + unsigned ShiftBits = llvm::countr_zero(AndRHSInt); + int64_t NewC = SignExtend64<32>(N1Int) >> ShiftBits; + if (NewC >= -2048 && NewC <= 2048) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OpVT, N0.getOperand(0), + DAG.getValueType(MVT::i32)); + SDValue Shift = DAG.getNode(ISD::SRA, dl, OpVT, SExt, + DAG.getConstant(ShiftBits, dl, OpVT)); + return DAG.getSetCC(dl, VT, Shift, + DAG.getSignedConstant(NewC, dl, OpVT), Cond); + } + } } + // Replace (seteq (i64 (and X, 0xffffffff)), C1) with + // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from + // bit 31. Same for setne. C1' may be cheaper to materialize and the + // sext_inreg can become a sext.w instead of a shift pair. if (OpVT != MVT::i64 || !Subtarget.is64Bit()) return SDValue(); @@ -18674,7 +18841,7 @@ static SDValue tryFoldSelectIntoOp(SDNode *N, SelectionDAG &DAG, break; } - if (!TrueVal.hasOneUse() || isa<ConstantSDNode>(FalseVal)) + if (!TrueVal.hasOneUse()) return SDValue(); unsigned OpToFold; @@ -18746,6 +18913,10 @@ static SDValue foldSelectOfCTTZOrCTLZ(SDNode *N, SelectionDAG &DAG) { if (Cond->getOperand(0) != CountZeroesArgument) return SDValue(); + unsigned BitWidth = CountZeroes.getValueSizeInBits(); + if (!isPowerOf2_32(BitWidth)) + return SDValue(); + if (CountZeroes.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { CountZeroes = DAG.getNode(ISD::CTTZ, SDLoc(CountZeroes), CountZeroes.getValueType(), CountZeroesArgument); @@ -18754,7 +18925,6 @@ static SDValue foldSelectOfCTTZOrCTLZ(SDNode *N, SelectionDAG &DAG) { CountZeroes.getValueType(), CountZeroesArgument); } - unsigned BitWidth = CountZeroes.getValueSizeInBits(); SDValue BitWidthMinusOne = DAG.getConstant(BitWidth - 1, SDLoc(N), CountZeroes.getValueType()); @@ -18778,7 +18948,7 @@ 
static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG, // Replace (setcc eq (and x, C)) with (setcc ne (and x, C))) to generate // BEXTI, where C is power of 2. if (Subtarget.hasStdExtZbs() && VT.isScalarInteger() && - (Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps())) { + (Subtarget.hasCZEROLike() || Subtarget.hasVendorXTHeadCondMov())) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -18953,6 +19123,7 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const RISCVTargetLowering &TLI) { + using namespace SDPatternMatch; // Note: We intentionally do not check the legality of the reduction type. // We want to handle the m4/m8 *src* types, and thus need to let illegal // intermediate types flow through here. @@ -18960,11 +19131,10 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL, !InVec.getValueType().getVectorElementCount().isKnownMultipleOf(4)) return SDValue(); - // Recurse through adds (since generic dag canonicalizes to that - // form). TODO: Handle disjoint or here. - if (InVec->getOpcode() == ISD::ADD) { - SDValue A = InVec.getOperand(0); - SDValue B = InVec.getOperand(1); + // Recurse through adds/disjoint ors (since generic dag canonicalizes to that + // form). 
+ SDValue A, B; + if (sd_match(InVec, m_AddLike(m_Value(A), m_Value(B)))) { SDValue AOpt = foldReduceOperandViaVQDOT(A, DL, DAG, Subtarget, TLI); SDValue BOpt = foldReduceOperandViaVQDOT(B, DL, DAG, Subtarget, TLI); if (AOpt || BOpt) { @@ -19001,12 +19171,9 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL, // mul (zext a, zext b) -> partial_reduce_umla 0, a, b // mul (sext a, zext b) -> partial_reduce_ssmla 0, a, b // mul (zext a, sext b) -> partial_reduce_smla 0, b, a (swapped) - if (InVec.getOpcode() != ISD::MUL) + if (!sd_match(InVec, m_Mul(m_Value(A), m_Value(B)))) return SDValue(); - SDValue A = InVec.getOperand(0); - SDValue B = InVec.getOperand(1); - if (!ISD::isExtOpcode(A.getOpcode())) return SDValue(); @@ -20081,6 +20248,17 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return V; break; case ISD::FMUL: { + using namespace SDPatternMatch; + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X, Y; + // InstCombine canonicalizes fneg (fmul x, y) -> fmul x, (fneg y), see + // hoistFNegAboveFMulFDiv. + // Undo this and sink the fneg so we match more fmsub/fnmadd patterns. 
+ if (sd_match(N, m_FMul(m_Value(X), m_OneUse(m_FNeg(m_Value(Y)))))) + return DAG.getNode(ISD::FNEG, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, X, Y)); + // fmul X, (copysign 1.0, Y) -> fsgnjx X, Y SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -20091,13 +20269,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0->getOperand(0)); if (!C || !C->getValueAPF().isExactlyValue(+1.0)) return SDValue(); - EVT VT = N->getValueType(0); if (VT.isVector() || !isOperationLegal(ISD::FCOPYSIGN, VT)) return SDValue(); SDValue Sign = N0->getOperand(1); if (Sign.getValueType() != VT) return SDValue(); - return DAG.getNode(RISCVISD::FSGNJX, SDLoc(N), VT, N1, N0->getOperand(1)); + return DAG.getNode(RISCVISD::FSGNJX, DL, VT, N1, N0->getOperand(1)); } case ISD::FADD: case ISD::UMAX: @@ -20381,9 +20558,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, VT, DL, MGN->getChain(), BasePtr, DAG.getSignedConstant(StepNumerator, DL, XLenVT), MGN->getMask(), EVL, MGN->getMemOperand()); - SDValue VPSelect = DAG.getNode(ISD::VP_SELECT, DL, VT, MGN->getMask(), - StridedLoad, MGN->getPassThru(), EVL); - return DAG.getMergeValues({VPSelect, SDValue(StridedLoad.getNode(), 1)}, + SDValue Select = DAG.getSelect(DL, VT, MGN->getMask(), StridedLoad, + MGN->getPassThru()); + return DAG.getMergeValues({Select, SDValue(StridedLoad.getNode(), 1)}, DL); } } @@ -21060,6 +21237,38 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return N->getOperand(0); break; } + case RISCVISD::VSLIDE1UP_VL: + case RISCVISD::VFSLIDE1UP_VL: { + using namespace SDPatternMatch; + SDValue SrcVec; + SDLoc DL(N); + MVT VT = N->getSimpleValueType(0); + // If the scalar we're sliding in was extracted from the first element of a + // vector, we can use that vector as the passthru in a normal slideup of 1. + // This saves us an extract_element instruction (i.e. vfmv.f.s, vmv.x.s). 
+ if (!N->getOperand(0).isUndef() || + !sd_match(N->getOperand(2), + m_AnyOf(m_ExtractElt(m_Value(SrcVec), m_Zero()), + m_Node(RISCVISD::VMV_X_S, m_Value(SrcVec))))) + break; + + MVT SrcVecVT = SrcVec.getSimpleValueType(); + if (SrcVecVT.getVectorElementType() != VT.getVectorElementType()) + break; + // Adapt the value type of source vector. + if (SrcVecVT.isFixedLengthVector()) { + SrcVecVT = getContainerForFixedLengthVector(SrcVecVT); + SrcVec = convertToScalableVector(SrcVecVT, SrcVec, DAG, Subtarget); + } + if (SrcVecVT.getVectorMinNumElements() < VT.getVectorMinNumElements()) + SrcVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(VT), SrcVec, 0); + else + SrcVec = DAG.getExtractSubvector(DL, VT, SrcVec, 0); + + return getVSlideup(DAG, Subtarget, DL, VT, SrcVec, N->getOperand(1), + DAG.getConstant(1, DL, XLenVT), N->getOperand(3), + N->getOperand(4)); + } } return SDValue(); @@ -21120,9 +21329,14 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift( auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1)); auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); - // Bail if we might break a sh{1,2,3}add pattern. - if ((Subtarget.hasStdExtZba() || Subtarget.hasVendorXAndesPerf()) && C2 && - C2->getZExtValue() >= 1 && C2->getZExtValue() <= 3 && N->hasOneUse() && + bool IsShXAdd = + (Subtarget.hasStdExtZba() || Subtarget.hasVendorXAndesPerf()) && C2 && + C2->getZExtValue() >= 1 && C2->getZExtValue() <= 3; + bool IsQCShlAdd = Subtarget.hasVendorXqciac() && C2 && + C2->getZExtValue() >= 4 && C2->getZExtValue() <= 31; + + // Bail if we might break a sh{1,2,3}add/qc.shladd pattern. 
+ if ((IsShXAdd || IsQCShlAdd) && N->hasOneUse() && N->user_begin()->getOpcode() == ISD::ADD && !isUsedByLdSt(*N->user_begin(), nullptr) && !isa<ConstantSDNode>(N->user_begin()->getOperand(1))) @@ -21346,6 +21560,24 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.sext(BitWidth); break; } + case RISCVISD::SRLW: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known = KnownBits::lshr(Known.trunc(32), Known2.trunc(5).zext(32)); + // Restore the original width by sign extending. + Known = Known.sext(BitWidth); + break; + } + case RISCVISD::SRAW: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known = KnownBits::ashr(Known.trunc(32), Known2.trunc(5).zext(32)); + // Restore the original width by sign extending. + Known = Known.sext(BitWidth); + break; + } case RISCVISD::CTZW: { KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); unsigned PossibleTZ = Known2.trunc(32).countMaxTrailingZeros(); @@ -21451,8 +21683,16 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( if (Tmp < 33) return 1; return 33; } + case RISCVISD::SRAW: { + unsigned Tmp = + DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); + // sraw produces at least 33 sign bits. If the input already has more than + // 33 sign bits, sraw will preserve them. + // TODO: A more precise answer could be calculated depending on known bits + // in the shift amount. 
+ return std::max(Tmp, 33U); + } case RISCVISD::SLLW: - case RISCVISD::SRAW: case RISCVISD::SRLW: case RISCVISD::DIVW: case RISCVISD::DIVUW: @@ -21463,9 +21703,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( case RISCVISD::FCVT_WU_RV64: case RISCVISD::STRICT_FCVT_W_RV64: case RISCVISD::STRICT_FCVT_WU_RV64: - // TODO: As the result is sign-extended, this is conservatively correct. A - // more precise answer could be calculated for SRAW depending on known - // bits in the shift amount. + // TODO: As the result is sign-extended, this is conservatively correct. return 33; case RISCVISD::VMV_X_S: { // The number of sign bits of the scalar result is computed by obtaining the @@ -21548,6 +21786,14 @@ bool RISCVTargetLowering::canCreateUndefOrPoisonForTargetNode( // TODO: Add more target nodes. switch (Op.getOpcode()) { + case RISCVISD::SLLW: + case RISCVISD::SRAW: + case RISCVISD::SRLW: + case RISCVISD::RORW: + case RISCVISD::ROLW: + // Only the lower 5 bits of RHS are read, guaranteeing the rotate/shift + // amount is in bounds. + return false; case RISCVISD::SELECT_CC: // Integer comparisons cannot create poison. 
assert(Op.getOperand(0).getValueType().isInteger() && @@ -24683,7 +24929,7 @@ RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, bool RISCVTargetLowering::shouldFoldSelectWithSingleBitTest( EVT VT, const APInt &AndMask) const { - if (Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps()) + if (Subtarget.hasCZEROLike()) return !Subtarget.hasStdExtZbs() && AndMask.ugt(1024); return TargetLowering::shouldFoldSelectWithSingleBitTest(VT, AndMask); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index fb63ebcfaace..4581c11356af 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -435,8 +435,8 @@ public: const APInt &GapMask) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, - ShuffleVectorInst *SVI, - unsigned Factor) const override; + ShuffleVectorInst *SVI, unsigned Factor, + const APInt &GapMask) const override; bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override; diff --git a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp index 43621b8f0f33..9664ab345dcb 100644 --- a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp +++ b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// The pass adds LPAD (AUIPC with rs1 = X0) machine instructions at the +// The pass adds LPAD (AUIPC with rd = X0) machine instructions at the // beginning of each basic block or function that is referenced by an indirect // jump/call instruction. 
// diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td index 209c3fae63f4..4c7cd05723ac 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td @@ -54,7 +54,6 @@ class RVInst16CSS<bits<3> funct3, bits<2> opcode, dag outs, dag ins, : RVInst16<outs, ins, opcodestr, argstr, [], InstFormatCSS> { bits<10> imm; bits<5> rs2; - bits<5> rs1; let Inst{15-13} = funct3; let Inst{12-7} = imm{5-0}; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 7b4a1de16769..d0bb57a3eaa1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -80,8 +80,8 @@ namespace llvm::RISCV { } // end namespace llvm::RISCV -RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI) - : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), +RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI) + : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), STI(STI) {} #define GET_INSTRINFO_HELPERS @@ -3511,6 +3511,9 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI, return outliner::InstrType::Illegal; } + if (isLPAD(MI)) + return outliner::InstrType::Illegal; + return outliner::InstrType::Legal; } @@ -4796,8 +4799,22 @@ unsigned RISCV::getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW) { return Scaled; } -/// Given two VL operands, do we know that LHS <= RHS? +static std::optional<int64_t> getEffectiveImm(const MachineOperand &MO) { + assert(MO.isImm() || MO.getReg().isVirtual()); + if (MO.isImm()) + return MO.getImm(); + const MachineInstr *Def = + MO.getParent()->getMF()->getRegInfo().getVRegDef(MO.getReg()); + int64_t Imm; + if (isLoadImm(Def, Imm)) + return Imm; + return std::nullopt; +} + +/// Given two VL operands, do we know that LHS <= RHS? Must be used in SSA form. 
bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { + assert((LHS.isImm() || LHS.getParent()->getMF()->getRegInfo().isSSA()) && + (RHS.isImm() || RHS.getParent()->getMF()->getRegInfo().isSSA())); if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() && LHS.getReg() == RHS.getReg()) return true; @@ -4807,9 +4824,11 @@ bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { return true; if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel) return false; - if (!LHS.isImm() || !RHS.isImm()) + std::optional<int64_t> LHSImm = getEffectiveImm(LHS), + RHSImm = getEffectiveImm(RHS); + if (!LHSImm || !RHSImm) return false; - return LHS.getImm() <= RHS.getImm(); + return LHSImm <= RHSImm; } namespace { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 785c8352d4a5..57ec431749eb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -62,7 +62,7 @@ enum RISCVMachineCombinerPattern : unsigned { class RISCVInstrInfo : public RISCVGenInstrInfo { public: - explicit RISCVInstrInfo(RISCVSubtarget &STI); + explicit RISCVInstrInfo(const RISCVSubtarget &STI); MCInst getNop() const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 23f5a848137c..92552b36aa0b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1698,8 +1698,6 @@ let Predicates = [IsRV32] in { def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible } let Predicates = [IsRV64] in { -def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)), - (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>; def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>; } @@ -2330,7 +2328,6 @@ include "RISCVInstrInfoZalasr.td" include "RISCVInstrInfoZimop.td" 
include "RISCVInstrInfoZicbo.td" include "RISCVInstrInfoZicond.td" -include "RISCVInstrInfoZicfiss.td" include "RISCVInstrInfoZilsd.td" // Scalar FP @@ -2359,6 +2356,9 @@ include "RISCVInstrInfoZc.td" include "RISCVInstrInfoZcmop.td" include "RISCVInstrInfoZclsd.td" +// Control Flow Integrity, this requires Zimop/Zcmop +include "RISCVInstrInfoZicfiss.td" + // Short Forward Branch include "RISCVInstrInfoSFB.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index c5551fbdec28..9fc73662d970 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -230,13 +230,17 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in class CStackLoad<bits<3> funct3, string OpcodeStr, DAGOperand cls, DAGOperand opnd> : RVInst16CI<funct3, 0b10, (outs cls:$rd), (ins SPMem:$rs1, opnd:$imm), - OpcodeStr, "$rd, ${imm}(${rs1})">; + OpcodeStr, "$rd, ${imm}(${rs1})"> { + bits<0> rs1; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in class CStackStore<bits<3> funct3, string OpcodeStr, DAGOperand cls, DAGOperand opnd> : RVInst16CSS<funct3, 0b10, (outs), (ins cls:$rs2, SPMem:$rs1, opnd:$imm), - OpcodeStr, "$rs2, ${imm}(${rs1})">; + OpcodeStr, "$rs2, ${imm}(${rs1})"> { + bits<0> rs1; +} let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in class CLoad_ri<bits<3> funct3, string OpcodeStr, @@ -301,14 +305,6 @@ def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd), let Inst{5} = imm{3}; } -let Predicates = [HasStdExtCOrZcd, HasStdExtD] in -def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>, - Sched<[WriteFLD64, ReadFMemBase]> { - bits<8> imm; - let Inst{12-10} = imm{5-3}; - let Inst{6-5} = imm{7-6}; -} - def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00>, Sched<[WriteLDW, ReadMemBase]> { bits<7> imm; @@ -326,16 +322,6 @@ def C_LW_INX : CLoad_ri<0b010, "c.lw", GPRF32C, uimm7_lsb00>, let Inst{5} = imm{6}; } -let DecoderNamespace = "RV32Only", - Predicates = [HasStdExtCOrZcfOrZce, 
HasStdExtF, IsRV32] in -def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>, - Sched<[WriteFLD32, ReadFMemBase]> { - bits<7> imm; - let Inst{12-10} = imm{5-3}; - let Inst{6} = imm{2}; - let Inst{5} = imm{6}; -} - let Predicates = [HasStdExtZca, IsRV64] in def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>, Sched<[WriteLDD, ReadMemBase]> { @@ -344,14 +330,6 @@ def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>, let Inst{6-5} = imm{7-6}; } -let Predicates = [HasStdExtCOrZcd, HasStdExtD] in -def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>, - Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> { - bits<8> imm; - let Inst{12-10} = imm{5-3}; - let Inst{6-5} = imm{7-6}; -} - def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00>, Sched<[WriteSTW, ReadStoreData, ReadMemBase]> { bits<7> imm; @@ -369,16 +347,6 @@ def C_SW_INX : CStore_rri<0b110, "c.sw", GPRF32C, uimm7_lsb00>, let Inst{5} = imm{6}; } -let DecoderNamespace = "RV32Only", - Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in -def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>, - Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> { - bits<7> imm; - let Inst{12-10} = imm{5-3}; - let Inst{6} = imm{2}; - let Inst{5} = imm{6}; -} - let Predicates = [HasStdExtZca, IsRV64] in def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000>, Sched<[WriteSTD, ReadStoreData, ReadMemBase]> { @@ -500,12 +468,6 @@ def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), let Constraints = "$rd = $rd_wb"; } -let Predicates = [HasStdExtCOrZcd, HasStdExtD] in -def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>, - Sched<[WriteFLD64, ReadFMemBase]> { - let Inst{4-2} = imm{8-6}; -} - def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00>, Sched<[WriteLDW, ReadMemBase]> { let Inst{3-2} = imm{7-6}; @@ -517,13 +479,6 @@ def C_LWSP_INX : CStackLoad<0b010, "c.lwsp", GPRF32NoX0, uimm8_lsb00>, let Inst{3-2} = imm{7-6}; } -let DecoderNamespace = "RV32Only", - Predicates = 
[HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in -def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>, - Sched<[WriteFLD32, ReadFMemBase]> { - let Inst{3-2} = imm{7-6}; -} - let Predicates = [HasStdExtZca, IsRV64] in def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000>, Sched<[WriteLDD, ReadMemBase]> { @@ -560,12 +515,6 @@ def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPR:$rd), let Constraints = "$rs1 = $rd"; } -let Predicates = [HasStdExtCOrZcd, HasStdExtD] in -def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>, - Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> { - let Inst{9-7} = imm{8-6}; -} - def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00>, Sched<[WriteSTW, ReadStoreData, ReadMemBase]> { let Inst{8-7} = imm{7-6}; @@ -577,13 +526,6 @@ def C_SWSP_INX : CStackStore<0b110, "c.swsp", GPRF32, uimm8_lsb00>, let Inst{8-7} = imm{7-6}; } -let DecoderNamespace = "RV32Only", - Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in -def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>, - Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> { - let Inst{8-7} = imm{7-6}; -} - let Predicates = [HasStdExtZca, IsRV64] in def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000>, Sched<[WriteSTD, ReadStoreData, ReadMemBase]> { @@ -600,6 +542,61 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther>, } // Predicates = [HasStdExtZca] +let DecoderNamespace = "RV32Only", + Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in { + def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>, + Sched<[WriteFLD32, ReadFMemBase]> { + bits<7> imm; + let Inst{12-10} = imm{5-3}; + let Inst{6} = imm{2}; + let Inst{5} = imm{6}; + } + + def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>, + Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> { + bits<7> imm; + let Inst{12-10} = imm{5-3}; + let Inst{6} = imm{2}; + let Inst{5} = imm{6}; + } + + def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>, 
+ Sched<[WriteFLD32, ReadFMemBase]> { + let Inst{3-2} = imm{7-6}; + } + + def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>, + Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> { + let Inst{8-7} = imm{7-6}; + } +} // DecoderNamespace = "RV32Only", Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] + +let Predicates = [HasStdExtCOrZcd, HasStdExtD] in { + def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>, + Sched<[WriteFLD64, ReadFMemBase]> { + bits<8> imm; + let Inst{12-10} = imm{5-3}; + let Inst{6-5} = imm{7-6}; + } + + def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>, + Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> { + bits<8> imm; + let Inst{12-10} = imm{5-3}; + let Inst{6-5} = imm{7-6}; + } + + def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>, + Sched<[WriteFLD64, ReadFMemBase]> { + let Inst{4-2} = imm{8-6}; + } + + def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>, + Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> { + let Inst{9-7} = imm{8-6}; + } +} // Predicates = [HasStdExtCOrZcd, HasStdExtD] in { + //===----------------------------------------------------------------------===// // HINT Instructions //===----------------------------------------------------------------------===// @@ -767,20 +764,17 @@ def : InstAlias<".insn_cj $opcode, $funct3, $imm11", // Compress Instruction tablegen backend. //===----------------------------------------------------------------------===// -// Patterns are defined in the same order the compressed instructions appear +// Zca patterns are defined in the same order the compressed instructions appear // under the "RVC Instruction Set Listings" section of the ISA manual. 
+// Zca Instructions + // Quadrant 0 let Predicates = [HasStdExtZca] in { def : CompressPat<(ADDI GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm), (C_ADDI4SPN GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm)>; } // Predicates = [HasStdExtZca] -let Predicates = [HasStdExtCOrZcd, HasStdExtD] in { -def : CompressPat<(FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm), - (C_FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>; -} // Predicates = [HasStdExtCOrZcd, HasStdExtD] - let Predicates = [HasStdExtZca] in { def : CompressPat<(LW GPRC:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm), (C_LW GPRC:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>; @@ -790,21 +784,11 @@ def : CompressPat<(LW_INX GPRF32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm), (C_LW_INX GPRF32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>; } // Predicates = [HasStdExtZca] -let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in { -def : CompressPat<(FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm), - (C_FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>; -} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] - let Predicates = [HasStdExtZca, IsRV64] in { def : CompressPat<(LD GPRC:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm), (C_LD GPRC:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>; } // Predicates = [HasStdExtZca, IsRV64] -let Predicates = [HasStdExtCOrZcd, HasStdExtD] in { -def : CompressPat<(FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm), - (C_FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>; -} // Predicates = [HasStdExtCOrZcd, HasStdExtD] - let Predicates = [HasStdExtZca] in { def : CompressPat<(SW GPRC:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm), (C_SW GPRC:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>; @@ -814,11 +798,6 @@ def : CompressPat<(SW_INX GPRF32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm), (C_SW_INX GPRF32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>; } // Predicates = [HasStdExtZca] -let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in { -def : CompressPat<(FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm), - (C_FSW FPR32C:$rs2, 
GPRCMem:$rs1, uimm7_lsb00:$imm)>; -} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] - let Predicates = [HasStdExtZca, IsRV64] in { def : CompressPat<(SD GPRC:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm), (C_SD GPRC:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>; @@ -907,11 +886,6 @@ def : CompressPat<(SLLI GPRNoX0:$rs1, GPRNoX0:$rs1, uimmlog2xlennonzero:$imm), (C_SLLI GPRNoX0:$rs1, uimmlog2xlennonzero:$imm)>; } // Predicates = [HasStdExtZca] -let Predicates = [HasStdExtCOrZcd, HasStdExtD] in { -def : CompressPat<(FLD FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm), - (C_FLDSP FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>; -} // Predicates = [HasStdExtCOrZcd, HasStdExtD] - let Predicates = [HasStdExtZca] in { def : CompressPat<(LW GPRNoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm), (C_LWSP GPRNoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>; @@ -921,11 +895,6 @@ def : CompressPat<(LW_INX GPRF32NoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm), (C_LWSP_INX GPRF32NoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>; } // Predicates = [HasStdExtZca] -let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in { -def : CompressPat<(FLW FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm), - (C_FLWSP FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>; -} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] - let Predicates = [HasStdExtZca, IsRV64] in { def : CompressPat<(LD GPRNoX0:$rd, SPMem:$rs1, uimm9_lsb000:$imm), (C_LDSP GPRNoX0:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>; @@ -953,11 +922,6 @@ def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs1), (C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>; } // Predicates = [HasStdExtZca] -let Predicates = [HasStdExtCOrZcd, HasStdExtD] in { -def : CompressPat<(FSD FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm), - (C_FSDSP FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>; -} // Predicates = [HasStdExtCOrZcd, HasStdExtD] - let Predicates = [HasStdExtZca] in { def : CompressPat<(SW GPR:$rs2, SPMem:$rs1, uimm8_lsb00:$imm), (C_SWSP GPR:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>; @@ -967,12 +931,38 @@ def : 
CompressPat<(SW_INX GPRF32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm), (C_SWSP_INX GPRF32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>; } // Predicates = [HasStdExtZca] -let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in { -def : CompressPat<(FSW FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm), - (C_FSWSP FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>; -} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] - let Predicates = [HasStdExtZca, IsRV64] in { def : CompressPat<(SD GPR:$rs2, SPMem:$rs1, uimm9_lsb000:$imm), (C_SDSP GPR:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>; } // Predicates = [HasStdExtZca, IsRV64] + +// Zcf Instructions +let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in { + // Quadrant 0 + def : CompressPat<(FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm), + (C_FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>; + def : CompressPat<(FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm), + (C_FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>; + + // Quadrant 2 + def : CompressPat<(FLW FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm), + (C_FLWSP FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>; + def : CompressPat<(FSW FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm), + (C_FSWSP FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>; +} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] + +// Zcd Instructions +let Predicates = [HasStdExtCOrZcd, HasStdExtD] in { + // Quadrant 0 + def : CompressPat<(FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm), + (C_FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>; + def : CompressPat<(FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm), + (C_FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>; + + // Quadrant 2 + def : CompressPat<(FLD FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm), + (C_FLDSP FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>; + def : CompressPat<(FSD FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm), + (C_FSDSP FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>; +} // Predicates = [HasStdExtCOrZcd, HasStdExtD] + diff --git 
a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index c342b41e41d0..6840dacaea54 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -25,7 +25,7 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { } // A 8-bit signed immediate allowing range [-128, 255] -// but represented as [-128, 255]. +// but represented as [-128, 127]. def simm8_unsigned : RISCVOp { let ParserMatchClass = SImm8UnsignedAsmOperand; let EncoderMethod = "getImmOpValue"; @@ -98,6 +98,40 @@ class PLUI_i<bits<7> funct7, string opcodestr> let Inst{23-15} = imm10{9-1}; } +// Common base for widening Binary/Ternary ops +class RVPWideningBase<bits<2> w, bit arith_shift, dag outs, dag ins, + string opcodestr> + : RVInst<outs, ins, opcodestr, "$rd, $rs1, $rs2", [], InstFormatOther> { + bits<5> rs2; + bits<5> rs1; + bits<5> rd; + + let Inst{31} = 0b0; + let Inst{26-25} = w; + let Inst{24-20} = rs2; + let Inst{19-15} = rs1; + let Inst{14-12} = 0b010; + let Inst{11-8} = rd{4-1}; + let Inst{7} = arith_shift; + let Inst{6-0} = OPC_OP_IMM_32.Value; +} + +// Common base for narrowing ops +class RVPNarrowingBase<bits<3> f, bit r, bits<4> funct4, dag outs, dag ins, + string opcodestr, string argstr> + : RVInst<outs, ins, opcodestr, argstr, [], InstFormatOther> { + bits<5> rs1; + bits<5> rd; + + let Inst{31} = 0b0; + let Inst{30-28} = f; + let Inst{27} = r; + let Inst{19-16} = rs1{4-1}; + let Inst{15-12} = funct4; + let Inst{11-7} = rd; + let Inst{6-0} = OPC_OP_IMM_32.Value; +} + let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType> : RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd), @@ -141,6 +175,100 @@ class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr> } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPWideningShift_ri<bits<3> f, string opcodestr, Operand ImmType> + : RVInst<(outs GPRPairRV32:$rd), (ins 
GPR:$rs1, ImmType:$shamt), opcodestr, + "$rd, $rs1, $shamt", [], InstFormatOther> { + bits<5> rs1; + bits<5> rd; + + let Inst{31} = 0b0; + let Inst{30-28} = f; + let Inst{27} = 0b0; + let Inst{19-15} = rs1; + let Inst{14-12} = 0b010; + let Inst{11-8} = rd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_OP_IMM_32.Value; + + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +class RVPWideningShiftW_ri<bits<3> f, string opcodestr> + : RVPWideningShift_ri<f, opcodestr, uimm6> { + bits<6> shamt; + + let Inst{26} = 0b1; + let Inst{25-20} = shamt; +} + +class RVPWideningShiftH_ri<bits<3> f, string opcodestr> + : RVPWideningShift_ri<f, opcodestr, uimm5> { + bits<5> shamt; + + let Inst{26-25} = 0b01; + let Inst{24-20} = shamt; +} + +class RVPWideningShiftB_ri<bits<3> f, string opcodestr> + : RVPWideningShift_ri<f, opcodestr, uimm4> { + bits<4> shamt; + + let Inst{26-24} = 0b001; + let Inst{23-20} = shamt; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPNarrowingShift_ri<bits<3> f, string opcodestr, Operand ImmType> + : RVPNarrowingBase<f, 0b0, 0b1100, (outs GPR:$rd), + (ins GPRPairRV32:$rs1, ImmType:$shamt), opcodestr, + "$rd, $rs1, $shamt">; + +class RVPNarrowingShiftW_ri<bits<3> f, string opcodestr> + : RVPNarrowingShift_ri<f, opcodestr, uimm6> { + bits<6> shamt; + + let Inst{26} = 0b1; + let Inst{25-20} = shamt; +} + +class RVPNarrowingShiftH_ri<bits<3> f, string opcodestr> + : RVPNarrowingShift_ri<f, opcodestr, uimm5> { + bits<5> shamt; + + let Inst{26-25} = 0b01; + let Inst{24-20} = shamt; +} + +class RVPNarrowingShiftB_ri<bits<3> f, string opcodestr> + : RVPNarrowingShift_ri<f, opcodestr, uimm4> { + bits<4> shamt; + + let Inst{26-24} = 0b001; + let Inst{23-20} = shamt; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPNarrowingShift_rr<bits<3> f, bits<2> w, string opcodestr> + : RVPNarrowingBase<f, 0b1, 0b1100, (outs GPR:$rd), + (ins GPRPairRV32:$rs1, GPR:$rs2), opcodestr, + "$rd, $rs1, $rs2"> { + bits<5> rs2; 
+ + let Inst{26-25} = w; + let Inst{24-20} = rs2; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPWideningShift_rr<bits<3> f, bits<2> w, string opcodestr> + : RVPWideningBase<w, 0b0, (outs GPRPairRV32:$rd), (ins GPR:$rs1, GPR:$rs2), + opcodestr> { + let Inst{30-28} = f; + let Inst{27} = 0b1; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1), opcodestr, "$rd, $rs1"> { @@ -169,6 +297,24 @@ class RVPBinary_rr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr> } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPWideningBinary_rr<bits<4> f, bits<2> w, string opcodestr> + : RVPWideningBase<w, 0b1, (outs GPRPairRV32:$rd), (ins GPR:$rs1, GPR:$rs2), + opcodestr> { + let Inst{30-27} = f; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPNarrowingBinary_rr<bits<3> f, bits<2> w, string opcodestr> + : RVPNarrowingBase<f, 0b1, 0b0100, (outs GPR:$rd), + (ins GPRPairRV32:$rs1, GPR:$rs2), opcodestr, + "$rd, $rs1, $rs2"> { + bits<5> rs2; + + let Inst{26-25} = w; + let Inst{24-20} = rs2; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr> : RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd_wb), (ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr, @@ -180,6 +326,15 @@ class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr> let Constraints = "$rd = $rd_wb"; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVPWideningTernary_rrr<bits<4> f, bits<2> w, string opcodestr> + : RVPWideningBase<w, 0b1, (outs GPRPairRV32:$rd_wb), + (ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr> { + let Inst{30-27} = f; + + let Constraints = "$rd = $rd_wb"; +} + // Common base for pli.db/h/w and plui.dh/w class RVPPairLoadImm_i<bits<7> funct7, dag ins, string opcodestr, string argstr> @@ -889,3 +1044,156 @@ let 
Predicates = [HasStdExtP, IsRV32] in { let Inst{23-15} = imm10{9-1}; } } + +let Predicates = [HasStdExtP, IsRV32] in { + def PWSLLI_B : RVPWideningShiftB_ri<0b000, "pwslli.b">; + def PWSLLI_H : RVPWideningShiftH_ri<0b000, "pwslli.h">; + def WSLLI : RVPWideningShiftW_ri<0b000, "wslli">; + + def PWSLAI_B : RVPWideningShiftB_ri<0b100, "pwslai.b">; + def PWSLAI_H : RVPWideningShiftH_ri<0b100, "pwslai.h">; + def WSLAI : RVPWideningShiftW_ri<0b100, "wslai">; + + def PWSLL_BS : RVPWideningShift_rr<0b000, 0b00, "pwsll.bs">; + def PWSLL_HS : RVPWideningShift_rr<0b000, 0b01, "pwsll.hs">; + def WSLL : RVPWideningShift_rr<0b000, 0b11, "wsll">; + + def PWSLA_BS : RVPWideningShift_rr<0b100, 0b00, "pwsla.bs">; + def PWSLA_HS : RVPWideningShift_rr<0b100, 0b01, "pwsla.hs">; + def WSLA : RVPWideningShift_rr<0b100, 0b11, "wsla">; + + def WZIP8P : RVPWideningShift_rr<0b111, 0b00, "wzip8p">; + def WZIP16P : RVPWideningShift_rr<0b111, 0b01, "wzip16p">; + + def PWADD_H : RVPWideningBinary_rr<0b0000, 0b00, "pwadd.h">; + def WADD : RVPWideningBinary_rr<0b0000, 0b01, "wadd">; + def PWADD_B : RVPWideningBinary_rr<0b0000, 0b10, "pwadd.b">; + def PM2WADD_H : RVPWideningBinary_rr<0b0000, 0b11, "pm2wadd.h">; + + def PWADDA_H : RVPWideningTernary_rrr<0b0001, 0b00, "pwadda.h">; + def WADDA : RVPWideningTernary_rrr<0b0001, 0b01, "wadda">; + def PWADDA_B : RVPWideningTernary_rrr<0b0001, 0b10, "pwadda.b">; + def PM2WADDA_H : RVPWideningTernary_rrr<0b0001, 0b11, "pm2wadda.h">; + + def PWADDU_H : RVPWideningBinary_rr<0b0010, 0b00, "pwaddu.h">; + def WADDU : RVPWideningBinary_rr<0b0010, 0b01, "waddu">; + def PWADDU_B : RVPWideningBinary_rr<0b0010, 0b10, "pwaddu.b">; + def PM2WADD_HX : RVPWideningBinary_rr<0b0010, 0b11, "pm2wadd.hx">; + + def PWADDAU_H : RVPWideningTernary_rrr<0b0011, 0b00, "pwaddau.h">; + def WADDAU : RVPWideningTernary_rrr<0b0011, 0b01, "waddau">; + def PWADDAU_B : RVPWideningTernary_rrr<0b0011, 0b10, "pwaddau.b">; + def PM2WADDA_HX : RVPWideningTernary_rrr<0b0011, 0b11, 
"pm2wadda.hx">; + + def PWMUL_H : RVPWideningBinary_rr<0b0100, 0b00, "pwmul.h">; + def WMUL : RVPWideningBinary_rr<0b0100, 0b01, "wmul">; + def PWMUL_B : RVPWideningBinary_rr<0b0100, 0b10, "pwmul.b">; + def PM2WADDU_H : RVPWideningBinary_rr<0b0100, 0b11, "pm2waddu.h">; + + def PWMACC_H : RVPWideningTernary_rrr<0b0101, 0b00, "pwmacc.h">; + def WMACC : RVPWideningTernary_rrr<0b0101, 0b01, "wmacc">; + def PM2WADDAU_H : RVPWideningTernary_rrr<0b0101, 0b11, "pm2waddau.h">; + + def PWMULU_H : RVPWideningBinary_rr<0b0110, 0b00, "pwmulu.h">; + def WMULU : RVPWideningBinary_rr<0b0110, 0b01, "wmulu">; + def PWMULU_B : RVPWideningBinary_rr<0b0110, 0b10, "pwmulu.b">; + + def PWMACCU_H : RVPWideningTernary_rrr<0b0111, 0b00, "pwmaccu.h">; + def WMACCU : RVPWideningTernary_rrr<0b0111, 0b01, "wmaccu">; + + def PWSUB_H : RVPWideningBinary_rr<0b1000, 0b00, "pwsub.h">; + def WSUB : RVPWideningBinary_rr<0b1000, 0b01, "wsub">; + def PWSUB_B : RVPWideningBinary_rr<0b1000, 0b10, "pwsub.b">; + def PM2WSUB_H : RVPWideningBinary_rr<0b1000, 0b11, "pm2wsub.h">; + + def PWSUBA_H : RVPWideningTernary_rrr<0b1001, 0b00, "pwsuba.h">; + def WSUBA : RVPWideningTernary_rrr<0b1001, 0b01, "wsuba">; + def PWSUBA_B : RVPWideningTernary_rrr<0b1001, 0b10, "pwsuba.b">; + def PM2WSUBA_H : RVPWideningTernary_rrr<0b1001, 0b11, "pm2wsuba.h">; + + def PWSUBU_H : RVPWideningBinary_rr<0b1010, 0b00, "pwsubu.h">; + def WSUBU : RVPWideningBinary_rr<0b1010, 0b01, "wsubu">; + def PWSUBU_B : RVPWideningBinary_rr<0b1010, 0b10, "pwsubu.b">; + def PM2WSUB_HX : RVPWideningBinary_rr<0b1010, 0b11, "pm2wsub.hx">; + + def PWSUBAU_H : RVPWideningTernary_rrr<0b1011, 0b00, "pwsubau.h">; + def WSUBAU : RVPWideningTernary_rrr<0b1011, 0b01, "wsubau">; + def PWSUBAU_B : RVPWideningTernary_rrr<0b1011, 0b10, "pwsubau.b">; + def PM2WSUBA_HX : RVPWideningTernary_rrr<0b1011, 0b11, "pm2wsuba.hx">; + + def PWMULSU_H : RVPWideningBinary_rr<0b1100, 0b00, "pwmulsu.h">; + def WMULSU : RVPWideningBinary_rr<0b1100, 0b01, "wmulsu">; + def PWMULSU_B 
: RVPWideningBinary_rr<0b1100, 0b10, "pwmulsu.b">; + def PM2WADDSU_H : RVPWideningBinary_rr<0b1100, 0b11, "pm2waddsu.h">; + + def PWMACCSU_H : RVPWideningTernary_rrr<0b1101, 0b00, "pwmaccsu.h">; + def WMACCSU : RVPWideningTernary_rrr<0b1101, 0b01, "wmaccsu">; + def PM2WADDASU_H : RVPWideningTernary_rrr<0b1101, 0b11, "pm2waddasu.h">; + + def PMQWACC_H : RVPWideningTernary_rrr<0b1111, 0b00, "pmqwacc.h">; + def PMQWACC : RVPWideningTernary_rrr<0b1111, 0b01, "pmqwacc">; + def PMQRWACC_H : RVPWideningTernary_rrr<0b1111, 0b10, "pmqrwacc.h">; + def PMQRWACC : RVPWideningTernary_rrr<0b1111, 0b11, "pmqrwacc">; + + def PREDSUM_DHS : RVPNarrowingBinary_rr<0b001, 0b00, "predsum.dhs">; + def PREDSUM_DBS : RVPNarrowingBinary_rr<0b001, 0b10, "predsum.dbs">; + + def PREDSUMU_DHS : RVPNarrowingBinary_rr<0b011, 0b00, "predsumu.dhs">; + def PREDSUMU_DBS : RVPNarrowingBinary_rr<0b011, 0b10, "predsumu.dbs">; + + def PNSRLI_B : RVPNarrowingShiftB_ri<0b000, "pnsrli.b">; + def PNSRLI_H : RVPNarrowingShiftH_ri<0b000, "pnsrli.h">; + def NSRLI : RVPNarrowingShiftW_ri<0b000, "nsrli">; + + def PNCLIPIU_B : RVPNarrowingShiftB_ri<0b010, "pnclipiu.b">; + def PNCLIPIU_H : RVPNarrowingShiftH_ri<0b010, "pnclipiu.h">; + def NCLIPIU : RVPNarrowingShiftW_ri<0b010, "nclipiu">; + + def PNCLIPRIU_B : RVPNarrowingShiftB_ri<0b011, "pnclipriu.b">; + def PNCLIPRIU_H : RVPNarrowingShiftH_ri<0b011, "pnclipriu.h">; + def NCLIPRIU : RVPNarrowingShiftW_ri<0b011, "nclipriu">; + + def PNSRAI_B : RVPNarrowingShiftB_ri<0b100, "pnsrai.b">; + def PNSRAI_H : RVPNarrowingShiftH_ri<0b100, "pnsrai.h">; + def NSRAI : RVPNarrowingShiftW_ri<0b100, "nsrai">; + + def PNSARI_B : RVPNarrowingShiftB_ri<0b101, "pnsari.b">; + def PNSARI_H : RVPNarrowingShiftH_ri<0b101, "pnsari.h">; + def NSARI : RVPNarrowingShiftW_ri<0b101, "nsari">; + + def PNCLIPI_B : RVPNarrowingShiftB_ri<0b110, "pnclipi.b">; + def PNCLIPI_H : RVPNarrowingShiftH_ri<0b110, "pnclipi.h">; + def NCLIPI : RVPNarrowingShiftW_ri<0b110, "nclipi">; + + def PNCLIPRI_B : 
RVPNarrowingShiftB_ri<0b111, "pnclipri.b">; + def PNCLIPRI_H : RVPNarrowingShiftH_ri<0b111, "pnclipri.h">; + def NCLIPRI : RVPNarrowingShiftW_ri<0b111, "nclipri">; + + def PNSRL_BS : RVPNarrowingShift_rr<0b000, 0b00, "pnsrl.bs">; + def PNSRL_HS : RVPNarrowingShift_rr<0b000, 0b01, "pnsrl.hs">; + def NSRL : RVPNarrowingShift_rr<0b000, 0b11, "nsrl">; + + def PNCLIPU_BS : RVPNarrowingShift_rr<0b010, 0b00, "pnclipu.bs">; + def PNCLIPU_HS : RVPNarrowingShift_rr<0b010, 0b01, "pnclipu.hs">; + def NCLIPU : RVPNarrowingShift_rr<0b010, 0b11, "nclipu">; + + def PNCLIPRU_BS : RVPNarrowingShift_rr<0b011, 0b00, "pnclipru.bs">; + def PNCLIPRU_HS : RVPNarrowingShift_rr<0b011, 0b01, "pnclipru.hs">; + def NCLIPRU : RVPNarrowingShift_rr<0b011, 0b11, "nclipru">; + + def PNSRA_BS : RVPNarrowingShift_rr<0b100, 0b00, "pnsra.bs">; + def PNSRA_HS : RVPNarrowingShift_rr<0b100, 0b01, "pnsra.hs">; + def NSRA : RVPNarrowingShift_rr<0b100, 0b11, "nsra">; + + def PNSRAR_BS : RVPNarrowingShift_rr<0b101, 0b00, "pnsrar.bs">; + def PNSRAR_HS : RVPNarrowingShift_rr<0b101, 0b01, "pnsrar.hs">; + def NSRAR : RVPNarrowingShift_rr<0b101, 0b11, "nsrar">; + + def PNCLIP_BS : RVPNarrowingShift_rr<0b110, 0b00, "pnclip.bs">; + def PNCLIP_HS : RVPNarrowingShift_rr<0b110, 0b01, "pnclip.hs">; + def NCLIP : RVPNarrowingShift_rr<0b110, 0b11, "nclip">; + + def PNCLIPR_BS : RVPNarrowingShift_rr<0b111, 0b00, "pnclipr.bs">; + def PNCLIPR_HS : RVPNarrowingShift_rr<0b111, 0b01, "pnclipr.hs">; + def NCLIPR : RVPNarrowingShift_rr<0b111, 0b11, "nclipr">; +} // Predicates = [HasStdExtP, IsRV32] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 32f533b8f114..f732ab13e5f8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -44,153 +44,95 @@ def PseudoCCMOVGPRNoX0 : Pseudo<(outs GPRNoX0:$dst), Sched<[]>; } +class SFBALU_rr + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, + 
GPR:$rs2), []>, + Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU, + ReadSFBALU]> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + +class SFBALU_ri + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, + simm12:$imm), []>, + Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + +class SFBShift_ri + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, + uimmlog2xlen:$imm), []>, + Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + +class SFBShiftW_ri + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, + uimm5:$imm), []>, + Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + // Conditional binops that update $dst to (op rs1, rs2) when condition // is true. Returns $falsev otherwise. Selected by optimizeSelect. // TODO: Can we use DefaultOperands on the regular binop to accomplish this more // like how ARM does predication? 
-let Predicates = [HasShortForwardBranchOpt], hasSideEffects = 0, - mayLoad = 0, mayStore = 0, Size = 8, Constraints = "$dst = $falsev" in { -def PseudoCCADD : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; -def PseudoCCSUB : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; -def PseudoCCSLL : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU, ReadSFBALU]>; -def PseudoCCSRL : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU, ReadSFBALU]>; -def PseudoCCSRA : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU, ReadSFBALU]>; -def PseudoCCAND : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; -def PseudoCCOR : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; -def PseudoCCXOR : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; +let Predicates = [HasShortForwardBranchOpt] in { +def PseudoCCADD : SFBALU_rr; +def PseudoCCSUB : SFBALU_rr; +def PseudoCCSLL : SFBALU_rr; +def PseudoCCSRL : SFBALU_rr; +def PseudoCCSRA 
: SFBALU_rr; +def PseudoCCAND : SFBALU_rr; +def PseudoCCOR : SFBALU_rr; +def PseudoCCXOR : SFBALU_rr; -def PseudoCCADDI : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCSLLI : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCSRLI : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCSRAI : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCANDI : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCORI : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCXORI : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; +def PseudoCCADDI : SFBALU_ri; +def PseudoCCANDI : SFBALU_ri; +def PseudoCCORI : SFBALU_ri; +def PseudoCCXORI : SFBALU_ri; + +def PseudoCCSLLI : SFBShift_ri; +def PseudoCCSRLI : SFBShift_ri; +def PseudoCCSRAI : SFBShift_ri; // RV64I instructions -def PseudoCCADDW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; -def 
PseudoCCSUBW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; -def PseudoCCSLLW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU, ReadSFBALU]>; -def PseudoCCSRLW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU, ReadSFBALU]>; -def PseudoCCSRAW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU, ReadSFBALU]>; +def PseudoCCADDW : SFBALU_rr; +def PseudoCCSUBW : SFBALU_rr; +def PseudoCCSLLW : SFBALU_rr; +def PseudoCCSRLW : SFBALU_rr; +def PseudoCCSRAW : SFBALU_rr; + +def PseudoCCADDIW : SFBALU_ri; -def PseudoCCADDIW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCSLLIW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCSRLIW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; -def PseudoCCSRAIW : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, - ReadSFBALU]>; +def PseudoCCSLLIW : SFBShiftW_ri; +def PseudoCCSRLIW : SFBShiftW_ri; +def PseudoCCSRAIW : SFBShiftW_ri; // Zbb/Zbkb instructions -def PseudoCCANDN : Pseudo<(outs 
GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; -def PseudoCCORN : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; -def PseudoCCXNOR : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, - Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, - ReadSFBALU, ReadSFBALU, ReadSFBALU]>; +def PseudoCCANDN : SFBALU_rr; +def PseudoCCORN : SFBALU_rr; +def PseudoCCXNOR : SFBALU_rr; } let Predicates = [HasShortForwardBranchOpt] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index acbccddce2b5..063ee5c5e8b9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -830,19 +830,6 @@ multiclass VPatTiedBinaryNoMaskVL_V<SDNode vop, result_reg_class:$rs1, op2_reg_class:$rs2, GPR:$vl, sew, TAIL_AGNOSTIC)>; - // Tail undisturbed - def : Pat<(riscv_vmerge_vl true_mask, - (result_type (vop - result_reg_class:$rs1, - (op2_type op2_reg_class:$rs2), - srcvalue, - true_mask, - VLOpFrag)), - result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag), - (!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_TIED") - result_reg_class:$rs1, - op2_reg_class:$rs2, - GPR:$vl, sew, TU_MU)>; } class VPatTiedBinaryMaskVL_V<SDNode vop, @@ -892,22 +879,6 @@ multiclass VPatTiedBinaryNoMaskVL_V_RM<SDNode vop, // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, log2sew, TAIL_AGNOSTIC)>; - // Tail undisturbed - def : Pat<(riscv_vmerge_vl true_mask, - (result_type (vop - result_reg_class:$rs1, - (op2_type op2_reg_class:$rs2), - srcvalue, - true_mask, - VLOpFrag)), - result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag), - (!cast<Instruction>(name) - 
result_reg_class:$rs1, - op2_reg_class:$rs2, - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, log2sew, TU_MU)>; } class VPatBinaryVL_XI<SDPatternOperator vop, @@ -1755,50 +1726,6 @@ multiclass VPatMultiplyAddVL_VV_VX<SDNode op, string instruction_name> { } } -multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> { - foreach vti = AllIntegerVectors in { - defvar suffix = vti.LMul.MX; - let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), - (vti.Vector (op vti.RegClass:$rd, - (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2, - srcvalue, (vti.Mask true_mask), VLOpFrag), - srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), - (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), - (vti.Vector (op vti.RegClass:$rd, - (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2, - srcvalue, (vti.Mask true_mask), VLOpFrag), - srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), - (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK") - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), - (vti.Vector (op vti.RegClass:$rd, - (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2, - srcvalue, (vti.Mask true_mask), VLOpFrag), - srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, undef, VLOpFrag), - (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), - (vti.Vector (op vti.RegClass:$rd, - 
(riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2, - srcvalue, (vti.Mask true_mask), VLOpFrag), - srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, undef, VLOpFrag), - (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK") - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - } - } -} - multiclass VPatWidenMultiplyAddVL_VV_VX<SDNode vwmacc_op, string instr_name> { foreach vtiTowti = AllWidenableIntVectors in { defvar vti = vtiTowti.Vti; @@ -1898,82 +1825,6 @@ multiclass VPatFPMulAddVL_VV_VF_RM<SDPatternOperator vop, string instruction_nam } } -multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> { - foreach vti = AllFloatVectors in { - defvar suffix = vti.LMul.MX # "_E" # vti.SEW; - let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), - (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, - vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), - (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), - (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, - vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), - (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), - (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, - 
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, undef, VLOpFrag), - (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") - vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), - (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, - vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, undef, VLOpFrag), - (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") - vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - } - } -} - -multiclass VPatWidenFPMulAccVL_VV_VF<SDNode vop, string instruction_name> { - foreach vtiToWti = AllWidenableFloatVectors in { - defvar vti = vtiToWti.Vti; - defvar wti = vtiToWti.Wti; - let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates, - GetVTypePredicates<wti>.Predicates) in { - def : Pat<(vop (vti.Vector vti.RegClass:$rs1), - (vti.Vector vti.RegClass:$rs2), - (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX #"_MASK") - wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; - def : Pat<(vop (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), - (vti.Vector vti.RegClass:$rs2), - (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast<Instruction>(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX #"_MASK") - wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; - } - } -} - multiclass VPatWidenFPMulAccVL_VV_VF_RM<SDNode vop, string instruction_name, 
list<VTypeInfoToWide> vtiToWtis = AllWidenableFloatVectors> { @@ -2331,8 +2182,6 @@ defm : VPatBinaryWVL_VV_VX<riscv_vwmulsu_vl, "PseudoVWMULSU">; // 11.13 Vector Single-Width Integer Multiply-Add Instructions defm : VPatMultiplyAddVL_VV_VX<riscv_add_vl, "PseudoVMADD">; defm : VPatMultiplyAddVL_VV_VX<riscv_sub_vl, "PseudoVNMSUB">; -defm : VPatMultiplyAccVL_VV_VX<riscv_add_vl_oneuse, "PseudoVMACC">; -defm : VPatMultiplyAccVL_VV_VX<riscv_sub_vl_oneuse, "PseudoVNMSAC">; // 11.14. Vector Widening Integer Multiply-Add Instructions defm : VPatWidenMultiplyAddVL_VV_VX<riscv_vwmacc_vl, "PseudoVWMACC">; @@ -2470,10 +2319,6 @@ defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfmadd_vl, "PseudoVFMADD">; defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfmsub_vl, "PseudoVFMSUB">; defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfnmadd_vl, "PseudoVFNMADD">; defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfnmsub_vl, "PseudoVFNMSUB">; -defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfmadd_vl_oneuse, "PseudoVFMACC">; -defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfmsub_vl_oneuse, "PseudoVFMSAC">; -defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfnmadd_vl_oneuse, "PseudoVFNMACC">; -defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfnmsub_vl_oneuse, "PseudoVFNMSAC">; // 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions defm : VPatWidenFPMulAccVL_VV_VF_RM<riscv_vfwmadd_vl, "PseudoVFWMACC">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td index 889ea9802257..d615094329b2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td @@ -125,10 +125,25 @@ class Mips_prefetch_ri<dag outs, dag ins, string opcodestr, string argstr> let Inst{6-0} = OPC_CUSTOM_0.Value; } +// MIPS Custom Barrier Insns Format. 
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class MIPSExtInst_ri<bits<6> shimm5, string opcodestr> + : RVInstIShift<0b00000, 0b001, OPC_OP_IMM, (outs), (ins), opcodestr, ""> { + let shamt = shimm5; + let rd = 0; + let rs1 = 0; +} + //===----------------------------------------------------------------------===// // MIPS extensions //===----------------------------------------------------------------------===// -let Predicates = [HasVendorXMIPSCBOP] ,DecoderNamespace = "Xmipscbop" in { +let Predicates = [HasVendorXMIPSEXECTL], DecoderNamespace = "XMIPS" in { + def MIPS_EHB : MIPSExtInst_ri<0b000011, "mips.ehb">; + def MIPS_IHB : MIPSExtInst_ri<0b000001, "mips.ihb">; + def MIPS_PAUSE : MIPSExtInst_ri<0b000101, "mips.pause">; +} + +let Predicates = [HasVendorXMIPSCBOP], DecoderNamespace = "XMIPS" in { def MIPS_PREF : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint), "mips.pref", "$hint, ${imm9}(${rs1})">, Sched<[]>; @@ -146,7 +161,7 @@ let Predicates = [HasVendorXMIPSCBOP] in { } let Predicates = [HasVendorXMIPSCMov], hasSideEffects = 0, mayLoad = 0, mayStore = 0, - DecoderNamespace = "Xmipscmov" in { + DecoderNamespace = "XMIPS" in { def MIPS_CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2, GPR:$rs3), "mips.ccmov", "$rd, $rs2, $rs1, $rs3">, @@ -166,7 +181,7 @@ def : Pat<(select (XLenVT GPR:$rs2), (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)), } let Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0, - DecoderNamespace = "Xmipslsp" in { + DecoderNamespace = "XMIPS" in { let mayLoad = 1, mayStore = 0 in { def MIPS_LWP : LWPFormat<(outs GPR:$rd1, GPR:$rd2), (ins GPR:$rs1, uimm7_lsb00:$imm7), "mips.lwp", "$rd1, $rd2, ${imm7}(${rs1})">, @@ -184,4 +199,4 @@ def MIPS_SDP : SDPFormat<(outs), (ins GPR:$rs2, GPR:$rs3, GPR:$rs1, uimm7_lsb000 "mips.sdp", "$rs2, $rs3, ${imm7}(${rs1})">, Sched<[WriteSTD, ReadStoreData, ReadStoreData, ReadMemBase]>; } // mayLoad = 0, mayStore = 1 -} // Predicates = [HasVendorXMIPSLSP], 
hasSideEffects = 0, DecoderNamespace = "Xmipslsp" +} // Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0, DecoderNamespace = "XMIPS" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 2c64b0c220fb..69796a68ecd6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -22,6 +22,15 @@ def SDT_SetMultiple : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>, def qc_setwmi : RVSDNode<"QC_SETWMI", SDT_SetMultiple, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def qc_insb : RVSDNode<"QC_INSB", SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisVT<0, i32>, + SDTCisInt<3>, + SDTCisInt<4>]>, + []>; + +def qc_e_li : RVSDNode<"QC_E_LI", SDTIntUnaryOp>; + def uimm5nonzero : RISCVOp<XLenVT>, ImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]> { let ParserMatchClass = UImmAsmOperand<5, "NonZero">; @@ -1508,6 +1517,11 @@ def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>; def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>; def : Pat<(i32 (bitreverse GPRNoX0:$rs1)), (QC_BREV32 GPRNoX0:$rs1)>; + +def : Pat<(qc_insb GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt), + (QC_INSBI GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt)>; +def : Pat<(qc_insb GPRNoX0:$rd, GPR:$rs1, uimm5_plus1:$width, uimm5:$shamt), + (QC_INSB GPRNoX0:$rd, GPR:$rs1, uimm5_plus1:$width, uimm5:$shamt)>; } // Predicates = [HasVendorXqcibm, IsRV32] // If Zbb is enabled sext.b/h is preferred since they are compressible @@ -1605,6 +1619,13 @@ def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uim (QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>; } // Predicates = [HasVendorXqcilsm, IsRV32] +let Predicates = [HasVendorXqcili, IsRV32] in { +def: Pat<(qc_e_li tglobaladdr:$A), (QC_E_LI bare_simm32:$A)>; +def: Pat<(qc_e_li tblockaddress:$A), (QC_E_LI bare_simm32:$A)>; +def: 
Pat<(qc_e_li tjumptable:$A), (QC_E_LI bare_simm32:$A)>; +def: Pat<(qc_e_li tconstpool:$A), (QC_E_LI bare_simm32:$A)>; +} // Predicates = [HasVendorXqcili, IsRV32] + //===----------------------------------------------------------------------===/i // Compress Instruction tablegen backend. //===----------------------------------------------------------------------===// @@ -1738,10 +1759,19 @@ def : CompressPat<(QC_E_XORAI GPRNoX0:$rd, simm12:$imm), (XORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; } // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32] -let Predicates = [HasVendorXqciac, IsRV32] in { +let isCompressOnly = true, Predicates = [HasVendorXqciac, IsRV32] in { def : CompressPat<(QC_MULIADD GPRC:$rd, GPRC:$rs1, uimm5:$imm5), (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, uimm5:$imm5)>; -} +} // isCompressOnly = true, Predicates = [HasVendorXqciac, IsRV32] + +let isCompressOnly = true, Predicates = [HasVendorXqciac, HasStdExtZba, IsRV32] in { +def : CompressPat<(SH1ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd), + (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 2)>; +def : CompressPat<(SH2ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd), + (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 4)>; +def : CompressPat<(SH3ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd), + (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 8)>; +} // isCompressOnly = true, Predicates = [HasVendorXqciac, HasStdExtZba, IsRV32] let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32] in { def : CompressPat<(QC_E_BEQI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td index a43cbadf6f30..bb1862cc88d6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td @@ -106,6 +106,7 @@ def QK_C_LBUSP : QKStackInst<0b00, (outs GPRC:$rd_rs2), (ins SPMem:$rs1, uimm4:$imm), "qk.c.lbusp", "$rd_rs2, ${imm}(${rs1})">, Sched<[WriteLDB, ReadMemBase]> { + bits<0> rs1; bits<4> imm; let Inst{10-7} = imm; } @@ -115,6 
+116,7 @@ def QK_C_SBSP : QKStackInst<0b10, (outs), uimm4:$imm), "qk.c.sbsp", "$rd_rs2, ${imm}(${rs1})">, Sched<[WriteSTB, ReadStoreData, ReadMemBase]> { + bits<0> rs1; bits<4> imm; let Inst{10-7} = imm; } @@ -124,6 +126,7 @@ def QK_C_LHUSP : QKStackInst<0b01, (outs GPRC:$rd_rs2), (ins SPMem:$rs1, uimm5_lsb0:$imm), "qk.c.lhusp", "$rd_rs2, ${imm}(${rs1})">, Sched<[WriteLDH, ReadMemBase]> { + bits<0> rs1; bits<5> imm; let Inst{10-8} = imm{3-1}; let Inst{7} = imm{4}; @@ -133,6 +136,7 @@ def QK_C_SHSP : QKStackInst<0b11, (outs), (ins GPRC:$rd_rs2, SPMem:$rs1, uimm5_lsb0:$imm), "qk.c.shsp", "$rd_rs2, ${imm}(${rs1})">, Sched<[WriteSTH, ReadStoreData, ReadMemBase]> { + bits<0> rs1; bits<5> imm; let Inst{10-8} = imm{3-1}; let Inst{7} = imm{4}; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 2abd3e613a03..a2b4302e19ed 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -459,15 +459,15 @@ let Predicates = [HasStdExtZba, IsRV64] in { def : InstAlias<"zext.w $rd, $rs", (ADD_UW GPR:$rd, GPR:$rs, X0)>; } // Predicates = [HasStdExtZba, IsRV64] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrZbkb] in { def : InstAlias<"ror $rd, $rs1, $shamt", - (RORI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>; -} // Predicates = [HasStdExtZbb] + (RORI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>; +} // Predicates = [HasStdExtZbbOrZbkb] -let Predicates = [HasStdExtZbb, IsRV64] in { +let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in { def : InstAlias<"rorw $rd, $rs1, $shamt", - (RORIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>; -} // Predicates = [HasStdExtZbb, IsRV64] + (RORIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>; +} // Predicates = [HasStdExtZbbOrZbkb, IsRV64] let Predicates = [HasStdExtZbs] in { def : InstAlias<"bset $rd, $rs1, $shamt", diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td index 32e7f962aa2a..76dc027ffd1d 
100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td @@ -22,5 +22,5 @@ class CMOPInst<bits<3> imm3, string opcodestr> foreach n = [1, 3, 5, 7, 9, 11, 13, 15] in { let Predicates = [HasStdExtZcmop] in - def C_MOP # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>; + def C_MOP_ # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td index 49a57f86cccd..50ebaa995197 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td @@ -62,6 +62,21 @@ defm SSAMOSWAP_W : AMO_rr_aq_rl<0b01001, 0b010, "ssamoswap.w">; let Predicates = [HasStdExtZicfiss, IsRV64] in defm SSAMOSWAP_D : AMO_rr_aq_rl<0b01001, 0b011, "ssamoswap.d">; +let Predicates = [HasStdExtZimop] in { +let hasSideEffects = 1, mayLoad = 0, mayStore = 1 in +def PseudoMOP_SSPUSH : Pseudo<(outs), (ins GPRX1X5:$rs2), []>, + PseudoInstExpansion<(MOP_RR_7 X0, X0, GPR:$rs2)>; +let hasSideEffects = 1, mayLoad = 1, mayStore = 0 in +def PseudoMOP_SSPOPCHK : Pseudo<(outs), (ins GPRX1X5:$rs1), []>, + PseudoInstExpansion<(MOP_R_28 X0, GPR:$rs1)>; +} // Predicates = [HasStdExtZimop] + +let Predicates = [HasStdExtZcmop] in { +let Uses = [X1], hasSideEffects = 1, mayLoad = 0, mayStore = 1 in +def PseudoMOP_C_SSPUSH : Pseudo<(outs), (ins), []>, + PseudoInstExpansion<(C_MOP_1)>; +} // Predicates = [HasStdExtZcmop] + //===----------------------------------------------------------------------===/ // Compress Instruction tablegen backend. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td index 960f5669b488..0d08176f9799 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td @@ -33,13 +33,13 @@ class RVInstRMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, RISCVOpcode opcod } // May-Be-Operations -def riscv_mopr : RVSDNode<"MOPR", - SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>>; -def riscv_moprr : RVSDNode<"MOPRR", - SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>]>>; +def riscv_mop_r : RVSDNode<"MOP_R", + SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>>; +def riscv_mop_rr : RVSDNode<"MOP_RR", + SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3, @@ -50,31 +50,32 @@ class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3, let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, RISCVOpcode opcode, string opcodestr> - : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2), + : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd), + (ins GPR:$rs1, GPR:$rs2), opcodestr, "$rd, $rs1, $rs2">; foreach i = 0...31 in { let Predicates = [HasStdExtZimop] in - def MOPR#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>, - Sched<[]>; + def MOP_R_#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>, + Sched<[]>; } foreach i = 0...7 in { let Predicates = [HasStdExtZimop] in - def MOPRR#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>, + def MOP_RR_#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>, Sched<[]>; } let Predicates = [HasStdExtZimop] in { // 
Zimop instructions foreach i = 0...31 in { - def : Pat<(XLenVT (riscv_mopr GPR:$rs1, (XLenVT i))), - (!cast<Instruction>("MOPR"#i) GPR:$rs1)>; + def : Pat<(XLenVT (riscv_mop_r GPR:$rs1, (XLenVT i))), + (!cast<Instruction>("MOP_R_"#i) GPR:$rs1)>; } foreach i = 0...7 in { - def : Pat<(XLenVT (riscv_moprr GPR:$rs1, GPR:$rs2, (XLenVT i))), - (!cast<Instruction>("MOPRR"#i) GPR:$rs1, GPR:$rs2)>; + def : Pat<(XLenVT (riscv_mop_rr GPR:$rs1, GPR:$rs2, (XLenVT i))), + (!cast<Instruction>("MOP_RR_"#i) GPR:$rs1, GPR:$rs2)>; } } // Predicates = [HasStdExtZimop] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td index 27959eaccd90..00c4e83e18a0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td @@ -17,16 +17,39 @@ // Instructions //===----------------------------------------------------------------------===// +class VQDOTVV<bits<6> funct6, RISCVVFormat opv, string opcodestr> + : RVInstVV<funct6, opv, (outs VR:$vd_wb), + (ins VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), + opcodestr, "$vd, $vs2, $vs1$vm"> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = "$vd = $vd_wb"; +} + +class VQDOTVX<bits<6> funct6, RISCVVFormat opv, string opcodestr> + : RVInstVX<funct6, opv, (outs VR:$vd_wb), + (ins VR:$vd, VR:$vs2, GPR:$rs1, VMaskOp:$vm), + opcodestr, "$vd, $vs2, $rs1$vm"> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = "$vd = $vd_wb"; +} + let Predicates = [HasStdExtZvqdotq] in { - def VQDOT_VV : VALUVV<0b101100, OPMVV, "vqdot.vv">; - def VQDOT_VX : VALUVX<0b101100, OPMVX, "vqdot.vx">; - def VQDOTU_VV : VALUVV<0b101000, OPMVV, "vqdotu.vv">; - def VQDOTU_VX : VALUVX<0b101000, OPMVX, "vqdotu.vx">; - def VQDOTSU_VV : VALUVV<0b101010, OPMVV, "vqdotsu.vv">; - def VQDOTSU_VX : VALUVX<0b101010, OPMVX, "vqdotsu.vx">; - def VQDOTUS_VX : VALUVX<0b101110, OPMVX, "vqdotus.vx">; + def VQDOT_VV : VQDOTVV<0b101100, 
OPMVV, "vqdot.vv">; + def VQDOT_VX : VQDOTVX<0b101100, OPMVX, "vqdot.vx">; + def VQDOTU_VV : VQDOTVV<0b101000, OPMVV, "vqdotu.vv">; + def VQDOTU_VX : VQDOTVX<0b101000, OPMVX, "vqdotu.vx">; + def VQDOTSU_VV : VQDOTVV<0b101010, OPMVV, "vqdotsu.vv">; + def VQDOTSU_VX : VQDOTVX<0b101010, OPMVX, "vqdotsu.vx">; + def VQDOTUS_VX : VQDOTVX<0b101110, OPMVX, "vqdotus.vx">; } // Predicates = [HasStdExtZvqdotq] +//===----------------------------------------------------------------------===// +// Helpers to define the VL patterns. +//===----------------------------------------------------------------------===// let HasPassthruOp = true, HasMaskOp = true in { def riscv_vqdot_vl : RVSDNode<"VQDOT_VL", SDT_RISCVIntBinOp_VL>; @@ -34,6 +57,10 @@ let HasPassthruOp = true, HasMaskOp = true in { def riscv_vqdotsu_vl : RVSDNode<"VQDOTSU_VL", SDT_RISCVIntBinOp_VL>; } // let HasPassthruOp = true, HasMaskOp = true +//===----------------------------------------------------------------------===// +// Pseudo Instructions for CodeGen +//===----------------------------------------------------------------------===// + multiclass VPseudoVQDOT_VV_VX { foreach m = MxSet<32>.m in { defm "" : VPseudoBinaryV_VV<m>, @@ -52,10 +79,69 @@ let Predicates = [HasStdExtZvqdotq], mayLoad = 0, mayStore = 0, defm PseudoVQDOT : VPseudoVQDOT_VV_VX; defm PseudoVQDOTU : VPseudoVQDOT_VV_VX; defm PseudoVQDOTSU : VPseudoVQDOT_VV_VX; + // VQDOTUS does not have a VV variant + foreach m = MxListVF4 in { + defm "PseudoVQDOTUS_VX" : VPseudoTernaryWithPolicy<m.vrclass, m.vrclass, GPR, m>; + } } +//===----------------------------------------------------------------------===// +// Patterns. 
+//===----------------------------------------------------------------------===// + defvar AllE32Vectors = [VI32MF2, VI32M1, VI32M2, VI32M4, VI32M8]; defm : VPatBinaryVL_VV_VX<riscv_vqdot_vl, "PseudoVQDOT", AllE32Vectors>; defm : VPatBinaryVL_VV_VX<riscv_vqdotu_vl, "PseudoVQDOTU", AllE32Vectors>; defm : VPatBinaryVL_VV_VX<riscv_vqdotsu_vl, "PseudoVQDOTSU", AllE32Vectors>; +// These VPat definitions are for vqdot because they have a different operand +// order with other ternary instructions (i.e. vop.vx vd, vs2, rs1) +multiclass VPatTernaryV_VX_AABX<string intrinsic, string instruction, + list<VTypeInfoToWide> info_pairs> { + foreach pair = info_pairs in { + defvar VdInfo = pair.Wti; + defvar Vs2Info = pair.Vti; + let Predicates = GetVTypePredicates<VdInfo>.Predicates in + defm : VPatTernaryWithPolicy<intrinsic, instruction, + "V"#VdInfo.ScalarSuffix, + VdInfo.Vector, Vs2Info.Vector, Vs2Info.Scalar, + VdInfo.Mask, VdInfo.Log2SEW, VdInfo.LMul, + VdInfo.RegClass, Vs2Info.RegClass, + Vs2Info.ScalarRegClass>; + } +} + +multiclass VPatTernaryV_VV_AABX<string intrinsic, string instruction, + list<VTypeInfoToWide> info_pairs> { + foreach pair = info_pairs in { + defvar VdInfo = pair.Wti; + defvar Vs2Info = pair.Vti; + let Predicates = GetVTypePredicates<VdInfo>.Predicates in + defm : VPatTernaryWithPolicy<intrinsic, instruction, + "VV", + VdInfo.Vector, Vs2Info.Vector, Vs2Info.Vector, + VdInfo.Mask, VdInfo.Log2SEW, VdInfo.LMul, + VdInfo.RegClass, Vs2Info.RegClass, + Vs2Info.RegClass>; + } +} + +multiclass VPatTernaryV_VV_VX_AABX<string intrinsic, string instruction, + list<VTypeInfoToWide> info_pairs> + : VPatTernaryV_VV_AABX<intrinsic, instruction, info_pairs>, + VPatTernaryV_VX_AABX<intrinsic, instruction, info_pairs>; + +defset list<VTypeInfoToWide> VQDOTInfoPairs = { + def : VTypeInfoToWide<VI8MF2, VI32MF2>; + def : VTypeInfoToWide<VI8M1, VI32M1>; + def : VTypeInfoToWide<VI8M2, VI32M2>; + def : VTypeInfoToWide<VI8M4, VI32M4>; + def : VTypeInfoToWide<VI8M8, VI32M8>; +} 
+ +let Predicates = [HasStdExtZvqdotq] in { + defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdot", "PseudoVQDOT", VQDOTInfoPairs>; + defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdotu", "PseudoVQDOTU", VQDOTInfoPairs>; + defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdotsu", "PseudoVQDOTSU", VQDOTInfoPairs>; + defm : VPatTernaryV_VX_AABX<"int_riscv_vqdotus", "PseudoVQDOTUS", VQDOTInfoPairs>; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td index 4abe62f4e874..06309262f1b0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td +++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td @@ -148,6 +148,14 @@ def isNonZeroLoadImmediate CheckNot<CheckImmOperand<2, 0>> ]>>>; +def isLPAD + : TIIPredicate<"isLPAD", + MCReturnStatement<CheckAll<[ + CheckOpcode<[AUIPC]>, + CheckIsRegOperand<0>, + CheckRegOperand<0, X0>, + ]>>>; + def ignoresVXRM : TIIPredicate<"ignoresVXRM", MCOpcodeSwitchStatement< diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index c7b96f5c3d0c..5e1063155ba0 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -81,6 +81,12 @@ static const Intrinsic::ID FixedVssegIntrIds[] = { Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, Intrinsic::riscv_seg8_store_mask}; +static const Intrinsic::ID FixedVsssegIntrIds[] = { + Intrinsic::riscv_sseg2_store_mask, Intrinsic::riscv_sseg3_store_mask, + Intrinsic::riscv_sseg4_store_mask, Intrinsic::riscv_sseg5_store_mask, + Intrinsic::riscv_sseg6_store_mask, Intrinsic::riscv_sseg7_store_mask, + Intrinsic::riscv_sseg8_store_mask}; + static const Intrinsic::ID ScalableVssegIntrIds[] = { Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, @@ -275,7 +281,16 @@ bool RISCVTargetLowering::lowerInterleavedLoad( bool RISCVTargetLowering::lowerInterleavedStore(Instruction 
*Store, Value *LaneMask, ShuffleVectorInst *SVI, - unsigned Factor) const { + unsigned Factor, + const APInt &GapMask) const { + assert(GapMask.getBitWidth() == Factor); + + // We only support cases where the skipped fields are the trailing ones. + // TODO: Lower to strided store if there is only a single active field. + unsigned MaskFactor = GapMask.popcount(); + if (MaskFactor < 2 || !GapMask.isMask()) + return false; + IRBuilder<> Builder(Store); const DataLayout &DL = Store->getDataLayout(); auto Mask = SVI->getShuffleMask(); @@ -287,21 +302,31 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store, Value *Ptr, *VL; Align Alignment; - if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment)) + if (!getMemOperands(MaskFactor, VTy, XLenTy, Store, Ptr, LaneMask, VL, + Alignment)) return false; Type *PtrTy = Ptr->getType(); unsigned AS = PtrTy->getPointerAddressSpace(); - if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL)) return false; - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); + Function *SegStoreFunc; + if (MaskFactor < Factor) + // Strided segmented store. + SegStoreFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), FixedVsssegIntrIds[MaskFactor - 2], + {VTy, PtrTy, XLenTy, XLenTy}); + else + // Normal segmented store. + SegStoreFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), FixedVssegIntrIds[Factor - 2], + {VTy, PtrTy, XLenTy}); SmallVector<Value *, 10> Ops; SmallVector<int, 16> NewShuffleMask; - for (unsigned i = 0; i < Factor; i++) { + for (unsigned i = 0; i < MaskFactor; i++) { // Collect shuffle mask for this lane. 
for (unsigned j = 0; j < VTy->getNumElements(); j++) NewShuffleMask.push_back(Mask[i + Factor * j]); @@ -312,8 +337,14 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store, NewShuffleMask.clear(); } - Ops.append({Ptr, LaneMask, VL}); - Builder.CreateCall(VssegNFunc, Ops); + Ops.push_back(Ptr); + if (MaskFactor < Factor) { + // Insert the stride argument. + unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); + Ops.push_back(ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes)); + } + Ops.append({LaneMask, VL}); + Builder.CreateCall(SegStoreFunc, Ops); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 3b19c3456ad6..d08115b72977 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -356,6 +356,14 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, return false; Worklist.emplace_back(UserMI, Bits); break; + case RISCV::TH_EXT: + case RISCV::TH_EXTU: + unsigned Msb = UserMI->getOperand(2).getImm(); + unsigned Lsb = UserMI->getOperand(3).getImm(); + // Behavior of Msb < Lsb is not well documented. + if (Msb >= Lsb && Bits > Msb) + break; + return false; } } } @@ -409,6 +417,16 @@ static bool isSignExtendingOpW(const MachineInstr &MI, unsigned OpNo) { assert(Log2SEW >= 3 && Log2SEW <= 6 && "Unexpected Log2SEW"); return Log2SEW <= 5; } + case RISCV::TH_EXT: { + unsigned Msb = MI.getOperand(2).getImm(); + unsigned Lsb = MI.getOperand(3).getImm(); + return Msb >= Lsb && (Msb - Lsb + 1) <= 32; + } + case RISCV::TH_EXTU: { + unsigned Msb = MI.getOperand(2).getImm(); + unsigned Lsb = MI.getOperand(3).getImm(); + return Msb >= Lsb && (Msb - Lsb + 1) < 32; + } } return false; @@ -519,9 +537,11 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST, case RISCV::ANDI: case RISCV::ORI: case RISCV::XORI: + case RISCV::SRAI: // |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R. 
// DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1 // Logical operations use a sign extended 12-bit immediate. + // Arithmetic shift right can only increase the number of sign bits. if (!AddRegToWorkList(MI->getOperand(1).getReg())) return false; @@ -556,6 +576,9 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST, case RISCV::PseudoCCAND: case RISCV::PseudoCCOR: case RISCV::PseudoCCXOR: + case RISCV::PseudoCCANDN: + case RISCV::PseudoCCORN: + case RISCV::PseudoCCXNOR: case RISCV::PHI: { // If all incoming values are sign-extended, the output of AND, OR, XOR, // MIN, MAX, or PHI is also sign-extended. @@ -578,6 +601,9 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST, case RISCV::PseudoCCAND: case RISCV::PseudoCCOR: case RISCV::PseudoCCXOR: + case RISCV::PseudoCCANDN: + case RISCV::PseudoCCORN: + case RISCV::PseudoCCXNOR: B = 4; E = 7; break; diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index f89d94f41b69..36d63ed23b92 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -121,7 +121,8 @@ def MIPS_P8700 : RISCVProcessorModel<"mips-p8700", FeatureStdExtZicsr, FeatureVendorXMIPSCMov, FeatureVendorXMIPSLSP, - FeatureVendorXMIPSCBOP], + FeatureVendorXMIPSCBOP, + FeatureVendorXMIPSEXECTL], [TuneMIPSP8700]>; def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32", diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index f3966a55ce7d..40b641680b2c 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -966,7 +966,9 @@ bool RISCVRegisterInfo::getRegAllocationHints( } } - // Add a hint if it would allow auipc/lui+addi(w) fusion. + // Add a hint if it would allow auipc/lui+addi(w) fusion. 
We do this even + // without the fusions explicitly enabled as the impact is rarely negative + // and some cores do implement this fusion. if ((MI.getOpcode() == RISCV::ADDIW || MI.getOpcode() == RISCV::ADDI) && MI.getOperand(1).isReg()) { const MachineBasicBlock &MBB = *MI.getParent(); @@ -974,9 +976,7 @@ bool RISCVRegisterInfo::getRegAllocationHints( // Is the previous instruction a LUI or AUIPC that can be fused? if (I != MBB.begin()) { I = skipDebugInstructionsBackward(std::prev(I), MBB.begin()); - if (((I->getOpcode() == RISCV::LUI && Subtarget.hasLUIADDIFusion()) || - (I->getOpcode() == RISCV::AUIPC && - Subtarget.hasAUIPCADDIFusion())) && + if ((I->getOpcode() == RISCV::LUI || I->getOpcode() == RISCV::AUIPC) && I->getOperand(0).getReg() == MI.getOperand(1).getReg()) { if (OpIdx == 0) tryAddHint(MO, MI.getOperand(1), /*NeedGPRC=*/false); diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index fd57e02c25d0..50e76df56e57 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -186,6 +186,12 @@ public: return HasStdExtZfhmin || HasStdExtZfbfmin; } + bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; } + + bool hasCZEROLike() const { + return HasStdExtZicond || HasVendorXVentanaCondOps; + } + bool hasConditionalMoveFusion() const { // Do we support fusing a branch+mv or branch+c.mv as a conditional move. 
return (hasConditionalCompressedMoveFusion() && hasStdExtZca()) || diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index d70b1d0dc8d5..460bb33f2553 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -652,7 +652,8 @@ void RISCVPassConfig::addPostRegAlloc() { void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM, OptimizationLevel Level) { - LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated)); + if (Level != OptimizationLevel::O0) + LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated)); }); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index c707fb110b10..1ca513214f67 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1566,6 +1566,18 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); } +InstructionCost +RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr, + TTI::TargetCostKind CostKind) const { + // Address computations for vector indexed load/store likely require an offset + // and/or scaling. 
+ if (ST->hasVInstructions() && PtrTy->isVectorTy()) + return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind); + + return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind); +} + InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, @@ -2731,6 +2743,10 @@ unsigned RISCVTTIImpl::getMinTripCountTailFoldingThreshold() const { return RVVMinTripCount; } +bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const { + return ST->enableUnalignedVectorMem(); +} + TTI::AddressingModeKind RISCVTTIImpl::getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const { diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 3236b2a35c85..6bd7d51daff6 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -132,7 +132,7 @@ public: unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override; - bool preferAlternateOpcodeVectorization() const override { return false; } + bool preferAlternateOpcodeVectorization() const override; bool preferEpilogueVectorization() const override { // Epilogue vectorization is usually unprofitable - tail folding or @@ -177,6 +177,10 @@ public: getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override; + InstructionCost + getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, + TTI::TargetCostKind CostKind) const override; + InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 53557049ea33..29526cf5a527 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -178,8 +178,20 @@ 
static unsigned getIntegerExtensionOperandEEW(unsigned Factor, return Log2EEW; } -static std::optional<unsigned> -getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { +#define VSEG_CASES(Prefix, EEW) \ + RISCV::Prefix##SEG2E##EEW##_V: \ + case RISCV::Prefix##SEG3E##EEW##_V: \ + case RISCV::Prefix##SEG4E##EEW##_V: \ + case RISCV::Prefix##SEG5E##EEW##_V: \ + case RISCV::Prefix##SEG6E##EEW##_V: \ + case RISCV::Prefix##SEG7E##EEW##_V: \ + case RISCV::Prefix##SEG8E##EEW##_V +#define VSSEG_CASES(EEW) VSEG_CASES(VS, EEW) +#define VSSSEG_CASES(EEW) VSEG_CASES(VSS, EEW) +#define VSUXSEG_CASES(EEW) VSEG_CASES(VSUX, I##EEW) +#define VSOXSEG_CASES(EEW) VSEG_CASES(VSOX, I##EEW) + +static std::optional<unsigned> getOperandLog2EEW(const MachineOperand &MO) { const MachineInstr &MI = *MO.getParent(); const MCInstrDesc &Desc = MI.getDesc(); const RISCVVPseudosTable::PseudoInfo *RVV = @@ -225,21 +237,29 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VSE8_V: case RISCV::VLSE8_V: case RISCV::VSSE8_V: + case VSSEG_CASES(8): + case VSSSEG_CASES(8): return 3; case RISCV::VLE16_V: case RISCV::VSE16_V: case RISCV::VLSE16_V: case RISCV::VSSE16_V: + case VSSEG_CASES(16): + case VSSSEG_CASES(16): return 4; case RISCV::VLE32_V: case RISCV::VSE32_V: case RISCV::VLSE32_V: case RISCV::VSSE32_V: + case VSSEG_CASES(32): + case VSSSEG_CASES(32): return 5; case RISCV::VLE64_V: case RISCV::VSE64_V: case RISCV::VLSE64_V: case RISCV::VSSE64_V: + case VSSEG_CASES(64): + case VSSSEG_CASES(64): return 6; // Vector Indexed Instructions @@ -248,7 +268,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VLUXEI8_V: case RISCV::VLOXEI8_V: case RISCV::VSUXEI8_V: - case RISCV::VSOXEI8_V: { + case RISCV::VSOXEI8_V: + case VSUXSEG_CASES(8): + case VSOXSEG_CASES(8): { if (MO.getOperandNo() == 0) return MILog2SEW; return 3; @@ -256,7 +278,9 @@ getOperandLog2EEW(const MachineOperand &MO, const 
MachineRegisterInfo *MRI) { case RISCV::VLUXEI16_V: case RISCV::VLOXEI16_V: case RISCV::VSUXEI16_V: - case RISCV::VSOXEI16_V: { + case RISCV::VSOXEI16_V: + case VSUXSEG_CASES(16): + case VSOXSEG_CASES(16): { if (MO.getOperandNo() == 0) return MILog2SEW; return 4; @@ -264,7 +288,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VLUXEI32_V: case RISCV::VLOXEI32_V: case RISCV::VSUXEI32_V: - case RISCV::VSOXEI32_V: { + case RISCV::VSOXEI32_V: + case VSUXSEG_CASES(32): + case VSOXSEG_CASES(32): { if (MO.getOperandNo() == 0) return MILog2SEW; return 5; @@ -272,7 +298,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VLUXEI64_V: case RISCV::VLOXEI64_V: case RISCV::VSUXEI64_V: - case RISCV::VSOXEI64_V: { + case RISCV::VSOXEI64_V: + case VSUXSEG_CASES(64): + case VSOXSEG_CASES(64): { if (MO.getOperandNo() == 0) return MILog2SEW; return 6; @@ -422,9 +450,6 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VRGATHER_VI: case RISCV::VRGATHER_VV: case RISCV::VRGATHER_VX: - // Vector Compress Instruction - // EEW=SEW. - case RISCV::VCOMPRESS_VM: // Vector Element Index Instruction case RISCV::VID_V: // Vector Single-Width Floating-Point Add/Subtract Instructions @@ -674,6 +699,12 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { return MILog2SEW; } + // Vector Compress Instruction + // EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled + // before this switch. + case RISCV::VCOMPRESS_VM: + return MO.getOperandNo() == 3 ? 0 : MILog2SEW; + // Vector Iota Instruction // EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled // before this switch. 
@@ -778,14 +809,13 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { } } -static std::optional<OperandInfo> -getOperandInfo(const MachineOperand &MO, const MachineRegisterInfo *MRI) { +static std::optional<OperandInfo> getOperandInfo(const MachineOperand &MO) { const MachineInstr &MI = *MO.getParent(); const RISCVVPseudosTable::PseudoInfo *RVV = RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); assert(RVV && "Could not find MI in PseudoTable"); - std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO, MRI); + std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO); if (!Log2EEW) return std::nullopt; @@ -900,13 +930,6 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VSEXT_VF4: case RISCV::VZEXT_VF8: case RISCV::VSEXT_VF8: - // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions - // FIXME: Add support - case RISCV::VMADC_VV: - case RISCV::VMADC_VI: - case RISCV::VMADC_VX: - case RISCV::VMSBC_VV: - case RISCV::VMSBC_VX: // Vector Narrowing Integer Right Shift Instructions case RISCV::VNSRL_WX: case RISCV::VNSRL_WI: @@ -993,6 +1016,11 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VSBC_VXM: case RISCV::VMSBC_VVM: case RISCV::VMSBC_VXM: + case RISCV::VMADC_VV: + case RISCV::VMADC_VI: + case RISCV::VMADC_VX: + case RISCV::VMSBC_VV: + case RISCV::VMSBC_VX: // Vector Widening Integer Multiply-Add Instructions case RISCV::VWMACCU_VV: case RISCV::VWMACCU_VX: @@ -1001,10 +1029,7 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VWMACCSU_VV: case RISCV::VWMACCSU_VX: case RISCV::VWMACCUS_VX: - // Vector Integer Merge Instructions - // FIXME: Add support // Vector Integer Move Instructions - // FIXME: Add support case RISCV::VMV_V_I: case RISCV::VMV_V_X: case RISCV::VMV_V_V: @@ -1306,7 +1331,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { // TODO: Use a better approach than a white-list, such as adding // properties to instructions using 
something like TSFlags. if (!isSupportedInstr(MI)) { - LLVM_DEBUG(dbgs() << "Not a candidate due to unsupported instruction\n"); + LLVM_DEBUG(dbgs() << "Not a candidate due to unsupported instruction: " + << MI); return false; } @@ -1328,14 +1354,14 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { const MCInstrDesc &Desc = UserMI.getDesc(); if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) { - LLVM_DEBUG(dbgs() << " Abort due to lack of VL, assume that" + LLVM_DEBUG(dbgs() << " Abort due to lack of VL, assume that" " use VLMAX\n"); return std::nullopt; } if (RISCVII::readsPastVL( TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) { - LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); + LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); return std::nullopt; } @@ -1352,7 +1378,7 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { RISCVII::isFirstDefTiedToFirstUse(UserMI.getDesc())); auto DemandedVL = DemandedVLs.lookup(&UserMI); if (!DemandedVL || !RISCV::isVLKnownLE(*DemandedVL, VLOp)) { - LLVM_DEBUG(dbgs() << " Abort because user is passthru in " + LLVM_DEBUG(dbgs() << " Abort because user is passthru in " "instruction with demanded tail\n"); return std::nullopt; } @@ -1376,6 +1402,54 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { return VLOp; } +/// Return true if MI is an instruction used for assembling registers +/// for segmented store instructions, namely, RISCVISD::TUPLE_INSERT. +/// Currently it's lowered to INSERT_SUBREG. 
+static bool isTupleInsertInstr(const MachineInstr &MI) { + if (!MI.isInsertSubreg()) + return false; + + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + const TargetRegisterClass *DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + if (!RISCVRI::isVRegClass(DstRC->TSFlags)) + return false; + unsigned NF = RISCVRI::getNF(DstRC->TSFlags); + if (NF < 2) + return false; + + // Check whether INSERT_SUBREG has the correct subreg index for tuple inserts. + auto VLMul = RISCVRI::getLMul(DstRC->TSFlags); + unsigned SubRegIdx = MI.getOperand(3).getImm(); + [[maybe_unused]] auto [LMul, IsFractional] = RISCVVType::decodeVLMUL(VLMul); + assert(!IsFractional && "unexpected LMUL for tuple register classes"); + return TRI->getSubRegIdxSize(SubRegIdx) == RISCV::RVVBitsPerBlock * LMul; +} + +static bool isSegmentedStoreInstr(const MachineInstr &MI) { + switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { + case VSSEG_CASES(8): + case VSSSEG_CASES(8): + case VSUXSEG_CASES(8): + case VSOXSEG_CASES(8): + case VSSEG_CASES(16): + case VSSSEG_CASES(16): + case VSUXSEG_CASES(16): + case VSOXSEG_CASES(16): + case VSSEG_CASES(32): + case VSSSEG_CASES(32): + case VSUXSEG_CASES(32): + case VSOXSEG_CASES(32): + case VSSEG_CASES(64): + case VSSSEG_CASES(64): + case VSUXSEG_CASES(64): + case VSOXSEG_CASES(64): + return true; + default: + return false; + } +} + std::optional<MachineOperand> RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { std::optional<MachineOperand> CommonVL; @@ -1396,6 +1470,23 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { continue; } + if (isTupleInsertInstr(UserMI)) { + LLVM_DEBUG(dbgs().indent(4) << "Peeking through uses of INSERT_SUBREG\n"); + for (MachineOperand &UseOp : + MRI->use_operands(UserMI.getOperand(0).getReg())) { + const MachineInstr &CandidateMI = *UseOp.getParent(); + // We should not propagate the VL if the user is not a segmented store + // or another 
INSERT_SUBREG, since VL just works differently + // between segmented operations (per-field) v.s. other RVV ops (on the + // whole register group). + if (!isTupleInsertInstr(CandidateMI) && + !isSegmentedStoreInstr(CandidateMI)) + return std::nullopt; + Worklist.insert(&UseOp); + } + continue; + } + if (UserMI.isPHI()) { // Don't follow PHI cycles if (!PHISeen.insert(&UserMI).second) @@ -1425,9 +1516,8 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { return std::nullopt; } - std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp, MRI); - std::optional<OperandInfo> ProducerInfo = - getOperandInfo(MI.getOperand(0), MRI); + std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp); + std::optional<OperandInfo> ProducerInfo = getOperandInfo(MI.getOperand(0)); if (!ConsumerInfo || !ProducerInfo) { LLVM_DEBUG(dbgs() << " Abort due to unknown operand information.\n"); LLVM_DEBUG(dbgs() << " ConsumerInfo is: " << ConsumerInfo << "\n"); @@ -1449,7 +1539,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { } bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { - LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n"); + LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI); unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc()); MachineOperand &VLOp = MI.getOperand(VLOpNum); @@ -1468,14 +1558,23 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) && "Expected VL to be an Imm or virtual Reg"); + // If the VL is defined by a vleff that doesn't dominate MI, try using the + // vleff's AVL. It will be greater than or equal to the output VL. 
+ if (CommonVL->isReg()) { + const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); + if (RISCVInstrInfo::isFaultOnlyFirstLoad(*VLMI) && + !MDT->dominates(VLMI, &MI)) + CommonVL = VLMI->getOperand(RISCVII::getVLOpNum(VLMI->getDesc())); + } + if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) { - LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n"); + LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n"); return false; } if (CommonVL->isIdenticalTo(VLOp)) { LLVM_DEBUG( - dbgs() << " Abort due to CommonVL == VLOp, no point in reducing.\n"); + dbgs() << " Abort due to CommonVL == VLOp, no point in reducing.\n"); return false; } @@ -1486,8 +1585,10 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { return true; } const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); - if (!MDT->dominates(VLMI, &MI)) + if (!MDT->dominates(VLMI, &MI)) { + LLVM_DEBUG(dbgs() << " Abort due to VL not dominating.\n"); return false; + } LLVM_DEBUG( dbgs() << " Reduce VL from " << VLOp << " to " << printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo()) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 050de3d58a2f..62651185137c 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -745,12 +745,24 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const { if (PassthruReg && !isKnownSameDefs(PassthruReg, FalseReg)) return false; + std::optional<std::pair<unsigned, unsigned>> NeedsCommute; + // If True has a passthru operand then it needs to be the same as vmerge's // False, since False will be used for the result's passthru operand. 
Register TruePassthru = True.getOperand(True.getNumExplicitDefs()).getReg(); if (RISCVII::isFirstDefTiedToFirstUse(True.getDesc()) && TruePassthru && - !isKnownSameDefs(TruePassthru, FalseReg)) - return false; + !isKnownSameDefs(TruePassthru, FalseReg)) { + // If True's passthru != False, check if it uses False in another operand + // and try to commute it. + int OtherIdx = True.findRegisterUseOperandIdx(FalseReg, TRI); + if (OtherIdx == -1) + return false; + unsigned OpIdx1 = OtherIdx; + unsigned OpIdx2 = True.getNumExplicitDefs(); + if (!TII->findCommutedOpIndices(True, OpIdx1, OpIdx2)) + return false; + NeedsCommute = {OpIdx1, OpIdx2}; + } // Make sure it doesn't raise any observable fp exceptions, since changing the // active elements will affect how fflags is set. @@ -796,6 +808,14 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const { if (!ensureDominates(MaskOp, True)) return false; + if (NeedsCommute) { + auto [OpIdx1, OpIdx2] = *NeedsCommute; + [[maybe_unused]] bool Commuted = + TII->commuteInstruction(True, /*NewMI=*/false, OpIdx1, OpIdx2); + assert(Commuted && "Failed to commute True?"); + Info = RISCV::lookupMaskedIntrinsicByUnmasked(True.getOpcode()); + } + True.setDesc(TII->get(Info->MaskedPseudo)); // Insert the mask operand. 
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index f658b67a4c2a..45e88fc94144 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -12,6 +12,7 @@ #include "SPIRVInstrInfo.h" #include "SPIRV.h" +#include "SPIRVSubtarget.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -22,7 +23,8 @@ using namespace llvm; -SPIRVInstrInfo::SPIRVInstrInfo() : SPIRVGenInstrInfo() {} +SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI) + : SPIRVGenInstrInfo(STI) {} bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h index d58dddcd8da2..72d2243fba62 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h @@ -20,12 +20,13 @@ #include "SPIRVGenInstrInfo.inc" namespace llvm { +class SPIRVSubtarget; class SPIRVInstrInfo : public SPIRVGenInstrInfo { const SPIRVRegisterInfo RI; public: - SPIRVInstrInfo(); + explicit SPIRVInstrInfo(const SPIRVSubtarget &STI); const SPIRVRegisterInfo &getRegisterInfo() const { return RI; } bool isHeaderInstr(const MachineInstr &MI) const; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index f0b938d681db..8d10cd0ffb3d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -637,8 +637,8 @@ let isReturn = 1, hasDelaySlot = 0, isBarrier = 0, isTerminator = 1, isNotDuplic def OpReturnValue: Op<254, (outs), (ins ID:$ret), "OpReturnValue $ret">; def OpUnreachable: SimpleOp<"OpUnreachable", 255>; } -def OpLifetimeStart: Op<256, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStart $ptr, $sz">; -def OpLifetimeStop: Op<257, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStop $ptr, $sz">; +def OpLifetimeStart: 
Op<256, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStart $ptr $sz">; +def OpLifetimeStop: Op<257, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStop $ptr $sz">; def OpDemoteToHelperInvocation: SimpleOp<"OpDemoteToHelperInvocation", 5380>; // 3.42.18 Atomic Instructions diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 98c7709acf93..3ad5528fab06 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -204,6 +204,9 @@ private: bool selectIntegerDotExpansion(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectOpIsInf(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + template <bool Signed> bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; @@ -2042,6 +2045,17 @@ bool SPIRVInstructionSelector::selectIntegerDotExpansion( return Result; } +bool SPIRVInstructionSelector::selectOpIsInf(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIsInf)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + template <bool Signed> bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType, @@ -3183,6 +3197,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, GL::FaceForward); case Intrinsic::spv_frac: return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); + case Intrinsic::spv_isinf: + return selectOpIsInf(ResVReg, ResType, I); case Intrinsic::spv_normalize: return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize); case Intrinsic::spv_refract: @@ -4276,9 +4292,11 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition( uint32_t Binding 
= foldImm(HandleDef.getOperand(3), MRI); uint32_t ArraySize = foldImm(HandleDef.getOperand(4), MRI); Register IndexReg = HandleDef.getOperand(5).getReg(); - bool IsNonUniform = ArraySize > 1 && foldImm(HandleDef.getOperand(6), MRI); + // FIXME: The IsNonUniform flag needs to be set based on resource analysis. + // https://github.com/llvm/llvm-project/issues/155701 + bool IsNonUniform = false; std::string Name = - getStringValueFromReg(HandleDef.getOperand(7).getReg(), *MRI); + getStringValueFromReg(HandleDef.getOperand(6).getReg(), *MRI); bool IsStructuredBuffer = ResType->getOpcode() == SPIRV::OpTypePointer; MachineIRBuilder MIRBuilder(HandleDef); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 8039cf0c432f..b7e371d19086 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -124,7 +124,7 @@ getSymbolicOperandRequirements(SPIRV::OperandCategory::OperandCategory Category, })) { return {true, {}, - ReqExts, + std::move(ReqExts), VersionTuple(), VersionTuple()}; // TODO: add versions to extensions. 
} diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 55c9c4c5380b..1811492bf217 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -43,7 +43,7 @@ using Edge = std::pair<BasicBlock *, BasicBlock *>; static void partialOrderVisit(BasicBlock &Start, std::function<bool(BasicBlock *)> Op) { PartialOrderingVisitor V(*Start.getParent()); - V.partialOrderVisit(Start, Op); + V.partialOrderVisit(Start, std::move(Op)); } // Returns the exact convergence region in the tree defined by `Node` for which diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index 690493fb426b..5b746a1389af 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -53,9 +53,9 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const SPIRVTargetMachine &TM) : SPIRVGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS), - PointerSize(TM.getPointerSizeInBits(/* AS= */ 0)), InstrInfo(), - FrameLowering(initSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TargetTriple(TT) { + PointerSize(TM.getPointerSizeInBits(/* AS= */ 0)), + InstrInfo(initSubtargetDependencies(CPU, FS)), FrameLowering(*this), + TLInfo(TM, *this), TargetTriple(TT) { switch (TT.getSubArch()) { case Triple::SPIRVSubArch_v10: SPIRVVersion = VersionTuple(1, 0); diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index f1cd9b1ab07c..c3d60f3689e1 100644 --- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -266,16 +266,47 @@ DecodeCoprocPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, return MCDisassembler::Success; } -static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn, uint64_t Address, - const MCDisassembler *Decoder); 
-static DecodeStatus DecodeSIMM5(MCInst &Inst, unsigned insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn, uint64_t Address, - const MCDisassembler *Decoder); +static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, + uint64_t Address, uint64_t Offset, + uint64_t Width, MCInst &MI, + const MCDisassembler *Decoder) { + return Decoder->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, + Width, /*InstSize=*/4); +} + +static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder) { + int64_t CallOffset = SignExtend64(fieldFromInstruction(insn, 0, 30), 30) * 4; + if (!tryAddingSymbolicOperand(Address + CallOffset, false, Address, 0, 30, MI, + Decoder)) + MI.addOperand(MCOperand::createImm(CallOffset)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeSIMM5(MCInst &MI, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder) { + assert(isUInt<5>(insn)); + MI.addOperand(MCOperand::createImm(SignExtend64<5>(insn))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, uint64_t Address, + const MCDisassembler *Decoder) { + assert(isUInt<13>(insn)); + MI.addOperand(MCOperand::createImm(SignExtend64<13>(insn))); + return MCDisassembler::Success; +} + template <unsigned N> -constexpr static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal, - uint64_t Address, - const MCDisassembler *Decoder); +static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal, uint64_t Address, + const MCDisassembler *Decoder) { + int64_t BranchOffset = SignExtend64(ImmVal, N) * 4; + if (!tryAddingSymbolicOperand(Address + BranchOffset, true, Address, 0, N, MI, + Decoder)) + MI.addOperand(MCOperand::createImm(BranchOffset)); + return MCDisassembler::Success; +} + #include "SparcGenDisassemblerTables.inc" /// Read four bytes from the ArrayRef and return 32 bit word. 
@@ -321,45 +352,3 @@ DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, return Result; } - -static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, - uint64_t Address, uint64_t Offset, - uint64_t Width, MCInst &MI, - const MCDisassembler *Decoder) { - return Decoder->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, - Width, /*InstSize=*/4); -} - -static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, uint64_t Address, - const MCDisassembler *Decoder) { - int64_t CallOffset = SignExtend64(fieldFromInstruction(insn, 0, 30), 30) * 4; - if (!tryAddingSymbolicOperand(Address + CallOffset, false, Address, 0, 30, MI, - Decoder)) - MI.addOperand(MCOperand::createImm(CallOffset)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeSIMM5(MCInst &MI, unsigned insn, uint64_t Address, - const MCDisassembler *Decoder) { - assert(isUInt<5>(insn)); - MI.addOperand(MCOperand::createImm(SignExtend64<5>(insn))); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, uint64_t Address, - const MCDisassembler *Decoder) { - assert(isUInt<13>(insn)); - MI.addOperand(MCOperand::createImm(SignExtend64<13>(insn))); - return MCDisassembler::Success; -} - -template <unsigned N> -constexpr static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal, - uint64_t Address, - const MCDisassembler *Decoder) { - int64_t BranchOffset = SignExtend64(ImmVal, N) * 4; - if (!tryAddingSymbolicOperand(Address + BranchOffset, true, Address, 0, N, MI, - Decoder)) - MI.addOperand(MCOperand::createImm(BranchOffset)); - return MCDisassembler::Success; -} diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index fa07578e512b..9fa60ee5229b 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -81,8 +81,16 @@ static MCRegisterInfo 
*createSparcMCRegisterInfo(const Triple &TT) { static MCSubtargetInfo * createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { if (CPU.empty()) - CPU = (TT.getArch() == Triple::sparcv9) ? "v9" : "v8"; - return createSparcMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); + CPU = TT.getArch() == Triple::sparcv9 ? "v9" : "v8"; + + MCSubtargetInfo *STI = + createSparcMCSubtargetInfoImpl(TT, CPU, /*TuneCPU=*/CPU, FS); + if (TT.isSPARC64() && !STI->hasFeature(Sparc::Feature64Bit)) { + FeatureBitset Features = STI->getFeatureBits(); + STI->setFeatureBits(Features.set(Sparc::Feature64Bit)); + } + + return STI; } static MCTargetStreamer * diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index a7b0538d683b..b523366e6ada 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -28,6 +28,7 @@ class MCRegisterInfo; class MCSubtargetInfo; class MCTargetOptions; class Target; +class Triple; MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index cee671e34951..7137e5fbff4f 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -34,6 +34,9 @@ def FeatureNoFMULS def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true", "Enable SPARC-V9 instructions">; +def Feature64Bit : SubtargetFeature<"64bit", "Is64Bit", "true", + "Enable 64-bit mode", [FeatureV9]>; + def FeatureV8Plus : SubtargetFeature<"v8plus", "IsV8Plus", "true", "Enable V8+ mode, allowing use of 64-bit V9 instructions in 32-bit code">; diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index d01218f573dc..2737cca62cd2 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1907,37 +1907,37 @@ 
SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, // Setup Runtime library names. if (Subtarget->is64Bit() && !Subtarget->useSoftFloat()) { - setLibcallImpl(RTLIB::ADD_F128, RTLIB::_Qp_add); - setLibcallImpl(RTLIB::SUB_F128, RTLIB::_Qp_sub); - setLibcallImpl(RTLIB::MUL_F128, RTLIB::_Qp_mul); - setLibcallImpl(RTLIB::DIV_F128, RTLIB::_Qp_div); - setLibcallImpl(RTLIB::SQRT_F128, RTLIB::_Qp_sqrt); - setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::_Qp_qtoi); - setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::_Qp_qtoui); - setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::_Qp_itoq); - setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::_Qp_uitoq); - setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::_Qp_qtox); - setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::_Qp_qtoux); - setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::_Qp_xtoq); - setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::_Qp_uxtoq); - setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::_Qp_stoq); - setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::_Qp_dtoq); - setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::_Qp_qtos); - setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::_Qp_qtod); + setLibcallImpl(RTLIB::ADD_F128, RTLIB::impl__Qp_add); + setLibcallImpl(RTLIB::SUB_F128, RTLIB::impl__Qp_sub); + setLibcallImpl(RTLIB::MUL_F128, RTLIB::impl__Qp_mul); + setLibcallImpl(RTLIB::DIV_F128, RTLIB::impl__Qp_div); + setLibcallImpl(RTLIB::SQRT_F128, RTLIB::impl__Qp_sqrt); + setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::impl__Qp_qtoi); + setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::impl__Qp_qtoui); + setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::impl__Qp_itoq); + setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::impl__Qp_uitoq); + setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::impl__Qp_qtox); + setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::impl__Qp_qtoux); + setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::impl__Qp_xtoq); + setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::impl__Qp_uxtoq); + setLibcallImpl(RTLIB::FPEXT_F32_F128, 
RTLIB::impl__Qp_stoq); + setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::impl__Qp_dtoq); + setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::impl__Qp_qtos); + setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::impl__Qp_qtod); } else if (!Subtarget->useSoftFloat()) { - setLibcallImpl(RTLIB::ADD_F128, RTLIB::_Q_add); - setLibcallImpl(RTLIB::SUB_F128, RTLIB::_Q_sub); - setLibcallImpl(RTLIB::MUL_F128, RTLIB::_Q_mul); - setLibcallImpl(RTLIB::DIV_F128, RTLIB::_Q_div); - setLibcallImpl(RTLIB::SQRT_F128, RTLIB::_Q_sqrt); - setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::_Q_qtoi); - setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::_Q_qtou); - setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::_Q_itoq); - setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::_Q_utoq); - setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::_Q_stoq); - setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::_Q_dtoq); - setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::_Q_qtos); - setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::_Q_qtod); + setLibcallImpl(RTLIB::ADD_F128, RTLIB::impl__Q_add); + setLibcallImpl(RTLIB::SUB_F128, RTLIB::impl__Q_sub); + setLibcallImpl(RTLIB::MUL_F128, RTLIB::impl__Q_mul); + setLibcallImpl(RTLIB::DIV_F128, RTLIB::impl__Q_div); + setLibcallImpl(RTLIB::SQRT_F128, RTLIB::impl__Q_sqrt); + setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::impl__Q_qtoi); + setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::impl__Q_qtou); + setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::impl__Q_itoq); + setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::impl__Q_utoq); + setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::impl__Q_stoq); + setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::impl__Q_dtoq); + setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::impl__Q_qtos); + setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::impl__Q_qtod); } } @@ -3510,7 +3510,7 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N, // Override to enable LOAD_STACK_GUARD lowering on Linux. 
bool SparcTargetLowering::useLoadStackGuardNode(const Module &M) const { - if (!Subtarget->isTargetLinux()) + if (!Subtarget->getTargetTriple().isOSLinux()) return TargetLowering::useLoadStackGuardNode(M); return true; } diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index a7fbbd4044c1..cd0f64991298 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -37,8 +37,8 @@ static cl::opt<unsigned> // Pin the vtable to this file. void SparcInstrInfo::anchor() {} -SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST) - : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(), +SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST) + : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(), Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct @@ -643,7 +643,7 @@ unsigned SparcInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { switch (MI.getOpcode()) { case TargetOpcode::LOAD_STACK_GUARD: { - assert(Subtarget.isTargetLinux() && + assert(Subtarget.getTargetTriple().isOSLinux() && "Only Linux target is expected to contain LOAD_STACK_GUARD"); // offsetof(tcbhead_t, stack_guard) from sysdeps/sparc/nptl/tls.h in glibc. const int64_t Offset = Subtarget.is64Bit() ? 0x28 : 0x14; diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h index 1feb12ba2fda..01d020473494 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -40,7 +40,7 @@ class SparcInstrInfo : public SparcGenInstrInfo { const SparcSubtarget& Subtarget; virtual void anchor(); public: - explicit SparcInstrInfo(SparcSubtarget &ST); + explicit SparcInstrInfo(const SparcSubtarget &ST); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. 
As /// such, whenever a client has an instance of instruction info, it should diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 1a32eafb0e83..53972d6c105a 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -1785,22 +1785,22 @@ let Predicates = [HasV9], Uses = [ASR3], Constraints = "$swap = $rd" in // as inline assembler-supported instructions. let Predicates = [HasUMAC_SMAC], Defs = [Y, ASR18], Uses = [Y, ASR18] in { def SMACrr : F3_1<2, 0b111111, - (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18), + (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2), "smac $rs1, $rs2, $rd", [], IIC_smac_umac>; def SMACri : F3_2<2, 0b111111, - (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18), + (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13), "smac $rs1, $simm13, $rd", [], IIC_smac_umac>; def UMACrr : F3_1<2, 0b111110, - (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18), + (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2), "umac $rs1, $rs2, $rd", [], IIC_smac_umac>; def UMACri : F3_2<2, 0b111110, - (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18), + (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13), "umac $rs1, $simm13, $rd", [], IIC_smac_umac>; } diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp index e42df1d68613..005930834a0c 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -28,10 +28,11 @@ void SparcSubtarget::anchor() { } SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies( StringRef CPU, StringRef TuneCPU, StringRef FS) { + const Triple &TT = getTargetTriple(); // Determine default and user specified characteristics std::string CPUName = std::string(CPU); if (CPUName.empty()) - CPUName = (Is64Bit) ? "v9" : "v8"; + CPUName = TT.isSPARC64() ? 
"v9" : "v8"; if (TuneCPU.empty()) TuneCPU = CPUName; @@ -39,6 +40,12 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies( // Parse features string. ParseSubtargetFeatures(CPUName, TuneCPU, FS); + if (!Is64Bit && TT.isSPARC64()) { + FeatureBitset Features = getFeatureBits(); + setFeatureBits(Features.set(Sparc::Feature64Bit)); + Is64Bit = true; + } + // Popc is a v9-only instruction. if (!IsV9) UsePopc = false; @@ -47,11 +54,9 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies( } SparcSubtarget::SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU, - const StringRef &FS, const TargetMachine &TM, - bool is64Bit) + const StringRef &FS, const TargetMachine &TM) : SparcGenSubtargetInfo(TM.getTargetTriple(), CPU, TuneCPU, FS), ReserveRegister(TM.getMCRegisterInfo()->getNumRegs()), - TargetTriple(TM.getTargetTriple()), Is64Bit(is64Bit), InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)), TLInfo(TM, *this), FrameLowering(*this) { TSInfo = std::make_unique<SparcSelectionDAGInfo>(); diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h index 5785c199f44b..b1decca0a4f0 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -34,11 +34,8 @@ class SparcSubtarget : public SparcGenSubtargetInfo { // register. 
BitVector ReserveRegister; - Triple TargetTriple; virtual void anchor(); - bool Is64Bit; - #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ bool ATTRIBUTE = DEFAULT; #include "SparcGenSubtargetInfo.inc" @@ -50,7 +47,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo { public: SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU, - const StringRef &FS, const TargetMachine &TM, bool is64bit); + const StringRef &FS, const TargetMachine &TM); ~SparcSubtarget() override; @@ -80,8 +77,6 @@ public: StringRef TuneCPU, StringRef FS); - bool is64Bit() const { return Is64Bit; } - /// The 64-bit ABI uses biased stack and frame pointers, so the stack frame /// of the current function is the area from [%sp+BIAS] to [%fp+BIAS]. int64_t getStackPointerBias() const { @@ -96,8 +91,6 @@ public: /// returns adjusted framesize which includes space for register window /// spills and arguments. int getAdjustedFrameSize(int stackSize) const; - - bool isTargetLinux() const { return TargetTriple.isOSLinux(); } }; } // end namespace llvm diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp index 52076a6b4dd2..754c8f63ca4e 100644 --- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp @@ -38,7 +38,9 @@ static cl::opt<bool> BranchRelaxation("sparc-enable-branch-relax", cl::Hidden, cl::init(true), cl::desc("Relax out of range conditional branches")); -static std::string computeDataLayout(const Triple &T, bool is64Bit) { +static std::string computeDataLayout(const Triple &T) { + const bool is64Bit = T.isSPARC64(); + // Sparc is typically big endian, but some are little. std::string Ret = T.getArch() == Triple::sparcel ? 
"e" : "E"; Ret += "-m:e"; @@ -107,15 +109,14 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, const Triple &TT, const TargetOptions &Options, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, - CodeGenOptLevel OL, bool JIT, - bool is64bit) + CodeGenOptLevel OL, bool JIT) : CodeGenTargetMachineImpl( - T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options, + T, computeDataLayout(TT), TT, CPU, FS, Options, getEffectiveRelocModel(RM), - getEffectiveSparcCodeModel(CM, getEffectiveRelocModel(RM), is64bit, - JIT), + getEffectiveSparcCodeModel(CM, getEffectiveRelocModel(RM), + TT.isSPARC64(), JIT), OL), - TLOF(std::make_unique<SparcELFTargetObjectFile>()), is64Bit(is64bit) { + TLOF(std::make_unique<SparcELFTargetObjectFile>()) { initAsmInfo(); } @@ -148,8 +149,7 @@ SparcTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = std::make_unique<SparcSubtarget>(CPU, TuneCPU, FS, *this, - this->is64Bit); + I = std::make_unique<SparcSubtarget>(CPU, TuneCPU, FS, *this); } return I.get(); } @@ -212,7 +212,7 @@ SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, const Triple &TT, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) - : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, false) {} + : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {} void SparcV9TargetMachine::anchor() { } @@ -222,7 +222,7 @@ SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, const Triple &TT, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) - : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, true) {} + : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {} void SparcelTargetMachine::anchor() {} @@ -232,4 +232,4 @@ SparcelTargetMachine::SparcelTargetMachine(const Target 
&T, const Triple &TT, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) - : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, false) {} + : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {} diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.h b/llvm/lib/Target/Sparc/SparcTargetMachine.h index 9a226a47671b..e7d038c5779d 100644 --- a/llvm/lib/Target/Sparc/SparcTargetMachine.h +++ b/llvm/lib/Target/Sparc/SparcTargetMachine.h @@ -23,7 +23,6 @@ namespace llvm { class SparcTargetMachine : public CodeGenTargetMachineImpl { std::unique_ptr<TargetLoweringObjectFile> TLOF; - bool is64Bit; mutable StringMap<std::unique_ptr<SparcSubtarget>> SubtargetMap; public: @@ -31,7 +30,7 @@ public: StringRef FS, const TargetOptions &Options, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, - bool JIT, bool is64bit); + bool JIT); ~SparcTargetMachine() override; const SparcSubtarget *getSubtargetImpl(const Function &F) const override; diff --git a/llvm/lib/Target/SystemZ/SystemZFeatures.td b/llvm/lib/Target/SystemZ/SystemZFeatures.td index 2c48da8320fb..4ccc3d3079fc 100644 --- a/llvm/lib/Target/SystemZ/SystemZFeatures.td +++ b/llvm/lib/Target/SystemZ/SystemZFeatures.td @@ -196,7 +196,7 @@ def FeatureVector : SystemZFeature< >; def FeatureNoVector : SystemZMissingFeature<"Vector">; -def NoVecHwMode : HwMode<"-vector", [FeatureNoVector]>; +def NoVecHwMode : HwMode<[FeatureNoVector]>; def Arch11NewFeatures : SystemZFeatureList<[ FeatureLoadAndZeroRightmostByte, @@ -426,4 +426,3 @@ def Arch9UnsupportedFeatures : SystemZFeatureAdd<Arch10UnsupportedFeatures.List, Arch10NewFeatures.List>; def Arch8UnsupportedFeatures : SystemZFeatureAdd<Arch9UnsupportedFeatures.List, Arch9NewFeatures.List>; - diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index c73dc3021eb4..3b7d11a318dc 100644 --- 
a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -287,6 +287,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // Additional instructions available with z17. if (Subtarget.hasVectorEnhancements3()) { setOperationAction(ISD::ABS, MVT::i128, Legal); + + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, + MVT::i128, Legal); } } @@ -492,6 +495,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands // and inverting the result as necessary. setOperationAction(ISD::SETCC, VT, Custom); + + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT, + Legal); } } @@ -6719,6 +6725,14 @@ SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const { if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) { SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0)); SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1)); + if (ShiftAmt > 120) { + // For N in 121..128, fshl N == fshr (128 - N), and for 1 <= N < 8 + // SHR_DOUBLE_BIT emits fewer instructions. + SDValue Val = + DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Op0, Op1, + DAG.getTargetConstant(128 - ShiftAmt, DL, MVT::i32)); + return DAG.getBitcast(MVT::i128, Val); + } SmallVector<int, 16> Mask(16); for (unsigned Elt = 0; Elt < 16; Elt++) Mask[Elt] = (ShiftAmt >> 3) + Elt; @@ -6742,13 +6756,21 @@ SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const { // i128 FSHR with a constant amount that is a multiple of 8 can be // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2 // facility, FSHR with a constant amount less than 8 can be implemented - // via SHL_DOUBLE_BIT, and FSHR with other constant amounts by a + // via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a // combination of the two. 
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) { uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127; if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) { SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0)); SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1)); + if (ShiftAmt > 120) { + // For N in 121..128, fshr N == fshl (128 - N), and for 1 <= N < 8 + // SHL_DOUBLE_BIT emits fewer instructions. + SDValue Val = + DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Op0, Op1, + DAG.getTargetConstant(128 - ShiftAmt, DL, MVT::i32)); + return DAG.getBitcast(MVT::i128, Val); + } SmallVector<int, 16> Mask(16); for (unsigned Elt = 0; Elt < 16; Elt++) Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index d0a549518cc4..82415f412509 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2646,28 +2646,24 @@ class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls> : InstRIb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget16:$RI2), mnemonic#"\t$R1, $RI2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BranchUnaryRIL<string mnemonic, bits<12> opcode, RegisterOperand cls> : InstRILb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget32:$RI2), mnemonic#"\t$R1, $RI2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BranchUnaryRR<string mnemonic, bits<8> opcode, RegisterOperand cls> : InstRR<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2), mnemonic#"\t$R1, $R2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BranchUnaryRRE<string mnemonic, bits<16> opcode, RegisterOperand cls> : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2), mnemonic#"\t$R1, $R2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class 
BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls> @@ -2675,7 +2671,6 @@ class BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls> (ins cls:$R1src, (bdxaddr12only $B2, $D2, $X2):$XBD2), mnemonic#"\t$R1, $XBD2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls> @@ -2683,14 +2678,12 @@ class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls> (ins cls:$R1src, (bdxaddr20only $B2, $D2, $X2):$XBD2), mnemonic#"\t$R1, $XBD2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BranchBinaryRSI<string mnemonic, bits<8> opcode, RegisterOperand cls> : InstRSI<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, brtarget16:$RI2), mnemonic#"\t$R1, $R3, $RI2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls> @@ -2698,7 +2691,6 @@ class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls> (ins cls:$R1src, cls:$R3, brtarget16:$RI2), mnemonic#"\t$R1, $R3, $RI2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls> @@ -2706,7 +2698,6 @@ class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls> (ins cls:$R1src, cls:$R3, (bdaddr12only $B2, $D2):$BD2), mnemonic#"\t$R1, $R3, $BD2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls> @@ -2715,7 +2706,6 @@ class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls> (ins cls:$R1src, cls:$R3, (bdaddr20only $B2, $D2):$BD2), mnemonic#"\t$R1, $R3, $BD2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class LoadMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls, @@ -3116,7 
+3106,6 @@ class UnaryTiedRRE<string mnemonic, bits<16> opcode, RegisterOperand cls> : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src), mnemonic#"\t$R1", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let R2 = 0; } @@ -3125,7 +3114,6 @@ class UnaryMemRRFc<string mnemonic, bits<16> opcode, : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src), mnemonic#"\t$R1, $R2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let M3 = 0; } @@ -3163,7 +3151,6 @@ class CondUnaryRSY<string mnemonic, bits<16> opcode, (z_select_ccmask (operator bdaddr20only:$BD2), cls:$R1src, cond4:$valid, cond4:$M3))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; let CCMaskLast = 1; @@ -3184,7 +3171,6 @@ class AsmCondUnaryRSY<string mnemonic, bits<16> opcode, let mayLoad = 1; let AccessBytes = bytes; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } // Like CondUnaryRSY, but with a fixed CC mask. 
@@ -3194,7 +3180,6 @@ class FixedCondUnaryRSY<CondVariant V, string mnemonic, bits<16> opcode, : InstRSYb<opcode, (outs cls:$R1), (ins cls:$R1src, (mode $B2, $D2):$BD2), mnemonic#V.suffix#"\t$R1, $BD2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; let isAsmParserOnly = V.alternate; @@ -3439,7 +3424,6 @@ class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode, : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src), mnemonic#"\t$R1, $R2", []> { let Constraints = "$R1 = $R1src, $R2 = $R2src"; - let DisableEncoding = "$R1src, $R2src"; } class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode, @@ -3447,7 +3431,6 @@ class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode, : InstRRE<opcode, (outs cls2:$R2), (ins cls1:$R1, cls2:$R2src), mnemonic#"\t$R1, $R2", []> { let Constraints = "$R2 = $R2src"; - let DisableEncoding = "$R2src"; } class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode, @@ -3455,7 +3438,6 @@ class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode, : InstRRE<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src), mnemonic#"\t$R1, $R2", []> { let Constraints = "$R1 = $R1src, $R2 = $R2src"; - let DisableEncoding = "$R1src, $R2src"; } class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode, @@ -3463,7 +3445,6 @@ class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode, : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src), mnemonic#"\t$R1, $R2", []> { let Constraints = "$R1 = $R1src, $R2 = $R2src"; - let DisableEncoding = "$R1src, $R2src"; let M3 = 0; } @@ -3475,7 +3456,6 @@ class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator, let OpKey = mnemonic#cls1; let OpType = "reg"; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator, @@ -3486,7 +3466,6 @@ class 
BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator, let OpKey = mnemonic#cls1; let OpType = "reg"; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BinaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator, @@ -3565,7 +3544,6 @@ class BinaryMemRRFc<string mnemonic, bits<16> opcode, : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3), mnemonic#"\t$R1, $R2, $M3", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode, @@ -3594,7 +3572,6 @@ class CondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1, [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src, cond4:$valid, cond4:$M3))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let CCMaskLast = 1; let NumOpsKey = !subst("loc", "sel", mnemonic); let NumOpsValue = "2"; @@ -3610,7 +3587,6 @@ class AsmCondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1, (ins cls1:$R1src, cls2:$R2, imm32zx4:$M3), mnemonic#"\t$R1, $R2, $M3", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } // Like CondBinaryRRF, but with a fixed CC mask. 
@@ -3619,7 +3595,6 @@ class FixedCondBinaryRRF<CondVariant V, string mnemonic, bits<16> opcode, : InstRRFc<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2), mnemonic#V.suffix#"\t$R1, $R2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let isAsmParserOnly = V.alternate; let AsmVariantName = V.asmvariant; let M3 = V.ccmask; @@ -3678,7 +3653,6 @@ class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator, mnemonic#"\t$R1, $I2", [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator, @@ -3707,7 +3681,6 @@ class CondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls, [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src, cond4:$valid, cond4:$M3))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let CCMaskLast = 1; } @@ -3719,7 +3692,6 @@ class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls, (ins cls:$R1src, imm:$I2, imm32zx4:$M3), mnemonic#"\t$R1, $I2, $M3", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } // Like CondBinaryRIE, but with a fixed CC mask. 
@@ -3728,7 +3700,6 @@ class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode, : InstRIEg<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2), mnemonic#V.suffix#"\t$R1, $I2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let isAsmParserOnly = V.alternate; let AsmVariantName = V.asmvariant; let M3 = V.ccmask; @@ -3747,7 +3718,6 @@ class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator, mnemonic#"\t$R1, $I2", [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator, @@ -3758,7 +3728,6 @@ class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator, [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> { let R3 = 0; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class BinaryRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator, @@ -3794,7 +3763,6 @@ class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator, let OpKey = mnemonic#"r"#cls; let OpType = "mem"; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; } @@ -3809,7 +3777,6 @@ class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator, let OpKey = mnemonic#"r"#cls; let OpType = "mem"; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; let M3 = 0; @@ -3838,7 +3805,6 @@ class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator, let OpKey = mnemonic#"r"#cls; let OpType = "mem"; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; } @@ -4500,7 +4466,6 @@ class SideEffectTernaryMemMemRRFa<string mnemonic, bits<16> opcode, (ins cls1:$R1src, cls2:$R2src, cls3:$R3), mnemonic#"\t$R1, $R2, $R3", []> { let Constraints = "$R1 = $R1src, $R2 = $R2src"; 
- let DisableEncoding = "$R1src, $R2src"; let M4 = 0; } @@ -4520,7 +4485,6 @@ class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode, (ins cls1:$R1src, cls2:$R2src, cls3:$R3src), mnemonic#"\t$R1, $R3, $R2", []> { let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src"; - let DisableEncoding = "$R1src, $R2src, $R3src"; let M4 = 0; } @@ -4544,7 +4508,6 @@ class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode, (ins cls1:$R1src, cls2:$R2src, imm:$M3), mnemonic#"\t$R1, $R2, $M3", []> { let Constraints = "$R1 = $R1src, $R2 = $R2src"; - let DisableEncoding = "$R1src, $R2src"; } multiclass SideEffectTernaryMemMemRRFcOpt<string mnemonic, bits<16> opcode, @@ -4574,7 +4537,6 @@ class TernaryRRFb<string mnemonic, bits<16> opcode, (ins cls1:$R1src, cls2:$R2, imm32zx4:$M4), mnemonic#"\t$R1, $R3, $R2, $M4", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1, @@ -4591,7 +4553,6 @@ class TernaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator, let OpKey = mnemonic#cls; let OpType = "reg"; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls, @@ -4601,7 +4562,6 @@ class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls, mnemonic#"\t$R1, $M3, $BD2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; } @@ -4613,7 +4573,6 @@ class TernaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls, mnemonic#"\t$R1, $M3, $BD2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; } @@ -4646,7 +4605,6 @@ class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode, (ins cls1:$R1src, cls2:$R3src, (shift12only $B2, $D2):$BD2), mnemonic#"\t$R1, $R3, $BD2", []> { let Constraints = "$R1 = $R1src, $R3 = $R3src"; - let 
DisableEncoding = "$R1src, $R3src"; } class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode, @@ -4655,7 +4613,6 @@ class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode, (ins cls1:$R1src, cls2:$R3src, (shift20only $B2, $D2):$BD2), mnemonic#"\t$R1, $R3, $BD2", []> { let Constraints = "$R1 = $R1src, $R3 = $R3src"; - let DisableEncoding = "$R1src, $R3src"; } class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator, @@ -4669,7 +4626,6 @@ class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator, let OpKey = mnemonic#"r"#cls; let OpType = "mem"; let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; } @@ -4681,7 +4637,6 @@ class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator, [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src), imm:$I2, index:$M3))]> { let Constraints = "$V1 = $V1src"; - let DisableEncoding = "$V1src"; } class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator, @@ -4893,7 +4848,6 @@ class TernaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator, cls:$R3, shift12only:$BD2))]> { let Constraints = "$V1 = $V1src"; - let DisableEncoding = "$V1src"; let M4 = type; } @@ -4913,7 +4867,6 @@ class TernaryVRSbGeneric<string mnemonic, bits<16> opcode> imm32zx4:$M4), mnemonic#"\t$V1, $R3, $BD2, $M4", []> { let Constraints = "$V1 = $V1src"; - let DisableEncoding = "$V1src"; } class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, @@ -4922,7 +4875,6 @@ class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, (ins VR128:$V1src, (bdvaddr12only $B2, $D2, $V2):$VBD2, index:$M3), mnemonic#"\t$V1, $VBD2, $M3", []> { let Constraints = "$V1 = $V1src"; - let DisableEncoding = "$V1src"; let mayLoad = 1; let AccessBytes = bytes; } @@ -4936,7 +4888,6 @@ class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator, bdxaddr12only:$XBD2, index:$M3))]> { let 
Constraints = "$V1 = $V1src"; - let DisableEncoding = "$V1src"; let mayLoad = 1; let AccessBytes = bytes; } @@ -4951,7 +4902,6 @@ class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operato (tr2.vt tr2.op:$V3), imm32zx8_timm:$I4))]> { let Constraints = "$V1 = $V1src"; - let DisableEncoding = "$V1src"; let M5 = type; } @@ -4961,7 +4911,6 @@ class QuaternaryVRIdGeneric<string mnemonic, bits<16> opcode> imm32zx8:$I4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []> { let Constraints = "$V1 = $V1src"; - let DisableEncoding = "$V1src"; } class QuaternaryVRIf<string mnemonic, bits<16> opcode> @@ -5087,7 +5036,6 @@ class CmpSwapRRE<string mnemonic, bits<16> opcode, : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2), mnemonic#"\t$R1, $R2", []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let mayStore = 1; } @@ -5099,7 +5047,6 @@ class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator, mnemonic#"\t$R1, $R3, $BD2", [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let mayStore = 1; } @@ -5111,7 +5058,6 @@ class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator, mnemonic#"\t$R1, $R3, $BD2", [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let mayStore = 1; } @@ -5128,7 +5074,7 @@ multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode, multiclass RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, bits<8> I3Or = 0, bits<8> I4Or = 0> { - let Constraints = "$R1 = $R1src", DisableEncoding = "$R1src" in { + let Constraints = "$R1 = $R1src" in { def "" : InstRIEf<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4, imm32zx8:$I5), @@ -5328,7 +5274,6 @@ class 
CondBinaryRRFPseudo<string mnemonic, RegisterOperand cls1, [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src, cond4:$valid, cond4:$M3))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let CCMaskLast = 1; let NumOpsKey = !subst("loc", "sel", mnemonic); let NumOpsValue = "2"; @@ -5359,7 +5304,6 @@ class CondBinaryRIEPseudo<RegisterOperand cls, ImmOpWithPattern imm> [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src, cond4:$valid, cond4:$M3))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let CCMaskLast = 1; } @@ -5374,7 +5318,6 @@ class CondUnaryRSYPseudo<string mnemonic, SDPatternOperator operator, (z_select_ccmask (operator mode:$BD2), cls:$R1src, cond4:$valid, cond4:$R3))]> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; let mayLoad = 1; let AccessBytes = bytes; let CCMaskLast = 1; @@ -5414,7 +5357,6 @@ class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2> imm32zx8:$I5), []> { let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; } // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 783f86aecce4..2e21f27c9032 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -59,8 +59,8 @@ static uint64_t allOnes(unsigned int Count) { // Pin the vtable to this file. 
void SystemZInstrInfo::anchor() {} -SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti) - : SystemZGenInstrInfo(-1, -1), +SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti) + : SystemZGenInstrInfo(sti, -1, -1), RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(), sti.getHwMode()), STI(sti) {} diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 8b82af61e669..7b9ad7b87a14 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -184,7 +184,7 @@ MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI, class SystemZInstrInfo : public SystemZGenInstrInfo { const SystemZRegisterInfo RI; - SystemZSubtarget &STI; + const SystemZSubtarget &STI; void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const; void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const; @@ -225,7 +225,7 @@ protected: unsigned CommuteOpIdx2) const override; public: - explicit SystemZInstrInfo(SystemZSubtarget &STI); + explicit SystemZInstrInfo(const SystemZSubtarget &STI); // Override TargetInstrInfo. Register isLoadFromStackSlot(const MachineInstr &MI, diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td index 10de8b05cf45..479bab5ce62b 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -680,41 +680,41 @@ let Predicates = [FeatureVector] in { let isCommutable = 1 in { // Maximum. 
def VMX : BinaryVRRcGeneric<"vmx", 0xE7FF>; - def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>; - def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>; - def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>; - def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>; + def VMXB : BinaryVRRc<"vmxb", 0xE7FF, smax, v128b, v128b, 0>; + def VMXH : BinaryVRRc<"vmxh", 0xE7FF, smax, v128h, v128h, 1>; + def VMXF : BinaryVRRc<"vmxf", 0xE7FF, smax, v128f, v128f, 2>; + def VMXG : BinaryVRRc<"vmxg", 0xE7FF, smax, v128g, v128g, 3>; let Predicates = [FeatureVectorEnhancements3] in - def VMXQ : BinaryVRRc<"vmxq", 0xE7FF, null_frag, v128q, v128q, 4>; + def VMXQ : BinaryVRRc<"vmxq", 0xE7FF, smax, v128q, v128q, 4>; // Maximum logical. def VMXL : BinaryVRRcGeneric<"vmxl", 0xE7FD>; - def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, null_frag, v128b, v128b, 0>; - def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>; - def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>; - def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>; + def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, umax, v128b, v128b, 0>; + def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, umax, v128h, v128h, 1>; + def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, umax, v128f, v128f, 2>; + def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, umax, v128g, v128g, 3>; let Predicates = [FeatureVectorEnhancements3] in - def VMXLQ : BinaryVRRc<"vmxlq", 0xE7FD, null_frag, v128q, v128q, 4>; + def VMXLQ : BinaryVRRc<"vmxlq", 0xE7FD, umax, v128q, v128q, 4>; } let isCommutable = 1 in { // Minimum. 
def VMN : BinaryVRRcGeneric<"vmn", 0xE7FE>; - def VMNB : BinaryVRRc<"vmnb", 0xE7FE, null_frag, v128b, v128b, 0>; - def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>; - def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>; - def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>; + def VMNB : BinaryVRRc<"vmnb", 0xE7FE, smin, v128b, v128b, 0>; + def VMNH : BinaryVRRc<"vmnh", 0xE7FE, smin, v128h, v128h, 1>; + def VMNF : BinaryVRRc<"vmnf", 0xE7FE, smin, v128f, v128f, 2>; + def VMNG : BinaryVRRc<"vmng", 0xE7FE, smin, v128g, v128g, 3>; let Predicates = [FeatureVectorEnhancements3] in - def VMNQ : BinaryVRRc<"vmnq", 0xE7FE, null_frag, v128q, v128q, 4>; + def VMNQ : BinaryVRRc<"vmnq", 0xE7FE, smin, v128q, v128q, 4>; // Minimum logical. def VMNL : BinaryVRRcGeneric<"vmnl", 0xE7FC>; - def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, null_frag, v128b, v128b, 0>; - def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>; - def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>; - def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>; + def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, umin, v128b, v128b, 0>; + def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, umin, v128h, v128h, 1>; + def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, umin, v128f, v128f, 2>; + def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, umin, v128g, v128g, 3>; let Predicates = [FeatureVectorEnhancements3] in - def VMNLQ : BinaryVRRc<"vmnlq", 0xE7FC, null_frag, v128q, v128q, 4>; + def VMNLQ : BinaryVRRc<"vmnlq", 0xE7FC, umin, v128q, v128q, 4>; } let isCommutable = 1 in { @@ -1250,54 +1250,45 @@ defm : IntegerAbsoluteVectorOps<v8i16, VLCH, VLPH, 15>; defm : IntegerAbsoluteVectorOps<v4i32, VLCF, VLPF, 31>; defm : IntegerAbsoluteVectorOps<v2i64, VLCG, VLPG, 63>; -// Instantiate minimum- and maximum-related patterns for TYPE. CMPH is the -// signed or unsigned "set if greater than" comparison instruction and -// MIN and MAX are the associated minimum and maximum instructions. 
-multiclass IntegerMinMaxVectorOps<ValueType type, SDPatternOperator cmph, - Instruction min, Instruction max> { - let Predicates = [FeatureVector] in { - def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$x, VR128:$y)), - (max VR128:$x, VR128:$y)>; - def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$y, VR128:$x)), - (min VR128:$x, VR128:$y)>; - def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)), - VR128:$x, VR128:$y)), - (min VR128:$x, VR128:$y)>; - def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)), - VR128:$y, VR128:$x)), - (max VR128:$x, VR128:$y)>; - } +// Instantiate packs/packu: recognize a saturating truncation and convert +// into the corresponding packs/packu instruction. +multiclass SignedSaturatingTruncate<ValueType input, ValueType output, + Instruction packs> { + def : Pat< + (output (z_pack + (smin (smax (input VR128:$a), ssat_trunc_min_vec), ssat_trunc_max_vec), + (smin (smax (input VR128:$b), ssat_trunc_min_vec), ssat_trunc_max_vec) + )), + (packs VR128:$a, VR128:$b) + >; + + def : Pat< + (output (z_pack + (smax (smin (input VR128:$a), ssat_trunc_max_vec), ssat_trunc_min_vec), + (smax (smin (input VR128:$b), ssat_trunc_max_vec), ssat_trunc_min_vec) + )), + (packs VR128:$a, VR128:$b) + >; } -// Signed min/max. 
-defm : IntegerMinMaxVectorOps<v16i8, z_vicmph, VMNB, VMXB>; -defm : IntegerMinMaxVectorOps<v8i16, z_vicmph, VMNH, VMXH>; -defm : IntegerMinMaxVectorOps<v4i32, z_vicmph, VMNF, VMXF>; -defm : IntegerMinMaxVectorOps<v2i64, z_vicmph, VMNG, VMXG>; - -let Predicates = [FeatureVectorEnhancements3] in { - def : Pat<(i128 (or (and VR128:$x, (z_vicmph VR128:$x, VR128:$y)), - (and VR128:$y, (not (z_vicmph VR128:$x, VR128:$y))))), - (VMXQ VR128:$x, VR128:$y)>; - def : Pat<(i128 (or (and VR128:$y, (z_vicmph VR128:$x, VR128:$y)), - (and VR128:$x, (not (z_vicmph VR128:$x, VR128:$y))))), - (VMNQ VR128:$x, VR128:$y)>; +defm : SignedSaturatingTruncate<v8i16, v16i8, VPKSH>; +defm : SignedSaturatingTruncate<v4i32, v8i16, VPKSF>; +defm : SignedSaturatingTruncate<v2i64, v4i32, VPKSG>; + +multiclass UnsignedSaturatingTruncate<ValueType input, ValueType output, + Instruction packu> { + def : Pat< + (output (z_pack + (umin (input VR128:$a), usat_trunc_max_vec), + (umin (input VR128:$b), usat_trunc_max_vec) + )), + (packu VR128:$a, VR128:$b) + >; } -// Unsigned min/max. -defm : IntegerMinMaxVectorOps<v16i8, z_vicmphl, VMNLB, VMXLB>; -defm : IntegerMinMaxVectorOps<v8i16, z_vicmphl, VMNLH, VMXLH>; -defm : IntegerMinMaxVectorOps<v4i32, z_vicmphl, VMNLF, VMXLF>; -defm : IntegerMinMaxVectorOps<v2i64, z_vicmphl, VMNLG, VMXLG>; - -let Predicates = [FeatureVectorEnhancements3] in { - def : Pat<(i128 (or (and VR128:$x, (z_vicmphl VR128:$x, VR128:$y)), - (and VR128:$y, (not (z_vicmphl VR128:$x, VR128:$y))))), - (VMXLQ VR128:$x, VR128:$y)>; - def : Pat<(i128 (or (and VR128:$y, (z_vicmphl VR128:$x, VR128:$y)), - (and VR128:$x, (not (z_vicmphl VR128:$x, VR128:$y))))), - (VMNLQ VR128:$x, VR128:$y)>; -} +defm : UnsignedSaturatingTruncate<v8i16, v16i8, VPKLSH>; +defm : UnsignedSaturatingTruncate<v4i32, v8i16, VPKLSF>; +defm : UnsignedSaturatingTruncate<v2i64, v4i32, VPKLSG>; // Instantiate comparison patterns to recognize VACC/VSCBI for TYPE. 
multiclass IntegerComputeCarryOrBorrow<ValueType type, diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 39e216b993b1..547d3dcf9280 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -1067,6 +1067,31 @@ def vsplat_imm_eq_1 : PatFrag<(ops), (build_vector), [{ }]>; def z_vzext1 : PatFrag<(ops node:$x), (and node:$x, vsplat_imm_eq_1)>; +// Vector constants for saturating truncation, containing the minimum and +// maximum value for the integer type that is half of the element width. +def ssat_trunc_min_vec: PatFrag<(ops), (build_vector), [{ + APInt Imm; + EVT EltTy = N->getValueType(0).getVectorElementType(); + unsigned SizeInBits = EltTy.getSizeInBits(); + APInt min = APInt::getSignedMinValue(SizeInBits / 2).sext(SizeInBits); + return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, min); +}]>; +def ssat_trunc_max_vec: PatFrag<(ops), (build_vector), [{ + APInt Imm; + EVT EltTy = N->getValueType(0).getVectorElementType(); + unsigned SizeInBits = EltTy.getSizeInBits(); + APInt max = APInt::getSignedMaxValue(SizeInBits / 2).sext(SizeInBits); + return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, max); +}]>; + +def usat_trunc_max_vec: PatFrag<(ops), (build_vector), [{ + APInt Imm; + EVT EltTy = N->getValueType(0).getVectorElementType(); + unsigned SizeInBits = EltTy.getSizeInBits(); + APInt max = APInt::getMaxValue(SizeInBits / 2).zext(SizeInBits); + return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, max); +}]>; + // Signed "integer greater than zero" on vectors. 
def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, immAllZerosV)>; diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index 28495e7c5719..343bcce80e3a 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -191,8 +191,9 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer, } } -void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer, - Module &M) const { +void TargetLoweringObjectFile::emitPseudoProbeDescMetadata( + MCStreamer &Streamer, Module &M, + std::function<void(MCStreamer &Streamer)> COMDATSymEmitter) const { NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName); if (!FuncInfo) return; @@ -213,6 +214,11 @@ void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer, TM->getFunctionSections() ? Name->getString() : StringRef()); Streamer.switchSection(S); + + // emit COFF COMDAT symbol. + if (COMDATSymEmitter) + COMDATSymEmitter(Streamer); + Streamer.emitInt64(GUID->getZExtValue()); Streamer.emitInt64(Hash->getZExtValue()); Streamer.emitULEB128IntValue(Name->getString().size()); diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index 69b6e26e602f..ad7e503cb155 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -162,7 +162,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const { RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math"); - RESET_OPTION(ApproxFuncFPMath, "approx-func-fp-math"); } /// Returns the code generation relocation model. 
The choices are static, PIC, diff --git a/llvm/lib/Target/TargetMachineC.cpp b/llvm/lib/Target/TargetMachineC.cpp index da6d35c8c8b4..aba6ea436e76 100644 --- a/llvm/lib/Target/TargetMachineC.cpp +++ b/llvm/lib/Target/TargetMachineC.cpp @@ -83,7 +83,8 @@ LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T, char **ErrorMessage) { std::string Error; - *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error)); + Triple TT(TripleStr); + *T = wrap(TargetRegistry::lookupTarget(TT, Error)); if (!*T) { if (ErrorMessage) diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp index d7e1666a7417..aad826b5f285 100644 --- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp +++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp @@ -57,6 +57,7 @@ LLVMInitializeVEDisassembler() { createVEDisassembler); } +// clang-format off static const unsigned I32RegDecoderTable[] = { VE::SW0, VE::SW1, VE::SW2, VE::SW3, VE::SW4, VE::SW5, VE::SW6, VE::SW7, VE::SW8, VE::SW9, VE::SW10, VE::SW11, VE::SW12, VE::SW13, @@ -127,6 +128,7 @@ static const unsigned MiscRegDecoderTable[] = { VE::PMC4, VE::PMC5, VE::PMC6, VE::PMC7, VE::PMC8, VE::PMC9, VE::PMC10, VE::PMC11, VE::PMC12, VE::PMC13, VE::PMC14}; +// clang-format on static DecodeStatus DecodeI32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -214,106 +216,6 @@ static DecodeStatus DecodeMISCRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecodeASX(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLoadI32(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeStoreI32(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLoadI64(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus 
DecodeStoreI64(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLoadF32(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeStoreF32(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeLoadASI64(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeStoreASI64(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeTS1AMI64(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeTS1AMI32(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCASI64(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCASI32(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCall(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSIMM7(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeSIMM32(MCInst &Inst, uint64_t insn, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeCCOperand(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeRDOperand(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeBranchCondition(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeBranchConditionAlways(MCInst &Inst, uint64_t insn, - uint64_t Address, - const MCDisassembler *Decoder); - -#include "VEGenDisassemblerTables.inc" - -/// Read four bytes from the ArrayRef and return 32 bit word. 
-static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address, - uint64_t &Size, uint64_t &Insn, - bool IsLittleEndian) { - // We want to read exactly 8 Bytes of data. - if (Bytes.size() < 8) { - Size = 0; - return MCDisassembler::Fail; - } - - Insn = IsLittleEndian - ? ((uint64_t)Bytes[0] << 0) | ((uint64_t)Bytes[1] << 8) | - ((uint64_t)Bytes[2] << 16) | ((uint64_t)Bytes[3] << 24) | - ((uint64_t)Bytes[4] << 32) | ((uint64_t)Bytes[5] << 40) | - ((uint64_t)Bytes[6] << 48) | ((uint64_t)Bytes[7] << 56) - : ((uint64_t)Bytes[7] << 0) | ((uint64_t)Bytes[6] << 8) | - ((uint64_t)Bytes[5] << 16) | ((uint64_t)Bytes[4] << 24) | - ((uint64_t)Bytes[3] << 32) | ((uint64_t)Bytes[2] << 40) | - ((uint64_t)Bytes[1] << 48) | ((uint64_t)Bytes[0] << 56); - - return MCDisassembler::Success; -} - -DecodeStatus VEDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, - ArrayRef<uint8_t> Bytes, - uint64_t Address, - raw_ostream &CStream) const { - uint64_t Insn; - bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian(); - DecodeStatus Result = - readInstruction64(Bytes, Address, Size, Insn, isLittleEndian); - if (Result == MCDisassembler::Fail) - return MCDisassembler::Fail; - - // Calling the auto-generated decoder function. - - Result = decodeInstruction(DecoderTableVE64, Instr, Insn, Address, this, STI); - - if (Result != MCDisassembler::Fail) { - Size = 8; - return Result; - } - - return MCDisassembler::Fail; -} - typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); @@ -629,3 +531,51 @@ static DecodeStatus DecodeBranchConditionAlways(MCInst &MI, uint64_t insn, // Decode MEMri. return DecodeAS(MI, insn, Address, Decoder); } + +#include "VEGenDisassemblerTables.inc" + +/// Read four bytes from the ArrayRef and return 32 bit word. 
+static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address, + uint64_t &Size, uint64_t &Insn, + bool IsLittleEndian) { + // We want to read exactly 8 Bytes of data. + if (Bytes.size() < 8) { + Size = 0; + return MCDisassembler::Fail; + } + + Insn = IsLittleEndian + ? ((uint64_t)Bytes[0] << 0) | ((uint64_t)Bytes[1] << 8) | + ((uint64_t)Bytes[2] << 16) | ((uint64_t)Bytes[3] << 24) | + ((uint64_t)Bytes[4] << 32) | ((uint64_t)Bytes[5] << 40) | + ((uint64_t)Bytes[6] << 48) | ((uint64_t)Bytes[7] << 56) + : ((uint64_t)Bytes[7] << 0) | ((uint64_t)Bytes[6] << 8) | + ((uint64_t)Bytes[5] << 16) | ((uint64_t)Bytes[4] << 24) | + ((uint64_t)Bytes[3] << 32) | ((uint64_t)Bytes[2] << 40) | + ((uint64_t)Bytes[1] << 48) | ((uint64_t)Bytes[0] << 56); + + return MCDisassembler::Success; +} + +DecodeStatus VEDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &CStream) const { + uint64_t Insn; + bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian(); + DecodeStatus Result = + readInstruction64(Bytes, Address, Size, Insn, isLittleEndian); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + // Calling the auto-generated decoder function. + + Result = decodeInstruction(DecoderTableVE64, Instr, Insn, Address, this, STI); + + if (Result != MCDisassembler::Fail) { + Size = 8; + return Result; + } + + return MCDisassembler::Fail; +} diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index 98e4b452a8a5..d5e804afd27f 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -34,8 +34,8 @@ using namespace llvm; // Pin the vtable to this file. 
void VEInstrInfo::anchor() {} -VEInstrInfo::VEInstrInfo(VESubtarget &ST) - : VEGenInstrInfo(VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} +VEInstrInfo::VEInstrInfo(const VESubtarget &ST) + : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); } diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h index 49dcba503462..408d3ab9e05f 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.h +++ b/llvm/lib/Target/VE/VEInstrInfo.h @@ -53,7 +53,7 @@ class VEInstrInfo : public VEGenInstrInfo { virtual void anchor(); public: - explicit VEInstrInfo(VESubtarget &ST); + explicit VEInstrInfo(const VESubtarget &ST); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index 7e3f29b3bd82..9869f95ae566 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -39,6 +39,8 @@ include "VEInstrFormats.td" // e.g. 0.0 (0x00000000) or -2.0 (0xC0000000=(2)1). //===----------------------------------------------------------------------===// +defvar ve_ptr_rc = I64; + def ULO7 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue() & 0x7f, SDLoc(N), MVT::i32); @@ -325,17 +327,17 @@ def VEMEMziiAsmOperand : AsmOperandClass { // ASX format uses single assembly instruction format. 
def MEMrri : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops ptr_rc, ptr_rc, i64imm); + let MIOperandInfo = (ops ve_ptr_rc, ve_ptr_rc, i64imm); let ParserMatchClass = VEMEMrriAsmOperand; } def MEMrii : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops ptr_rc, i32imm, i64imm); + let MIOperandInfo = (ops ve_ptr_rc, i32imm, i64imm); let ParserMatchClass = VEMEMriiAsmOperand; } def MEMzri : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i64imm); + let MIOperandInfo = (ops i32imm /* = 0 */, ve_ptr_rc, i64imm); let ParserMatchClass = VEMEMzriAsmOperand; } def MEMzii : Operand<iPTR> { @@ -358,7 +360,7 @@ def VEMEMziAsmOperand : AsmOperandClass { // 1. AS generic assembly instruction format: def MEMriASX : Operand<iPTR> { let PrintMethod = "printMemASOperandASX"; - let MIOperandInfo = (ops ptr_rc, i32imm); + let MIOperandInfo = (ops ve_ptr_rc, i32imm); let ParserMatchClass = VEMEMriAsmOperand; } def MEMziASX : Operand<iPTR> { @@ -370,7 +372,7 @@ def MEMziASX : Operand<iPTR> { // 2. AS RRM style assembly instruction format: def MEMriRRM : Operand<iPTR> { let PrintMethod = "printMemASOperandRRM"; - let MIOperandInfo = (ops ptr_rc, i32imm); + let MIOperandInfo = (ops ve_ptr_rc, i32imm); let ParserMatchClass = VEMEMriAsmOperand; } def MEMziRRM : Operand<iPTR> { @@ -382,7 +384,7 @@ def MEMziRRM : Operand<iPTR> { // 3. AS HM style assembly instruction format: def MEMriHM : Operand<iPTR> { let PrintMethod = "printMemASOperandHM"; - let MIOperandInfo = (ops ptr_rc, i32imm); + let MIOperandInfo = (ops ve_ptr_rc, i32imm); let ParserMatchClass = VEMEMriAsmOperand; } def MEMziHM : Operand<iPTR> { @@ -642,7 +644,7 @@ multiclass RRIm<string opcStr, bits<8>opc, // Special RR multiclass for 128 bits shift left instruction. // e.g. 
SLD -let Constraints = "$hi = $sx", DisableEncoding = "$hi", hasSideEffects = 0 in +let Constraints = "$hi = $sx", hasSideEffects = 0 in multiclass RRILDm<string opcStr, bits<8>opc, RegisterClass RC> { def rrr : RR<opc, (outs RC:$sx), (ins RC:$hi, RC:$sz, I32:$sy), !strconcat(opcStr, " $sx, $sz, $sy")>; @@ -659,7 +661,7 @@ multiclass RRILDm<string opcStr, bits<8>opc, RegisterClass RC> { // Special RR multiclass for 128 bits shift right instruction. // e.g. SRD -let Constraints = "$low = $sx", DisableEncoding = "$low", hasSideEffects = 0 in +let Constraints = "$low = $sx", hasSideEffects = 0 in multiclass RRIRDm<string opcStr, bits<8>opc, RegisterClass RC> { def rrr : RR<opc, (outs RC:$sx), (ins RC:$sz, RC:$low, I32:$sy), !strconcat(opcStr, " $sx, $sz, $sy")>; @@ -689,7 +691,7 @@ multiclass RRI1m<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty, // Special RR multiclass for MRG instruction. // e.g. MRG -let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0 in +let Constraints = "$sx = $sd", hasSideEffects = 0 in multiclass RRMRGm<string opcStr, bits<8>opc, RegisterClass RC> { def rr : RR<opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, RC:$sd), !strconcat(opcStr, " $sx, $sy, $sz")>; @@ -722,7 +724,7 @@ multiclass RRSWPm<string opcStr, bits<8>opc, // Multiclass for CMOV instructions. // e.g. CMOVL, CMOVW, CMOVD, and etc. -let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0, +let Constraints = "$sx = $sd", hasSideEffects = 0, cfw = ? in multiclass RRCMOVm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty, SDPatternOperator OpNode = null_frag, @@ -805,7 +807,7 @@ multiclass PFCHm<string opcStr, bits<8>opc> { // Multiclass for CAS instructions. // e.g. TS1AML, TS1AMW, TS2AM, and etc. 
-let Constraints = "$sx = $sd", DisableEncoding = "$sd", +let Constraints = "$sx = $sd", mayStore=1, mayLoad = 1, hasSideEffects = 0 in multiclass RRCAStgm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty, Operand immOp, Operand MEM, ComplexPattern ADDR, @@ -920,7 +922,7 @@ multiclass STORECRm<string opcStr, bits<8>opc, RegisterClass RC> { !strconcat(opcStr, " $sx, $sy, $sz")>; } -let hasSideEffects = 1, Constraints = "$sx = $sx_in", DisableEncoding = "$sx_in" in +let hasSideEffects = 1, Constraints = "$sx = $sx_in" in multiclass TSCRm<string opcStr, bits<8>opc, RegisterClass RC> { def rrr : RR<opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, RC:$sx_in), !strconcat(opcStr, " $sx, $sy, $sz")>; diff --git a/llvm/lib/Target/VE/VEInstrVec.td b/llvm/lib/Target/VE/VEInstrVec.td index 327ad9ceacc5..e0989bf6ad23 100644 --- a/llvm/lib/Target/VE/VEInstrVec.td +++ b/llvm/lib/Target/VE/VEInstrVec.td @@ -35,7 +35,7 @@ def STVM512rii : Pseudo< // LVM/SVM instructions using VM512 let hasSideEffects = 0, isCodeGenOnly = 1 in { - let Constraints = "$vx = $vd", DisableEncoding = "$vd" in { + let Constraints = "$vx = $vd" in { def LVMyir_y : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, I64:$sz, VM512:$vd), "# pseudo LVM $vx, $sy, $sz, $vd">; def LVMyim_y : Pseudo<(outs VM512:$vx), @@ -51,7 +51,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in { } // VFMK/VFMKW/VFMKS instructions using VM512 -let hasSideEffects = 0, isCodeGenOnly = 1, DisableEncoding = "$vl" in { +let hasSideEffects = 0, isCodeGenOnly = 1 in { def VFMKyal : Pseudo<(outs VM512:$vmx), (ins I32:$vl), "# pseudo-vfmk.at $vmx">; def VFMKynal : Pseudo<(outs VM512:$vmx), (ins I32:$vl), @@ -126,21 +126,18 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in { // Multiclass for VLD instructions let mayLoad = 1, hasSideEffects = 0, Uses = [VL] in -multiclass VLDbm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in, - string disEnc = ""> { - let DisableEncoding = disEnc in +multiclass VLDbm<string opcStr, bits<8>opc, RegisterClass 
RC, dag dag_in> { def "" : RVM<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " $vx, $sy, $sz")>; - let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base", - isCodeGenOnly = 1 in + let Constraints = "$vx = $base", isCodeGenOnly = 1 in def _v : RVM<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)), !strconcat(opcStr, " $vx, $sy, $sz")>; } multiclass VLDlm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in> { defm "" : VLDbm<opcStr, opc, RC, dag_in>; let isCodeGenOnly = 1, VE_VLInUse = 1 in { - defm l : VLDbm<opcStr, opc, RC, !con(dag_in, (ins I32:$vl)), "$vl,">; - defm L : VLDbm<opcStr, opc, RC, !con(dag_in, (ins VLS:$vl)), "$vl,">; + defm l : VLDbm<opcStr, opc, RC, !con(dag_in, (ins I32:$vl))>; + defm L : VLDbm<opcStr, opc, RC, !con(dag_in, (ins VLS:$vl))>; } } let VE_VLIndex = 3 in @@ -182,7 +179,7 @@ let cx = 1 in defm VLDL2DZX : VLDm<"vldl2d.zx", 0xc3, V64>; let mayStore = 1, hasSideEffects = 0, Uses = [VL] in multiclass VSTbm<string opcStr, string argStr, bits<8>opc, dag dag_in> { def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>; - let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in { + let isCodeGenOnly = 1, VE_VLInUse = 1 in { def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)), !strconcat(opcStr, argStr)>; def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)), @@ -232,12 +229,10 @@ defm VSTL2D : VSTm<"vstl2d", 0xd3, V64>; // Multiclass for VGT instructions let mayLoad = 1, hasSideEffects = 0, Uses = [VL] in multiclass VGTbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, - dag dag_in, string disEnc = ""> { - let DisableEncoding = disEnc in + dag dag_in> { def "" : RVM<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " $vx, ", argStr)>; - let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base", - isCodeGenOnly = 1 in + let Constraints = "$vx = $base", isCodeGenOnly = 1 in def _v : RVM<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)), !strconcat(opcStr, " $vx, ", argStr)>; } @@ -245,10 +240,8 @@ 
multiclass VGTlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, dag dag_in> { defm "" : VGTbm<opcStr, argStr, opc, RC, dag_in>; let isCodeGenOnly = 1, VE_VLInUse = 1 in { - defm l : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)), - "$vl,">; - defm L : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)), - "$vl,">; + defm l : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>; + defm L : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>; } } multiclass VGTmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, @@ -297,7 +290,7 @@ def : MnemonicAlias<"vgtl.nc", "vgtl.zx.nc">; let mayStore = 1, hasSideEffects = 0, Uses = [VL] in multiclass VSCbm<string opcStr, string argStr, bits<8>opc, dag dag_in> { def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>; - let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in { + let isCodeGenOnly = 1, VE_VLInUse = 1 in { def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)), !strconcat(opcStr, argStr)>; def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)), @@ -348,7 +341,7 @@ defm VSCL : VSCm<"vscl", 0xb3, V64>; let Uses = [VL] in multiclass PFCHVbm<string opcStr, string argStr, bits<8>opc, dag dag_in> { def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>; - let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in { + let isCodeGenOnly = 1, VE_VLInUse = 1 in { def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)), !strconcat(opcStr, argStr)>; def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)), @@ -373,8 +366,7 @@ let sx = 0, vx = ?, hasSideEffects = 0 in multiclass LSVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, dag dag_in> { def "" : RR<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " ${vx}", argStr)>; - let Constraints = "$vx = $base", DisableEncoding = "$base", - isCodeGenOnly = 1 in + let Constraints = "$vx = $base", isCodeGenOnly = 1 in def _v : RR<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)), !strconcat(opcStr, 
" ${vx}", argStr)>; } @@ -406,8 +398,7 @@ multiclass LVMbm<string opcStr, string argStr, bits<8>opc, RegisterClass RCM, dag dag_in> { def "" : RR<opc, (outs RCM:$vx), dag_in, !strconcat(opcStr, " $vx, ", argStr)>; - let Constraints = "$vx = $base", DisableEncoding = "$base", - isCodeGenOnly = 1 in { + let Constraints = "$vx = $base", isCodeGenOnly = 1 in { def _m : RR<opc, (outs RCM:$vx), !con(dag_in, (ins RCM:$base)), !strconcat(opcStr, " $vx, ", argStr)>; } @@ -440,11 +431,10 @@ defm SVM : SVMm<"svm", 0xa7, VM>; // Section 8.9.24 - VBRD (Vector Broadcast) let vx = ?, hasSideEffects = 0, Uses = [VL] in multiclass VBRDbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, - dag dag_in, string disEnc = ""> { - let DisableEncoding = disEnc in + dag dag_in> { def "" : RV<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " $vx, ", argStr)>; - let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base", + let Constraints = "$vx = $base", isCodeGenOnly = 1 in def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)), !strconcat(opcStr, " $vx, ", argStr)>; @@ -453,10 +443,8 @@ multiclass VBRDlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, dag dag_in> { defm "" : VBRDbm<opcStr, argStr, opc, RC, dag_in>; let isCodeGenOnly = 1, VE_VLInUse = 1 in { - defm l : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)), - "$vl,">; - defm L : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)), - "$vl,">; + defm l : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>; + defm L : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>; } } multiclass VBRDmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, @@ -484,11 +472,10 @@ defm PVBRD : VBRDm<"pvbrd", 0x8c, V64, I64, VM512>; // Section 8.9.25 - VMV (Vector Move) let vx = ?, vz = ?, hasSideEffects = 0, Uses = [VL] in multiclass VMVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, - dag dag_in, string disEnc = ""> { - let DisableEncoding = disEnc in + 
dag dag_in> { def "" : RV<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " $vx, ", argStr)>; - let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base", + let Constraints = "$vx = $base", isCodeGenOnly = 1 in def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)), !strconcat(opcStr, " $vx, ", argStr)>; @@ -497,10 +484,8 @@ multiclass VMVlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, dag dag_in> { defm "" : VMVbm<opcStr, argStr, opc, RC, dag_in>; let isCodeGenOnly = 1, VE_VLInUse = 1 in { - defm l : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)), - "$vl,">; - defm L : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)), - "$vl,">; + defm l : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>; + defm L : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>; } } multiclass VMVmm<string opcStr, bits<8>opc, RegisterClass RC, @@ -525,12 +510,10 @@ defm VMV : VMVm<"vmv", 0x9c, V64, VM>; // Multiclass for generic vector calculation let vx = ?, hasSideEffects = 0, Uses = [VL] in multiclass RVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, - dag dag_in, string disEnc = ""> { - let DisableEncoding = disEnc in + dag dag_in> { def "" : RV<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " $vx", argStr)>; - let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base", - isCodeGenOnly = 1 in + let Constraints = "$vx = $base", isCodeGenOnly = 1 in def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)), !strconcat(opcStr, " $vx", argStr)>; } @@ -538,10 +521,8 @@ multiclass RVlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, dag dag_in> { defm "" : RVbm<opcStr, argStr, opc, RC, dag_in>; let isCodeGenOnly = 1, VE_VLInUse = 1 in { - defm l : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)), - "$vl,">; - defm L : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)), - "$vl,">; + defm l : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>; + defm L : RVbm<opcStr, 
argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>; } } multiclass RVmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, @@ -688,11 +669,10 @@ multiclass RVFIXm<string opcStr, bits<8> opc, RegisterClass RC, // Multiclass for generic iterative vector calculation let vx = ?, hasSideEffects = 0, Uses = [VL] in multiclass RVIbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, - dag dag_in, string disEnc = ""> { - let DisableEncoding = disEnc in + dag dag_in> { def "" : RV<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " $vx", argStr)>; - let isCodeGenOnly = 1, Constraints = "$vx = $base", DisableEncoding = disEnc#"$base" in + let isCodeGenOnly = 1, Constraints = "$vx = $base" in def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)), !strconcat(opcStr, " $vx", argStr)>; } @@ -700,10 +680,8 @@ multiclass RVIlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC, dag dag_in> { defm "" : RVIbm<opcStr, argStr, opc, RC, dag_in>; let isCodeGenOnly = 1, VE_VLInUse = 1 in { - defm l : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)), - "$vl,">; - defm L : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)), - "$vl,">; + defm l : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>; + defm L : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>; } } // Generic RV multiclass for iterative operation with 2 argument. 
@@ -743,7 +721,7 @@ let vx = ?, hasSideEffects = 0, Uses = [VL] in multiclass RVMKbm<string opcStr, string argStr, bits<8>opc, dag dag_out, dag dag_in> { def "" : RV<opc, dag_out, dag_in, !strconcat(opcStr, argStr)>; - let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in { + let isCodeGenOnly = 1, VE_VLInUse = 1 in { def l : RV<opc, dag_out, !con(dag_in, (ins I32:$vl)), !strconcat(opcStr, argStr)>; def L : RV<opc, dag_out, !con(dag_in, (ins VLS:$vl)), @@ -796,7 +774,7 @@ multiclass RVMSbm<string opcStr, string argStr, bits<8>opc, dag dag_in> { bits<7> sx; let Inst{54-48} = sx; } - let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in { + let isCodeGenOnly = 1, VE_VLInUse = 1 in { def l : RV<opc, (outs I64:$sx), !con(dag_in, (ins I32:$vl)), !strconcat(opcStr, " $sx,", argStr)> { bits<7> sx; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def index 378ef2c8f250..1eae3586d16b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -27,6 +27,7 @@ HANDLE_NODETYPE(WrapperREL) HANDLE_NODETYPE(BR_IF) HANDLE_NODETYPE(BR_TABLE) HANDLE_NODETYPE(DOT) +HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) HANDLE_NODETYPE(SHUFFLE) HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index c6c2d0cfccb6..fe100dab427e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -2183,13 +2183,10 @@ SDValue performLowerPartialReduction(SDNode *N, SelectionDAG &DAG) { SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS); SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS); - SDValue LowLow = DAG.getNode(LowOpc, DL, MVT::v4i32, MulLow); - SDValue LowHigh = DAG.getNode(LowOpc, DL, MVT::v4i32, MulHigh); - SDValue HighLow = 
DAG.getNode(HighOpc, DL, MVT::v4i32, MulLow); - SDValue HighHigh = DAG.getNode(HighOpc, DL, MVT::v4i32, MulHigh); - - SDValue AddLow = DAG.getNode(ISD::ADD, DL, MVT::v4i32, LowLow, HighLow); - SDValue AddHigh = DAG.getNode(ISD::ADD, DL, MVT::v4i32, LowHigh, HighHigh); + SDValue AddLow = + DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, MVT::v4i32, MulLow); + SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, + MVT::v4i32, MulHigh); SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh); return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); } @@ -3588,34 +3585,53 @@ static SDValue performMulCombine(SDNode *N, if (auto Res = TryWideExtMulCombine(N, DCI.DAG)) return Res; - // We don't natively support v16i8 mul, but we do support v8i16 so split the - // inputs and extend them to v8i16. Only do this before legalization in case - // a narrow vector is widened and may be simplified later. - if (!DCI.isBeforeLegalize() || VT != MVT::v16i8) + // We don't natively support v16i8 or v8i8 mul, but we do support v8i16. So, + // extend them to v8i16. Only do this before legalization in case a narrow + // vector is widened and may be simplified later. + if (!DCI.isBeforeLegalize() || (VT != MVT::v8i8 && VT != MVT::v16i8)) return SDValue(); SDLoc DL(N); SelectionDAG &DAG = DCI.DAG; SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - SDValue LowLHS = - DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS); - SDValue HighLHS = - DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS); - SDValue LowRHS = - DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS); - SDValue HighRHS = - DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS); - - SDValue MulLow = - DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS)); - SDValue MulHigh = DAG.getBitcast( - VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS)); - - // Take the low byte of each lane. 
- return DAG.getVectorShuffle( - VT, DL, MulLow, MulHigh, - {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); + EVT MulVT = MVT::v8i16; + + if (VT == MVT::v8i8) { + SDValue PromotedLHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, LHS, + DAG.getUNDEF(MVT::v8i8)); + SDValue PromotedRHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, RHS, + DAG.getUNDEF(MVT::v8i8)); + SDValue LowLHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedLHS); + SDValue LowRHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedRHS); + SDValue MulLow = DAG.getBitcast( + MVT::v16i8, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS)); + // Take the low byte of each lane. + SDValue Shuffle = DAG.getVectorShuffle( + MVT::v16i8, DL, MulLow, DAG.getUNDEF(MVT::v16i8), + {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1}); + return extractSubVector(Shuffle, 0, DAG, DL, 64); + } else { + assert(VT == MVT::v16i8 && "Expected v16i8"); + SDValue LowLHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, LHS); + SDValue LowRHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, RHS); + SDValue HighLHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, LHS); + SDValue HighRHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, RHS); + + SDValue MulLow = + DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS)); + SDValue MulHigh = + DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, HighLHS, HighRHS)); + + // Take the low byte of each lane. 
+ return DAG.getVectorShuffle( + VT, DL, MulLow, MulHigh, + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); + } } SDValue diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index a934853ff9f4..feac04a17068 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -34,7 +34,7 @@ using namespace llvm; #include "WebAssemblyGenInstrInfo.inc" WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN, + : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN, WebAssembly::ADJCALLSTACKUP, WebAssembly::CATCHRET), RI(STI.getTargetTriple()) {} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index f06f8d5174e3..3c26b453c448 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1453,15 +1453,22 @@ if !ne(t1, t2) then def : Pat<(t1.vt (bitconvert (t2.vt V128:$v))), (t1.vt V128:$v)>; // Extended pairwise addition +def extadd_pairwise_u : SDNode<"WebAssemblyISD::EXT_ADD_PAIRWISE_U", extend_t>; + defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_signed, "extadd_pairwise_i8x16_s", 0x7c>; -defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_unsigned, +defm "" : SIMDConvert<I16x8, I8x16, extadd_pairwise_u, "extadd_pairwise_i8x16_u", 0x7d>; defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_signed, "extadd_pairwise_i16x8_s", 0x7e>; -defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_unsigned, +defm "" : SIMDConvert<I32x4, I16x8, extadd_pairwise_u, "extadd_pairwise_i16x8_u", 0x7f>; +def : Pat<(v4i32 (int_wasm_extadd_pairwise_unsigned (v8i16 V128:$in))), + (extadd_pairwise_u_I32x4 V128:$in)>; +def : Pat<(v8i16 (int_wasm_extadd_pairwise_unsigned (v16i8 V128:$in))), + 
(extadd_pairwise_u_I16x8 V128:$in)>; + // f64x2 <-> f32x4 conversions def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def demote_zero : SDNode<"WebAssemblyISD::DEMOTE_ZERO", demote_t>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index bc91c6424b63..08ca20b5eef6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -247,7 +247,8 @@ static void query(const MachineInstr &MI, bool &Read, bool &Write, // Check for writes to __stack_pointer global. if ((MI.getOpcode() == WebAssembly::GLOBAL_SET_I32 || MI.getOpcode() == WebAssembly::GLOBAL_SET_I64) && - strcmp(MI.getOperand(0).getSymbolName(), "__stack_pointer") == 0) + MI.getOperand(0).isSymbol() && + !strcmp(MI.getOperand(0).getSymbolName(), "__stack_pointer")) StackPointer = true; // Analyze calls. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 08fb7586d215..0eefd3e2b350 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -166,12 +166,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost( CostKind); } - int ISD = TLI->InstructionOpcodeToISD(Opcode); - if (ISD != ISD::LOAD) { - return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, - CostKind); - } - EVT VT = TLI->getValueType(DL, Ty, true); // Type legalization can't handle structs if (VT == MVT::Other) @@ -182,22 +176,121 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost( if (!LT.first.isValid()) return InstructionCost::getInvalid(); - // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can - // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads - // are twice as expensive as scalar. 
+ int ISD = TLI->InstructionOpcodeToISD(Opcode); unsigned width = VT.getSizeInBits(); - switch (width) { - default: - break; - case 32: - case 64: - case 128: - return 2; + if (ISD == ISD::LOAD) { + // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads + // can be lowered to load32_zero and load64_zero respectively. Assume SIMD + // loads are twice as expensive as scalar. + switch (width) { + default: + break; + case 32: + case 64: + case 128: + return 2; + } + } else if (ISD == ISD::STORE) { + // For stores, we can use store lane operations. + switch (width) { + default: + break; + case 8: + case 16: + case 32: + case 64: + case 128: + return 2; + } } return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind); } +InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost( + unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices, + Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, + bool UseMaskForCond, bool UseMaskForGaps) const { + assert(Factor >= 2 && "Invalid interleave factor"); + + auto *VecTy = cast<VectorType>(Ty); + if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) { + return InstructionCost::getInvalid(); + } + + if (UseMaskForCond || UseMaskForGaps) + return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices, + Alignment, AddressSpace, CostKind, + UseMaskForCond, UseMaskForGaps); + + constexpr unsigned MaxInterleaveFactor = 4; + if (Factor <= MaxInterleaveFactor) { + unsigned MinElts = VecTy->getElementCount().getKnownMinValue(); + // Ensure the number of vector elements is greater than 1. + if (MinElts < 2 || MinElts % Factor != 0) + return InstructionCost::getInvalid(); + + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + // Ensure the element type is legal. 
+ if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) + return InstructionCost::getInvalid(); + + auto *SubVecTy = + VectorType::get(VecTy->getElementType(), + VecTy->getElementCount().divideCoefficientBy(Factor)); + InstructionCost MemCost = + getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind); + + unsigned VecSize = DL.getTypeSizeInBits(SubVecTy); + unsigned MaxVecSize = 128; + unsigned NumAccesses = + std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize); + + // A stride of two is commonly supported via dedicated instructions, so it + // should be relatively cheap for all element sizes. A stride of four is + // more expensive as it will likely require more shuffles. Using two + // simd128 inputs is considered more expensive and we mainly account for + // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable + // arithmetic kernels. + static const CostTblEntry ShuffleCostTbl[] = { + // One reg. + {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 + {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 + {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 + {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 + {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 + {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 + + // Two regs. + {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8 + {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16 + {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 + + // One reg. + {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 + {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 + {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16 + + // Two regs. + {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8 + {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16 + {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32 + + // Four regs. 
+ {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32 + }; + + EVT ETy = TLI->getValueType(DL, SubVecTy); + if (const auto *Entry = + CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT())) + return Entry->Cost + (NumAccesses * MemCost); + } + + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, CostKind, + UseMaskForCond, UseMaskForGaps); +} + InstructionCost WebAssemblyTTIImpl::getVectorInstrCost( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index c915eeb07d4f..2573066cd5d6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -82,6 +82,10 @@ public: TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; + InstructionCost getInterleavedMemoryOpCost( + unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices, + Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, + bool UseMaskForCond, bool UseMaskForGaps) const override; using BaseT::getVectorInstrCost; InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index d7671ed19589..ce5e92135f70 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -15,10 +15,12 @@ #include "MCTargetDesc/X86TargetStreamer.h" #include "TargetInfo/X86TargetInfo.h" #include "X86Operand.h" +#include "X86RegisterInfo.h" #include "llvm-c/Visibility.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include 
"llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" @@ -29,6 +31,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" @@ -40,6 +43,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> +#include <cstdint> #include <memory> using namespace llvm; @@ -1172,7 +1176,7 @@ private: X86::CondCode ParseConditionCode(StringRef CCode); - bool ParseIntelMemoryOperandSize(unsigned &Size); + bool ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr); bool CreateMemForMSInlineAsm(MCRegister SegReg, const MCExpr *Disp, MCRegister BaseReg, MCRegister IndexReg, unsigned Scale, bool NonAbsMem, SMLoc Start, @@ -2574,7 +2578,8 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { return false; } -bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { +bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size, + StringRef *SizeStr) { Size = StringSwitch<unsigned>(getTok().getString()) .Cases("BYTE", "byte", 8) .Cases("WORD", "word", 16) @@ -2592,6 +2597,8 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { .Cases("ZMMWORD", "zmmword", 512) .Default(0); if (Size) { + if (SizeStr) + *SizeStr = getTok().getString(); const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word). 
if (!(Tok.getString() == "PTR" || Tok.getString() == "ptr")) return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); @@ -2600,6 +2607,19 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { return false; } +uint16_t RegSizeInBits(const MCRegisterInfo &MRI, MCRegister RegNo) { + if (X86MCRegisterClasses[X86::GR8RegClassID].contains(RegNo)) + return 8; + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(RegNo)) + return 16; + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(RegNo)) + return 32; + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) + return 64; + // Unknown register size + return 0; +} + bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); @@ -2607,7 +2627,8 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { // Parse optional Size directive. unsigned Size; - if (ParseIntelMemoryOperandSize(Size)) + StringRef SizeStr; + if (ParseIntelMemoryOperandSize(Size, &SizeStr)) return true; bool PtrInOperand = bool(Size); @@ -2624,9 +2645,29 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { return Error(Start, "rip can only be used as a base register"); // A Register followed by ':' is considered a segment override if (Tok.isNot(AsmToken::Colon)) { - if (PtrInOperand) - return Error(Start, "expected memory operand after 'ptr', " - "found register operand instead"); + if (PtrInOperand) { + if (!Parser.isParsingMasm()) + return Error(Start, "expected memory operand after 'ptr', " + "found register operand instead"); + + // If we are parsing MASM, we are allowed to cast registers to their own + // sizes, but not to other types. 
+ uint16_t RegSize = + RegSizeInBits(*getContext().getRegisterInfo(), RegNo); + if (RegSize == 0) + return Error( + Start, + "cannot cast register '" + + StringRef(getContext().getRegisterInfo()->getName(RegNo)) + + "'; its size is not easily defined."); + if (RegSize != Size) + return Error( + Start, + std::to_string(RegSize) + "-bit register '" + + StringRef(getContext().getRegisterInfo()->getName(RegNo)) + + "' cannot be used as a " + std::to_string(Size) + "-bit " + + SizeStr.upper()); + } Operands.push_back(X86Operand::CreateReg(RegNo, Start, End)); return false; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 56a4cc3d65c2..865fc0ce8101 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -485,7 +485,16 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, if (!CanPadInst) return; - if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) { + if (PendingBA) { + auto *NextFragment = PendingBA->getNext(); + assert(NextFragment && "NextFragment should not be null"); + if (NextFragment == OS.getCurrentFragment()) + return; + // We eagerly create an empty fragment when inserting a fragment + // with a variable-size tail. + if (NextFragment->getNext() == OS.getCurrentFragment()) + return; + // Macro fusion actually happens and there is no other fragment inserted // after the previous instruction. 
// diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index 547745fdba9d..76731437931a 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -1668,6 +1668,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VMOVSHZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DecodeScalarMoveMask(8, false, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::MOVPQI2QIrr: case X86::MOVZPQILo2PQIrr: case X86::VMOVPQI2QIrr: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index a15930c1433f..cfe5b1094811 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1047,9 +1047,6 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, Prefix.setL(TSFlags & X86II::VEX_L); Prefix.setL2(TSFlags & X86II::EVEX_L2); - if ((TSFlags & X86II::EVEX_L2) && STI.hasFeature(X86::FeatureAVX512) && - !STI.hasFeature(X86::FeatureEVEX512)) - report_fatal_error("ZMM registers are not supported without EVEX512"); switch (TSFlags & X86II::OpPrefixMask) { case X86II::PD: Prefix.setPP(0x1); // 66 diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index cc7bcd678cb3..bb1e716c33ed 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -397,18 +397,6 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, if (CPU.empty()) CPU = "generic"; - size_t posNoEVEX512 = FS.rfind("-evex512"); - // Make sure we won't be cheated by "-avx512fp16". 
- size_t posNoAVX512F = - FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,"); - size_t posEVEX512 = FS.rfind("+evex512"); - size_t posAVX512F = FS.rfind("+avx512"); // Any AVX512XXX will enable AVX512F. - - if (posAVX512F != StringRef::npos && - (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F)) - if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos) - ArchFS += ",+evex512"; - return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS); } diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 9cfe081b8710..7c9e821c02fd 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -113,6 +113,7 @@ def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", "Support 16-bit floating point conversion instructions", [FeatureAVX]>; +// Deprecated feature. Keep it here to suppress warnings in old IRs. def FeatureEVEX512 : SubtargetFeature<"evex512", "HasEVEX512", "true", "Support ZMM and 64-bit mask instructions">; def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512", @@ -329,20 +330,22 @@ def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", "Support movdiri instruction (direct store integer)">; def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true", "Support movdir64b instruction (direct store 64 bytes)">; -def FeatureAVX10_1 : SubtargetFeature<"avx10.1-256", "HasAVX10_1", "true", - "Support AVX10.1 up to 256-bit instruction", +def FeatureAVX10_1 : SubtargetFeature<"avx10.1", "HasAVX10_1", "true", + "Support AVX10.1 instruction", [FeatureCDI, FeatureVBMI, FeatureIFMA, FeatureVNNI, FeatureBF16, FeatureVPOPCNTDQ, FeatureVBMI2, FeatureBITALG, FeatureFP16, FeatureVLX, FeatureDQI]>; +// Deprecated feature. Keep it here to suppress warnings in old IRs. 
def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true", - "Support AVX10.1 up to 512-bit instruction", - [FeatureAVX10_1, FeatureEVEX512]>; -def FeatureAVX10_2 : SubtargetFeature<"avx10.2-256", "HasAVX10_2", "true", - "Support AVX10.2 up to 256-bit instruction", + "Support AVX10.1 instruction", + [FeatureAVX10_1]>; +def FeatureAVX10_2 : SubtargetFeature<"avx10.2", "HasAVX10_2", "true", + "Support AVX10.2 instruction", [FeatureAVX10_1]>; +// Deprecated feature. Keep it here to suppress warnings in old IRs. def FeatureAVX10_2_512 : SubtargetFeature<"avx10.2-512", "HasAVX10_2_512", "true", - "Support AVX10.2 up to 512-bit instruction", - [FeatureAVX10_2, FeatureAVX10_1_512]>; + "Support AVX10.2 instruction", + [FeatureAVX10_2]>; def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true", "Support extended general purpose register">; def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true", @@ -871,7 +874,6 @@ def ProcessorFeatures { ]; list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [ - FeatureEVEX512, FeatureBWI, FeatureCDI, FeatureDQI, @@ -996,7 +998,6 @@ def ProcessorFeatures { FeatureXSAVES, FeatureCLFLUSHOPT, FeatureAVX512, - FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, @@ -1039,7 +1040,6 @@ def ProcessorFeatures { // Cannonlake list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512, - FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, @@ -1155,7 +1155,7 @@ def ProcessorFeatures { !listconcat(GNRFeatures, GNRDAdditionalFeatures); // Diamond Rapids - list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2_512, + list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2, FeatureSM4, FeatureCMPCCXADD, FeatureAVXIFMA, @@ -1368,7 +1368,6 @@ def ProcessorFeatures { FeatureF16C, FeatureFSGSBase, FeatureAVX512, - FeatureEVEX512, FeatureCDI, FeatureADX, FeatureRDSEED, @@ -1586,7 +1585,6 @@ def ProcessorFeatures { list<SubtargetFeature> ZN4Tuning = !listconcat(ZN3Tuning, 
ZN4AdditionalTuning); list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512, - FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index d406277e440b..ff22ee8c86fa 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -476,7 +476,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) { return MI.getDesc().isIndirectBranch() /*Make below code in a good shape*/ || Opc == X86::TAILJMPr || Opc == X86::TAILJMPm || Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 || - Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi || + Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri || + Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX; diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 0e6b4dffec3a..9457e718de69 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -269,6 +269,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::TCRETURNdi: case X86::TCRETURNdicc: case X86::TCRETURNri: + case X86::TCRETURN_WIN64ri: + case X86::TCRETURN_HIPE32ri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNdi64cc: @@ -346,8 +348,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); for (unsigned i = 0; i != X86::AddrNumOperands; ++i) MIB.add(MBBI->getOperand(i)); - } else if ((Opcode == X86::TCRETURNri64) || - (Opcode == X86::TCRETURNri64_ImpCall)) { + } else if (Opcode == X86::TCRETURNri64 || + Opcode == X86::TCRETURNri64_ImpCall || + Opcode == X86::TCRETURN_WIN64ri) { JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(IsX64 ? 
X86::TAILJMPr64_REX : X86::TAILJMPr64)) diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index d3c239250943..787b71d425cb 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { MachineBasicBlock::iterator I; if (LastShapeMI && dominates(MBB, MI, LastShapeMI)) I = ++LastShapeMI->getIterator(); - else - I = ++MI.getIterator(); + else { + // Call can overwrite registers like rax, ensure the tile config + // instruction is sinked closer to first instruction that uses tile. + auto UseIt = MI.getIterator(); + while (UseIt != MBB.end()) { + if (HasTileOperand(MRI, *UseIt)) + break; + ++UseIt; + } + I = UseIt; + } Config(*I); HasUnconfigTile = false; continue; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index cba7843d53e3..a293b4c87cfe 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2398,7 +2398,8 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { } static bool isTailCallOpcode(unsigned Opc) { - return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi || + return Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri || + Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 19131fbd4102..3631016b0f5c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -326,15 +326,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasAVX10_2()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom); 
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal); for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64}) { setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal); } - if (Subtarget.hasAVX10_2_512()) { - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal); - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal); - } if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal); @@ -2457,6 +2455,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) { + setOperationAction(ISD::FADD, MVT::v32bf16, Legal); + setOperationAction(ISD::FSUB, MVT::v32bf16, Legal); + setOperationAction(ISD::FMUL, MVT::v32bf16, Legal); + setOperationAction(ISD::FDIV, MVT::v32bf16, Legal); + setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal); + setOperationAction(ISD::FMA, MVT::v32bf16, Legal); + setOperationAction(ISD::SETCC, MVT::v32bf16, Custom); + setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom); + setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom); + setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom); + setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom); for (auto VT : {MVT::v8bf16, MVT::v16bf16}) { setOperationAction(ISD::FADD, VT, Legal); setOperationAction(ISD::FSUB, VT, Legal); @@ -2470,19 +2479,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMINIMUMNUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); } - if (Subtarget.hasAVX10_2_512()) { - setOperationAction(ISD::FADD, MVT::v32bf16, Legal); - setOperationAction(ISD::FSUB, MVT::v32bf16, Legal); - setOperationAction(ISD::FMUL, MVT::v32bf16, Legal); - 
setOperationAction(ISD::FDIV, MVT::v32bf16, Legal); - setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal); - setOperationAction(ISD::FMA, MVT::v32bf16, Legal); - setOperationAction(ISD::SETCC, MVT::v32bf16, Custom); - setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom); - setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom); - setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom); - setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom); - } for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) { setCondCodeAction(ISD::SETOEQ, VT, Custom); setCondCodeAction(ISD::SETUNE, VT, Custom); @@ -21252,7 +21248,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, // the truncation then we can use PACKSS by converting the srl to a sra. // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it. if (In.getOpcode() == ISD::SRL && In->hasOneUse()) - if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) { + if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) { if (*ShAmt == MinSignBits) { PackOpcode = X86ISD::PACKSS; return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops()); @@ -26269,10 +26265,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - - if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) - if (MaskConst->getZExtValue() & 0x1) - return Op; + auto *MaskConst = dyn_cast<ConstantSDNode>(Mask); + if (MaskConst && (MaskConst->getZExtValue() & 0x1)) + return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -26288,6 +26283,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + + if (MaskConst) { + assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask"); + // Discard op and blend passthrough with scalar op src/dst. 
+ SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements()); + std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0); + ShuffleMask[0] = VT.getVectorNumElements(); + return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc, + ShuffleMask); + } + return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } @@ -31404,9 +31410,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return R; // AVX512 implicitly uses modulo rotation amounts. - if ((Subtarget.hasVLX() || - (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) && - 32 <= EltSizeInBits) { + if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) { // Attempt to rotate by immediate. if (IsCstSplat) { unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI; @@ -38676,13 +38680,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); if (Opc == X86ISD::VSHLI) { - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + Known <<= ShAmt; // Low bits are known zero. Known.Zero.setLowBits(ShAmt); } else if (Opc == X86ISD::VSRLI) { - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // High bits are known zero. Known.Zero.setHighBits(ShAmt); } else { @@ -44518,8 +44520,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + Known <<= ShAmt; // Low bits known zero. Known.Zero.setLowBits(ShAmt); @@ -44549,8 +44550,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // High bits known zero. 
Known.Zero.setHighBits(ShAmt); @@ -44598,8 +44598,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. @@ -44957,6 +44956,44 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( Known.Zero.setLowBits(Known2.countMinTrailingZeros()); return false; } + case X86ISD::VPMADD52L: + case X86ISD::VPMADD52H: { + KnownBits KnownOp0, KnownOp1, KnownOp2; + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of + // operand 2). + APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52); + if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0, + TLO, Depth + 1)) + return true; + + if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1, + TLO, Depth + 1)) + return true; + + if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts, + KnownOp2, TLO, Depth + 1)) + return true; + + KnownBits KnownMul; + KnownOp0 = KnownOp0.trunc(52); + KnownOp1 = KnownOp1.trunc(52); + KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1) + : KnownBits::mulhu(KnownOp0, KnownOp1); + KnownMul = KnownMul.zext(64); + + // lo/hi(X * Y) + Z --> C + Z + if (KnownMul.isConstant()) { + SDLoc DL(Op); + SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2)); + } + + Known = KnownBits::add(KnownMul, KnownOp2); + return false; + } } return TargetLowering::SimplifyDemandedBitsForTargetNode( @@ -45132,6 +45169,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { + // SSE bit logic. 
+ case X86ISD::FAND: + case X86ISD::FOR: + case X86ISD::FXOR: + case X86ISD::FANDN: + case X86ISD::ANDNP: + case X86ISD::VPTERNLOG: + return false; // SSE vector insert/extracts use modulo indices. case X86ISD::PINSRB: case X86ISD::PINSRW: @@ -45167,6 +45212,11 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( // SSE signbit extraction. case X86ISD::MOVMSK: return false; + // GFNI instructions. + case X86ISD::GF2P8AFFINEINVQB: + case X86ISD::GF2P8AFFINEQB: + case X86ISD::GF2P8MULB: + return false; case ISD::INTRINSIC_WO_CHAIN: switch (Op->getConstantOperandVal(0)) { case Intrinsic::x86_sse2_pmadd_wd: @@ -48349,7 +48399,7 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then // peek through and adjust the TEST bit. if (Src.getOpcode() == ISD::SHL) { - if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) { + if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) { Src = Src.getOperand(0); BitMask.lshrInPlace(*ShiftAmt); } @@ -50886,10 +50936,12 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, // Given a target type \p VT, we generate // or (and x, y), (xor z, zext(build_vector (constants))) // given x, y and z are of type \p VT. We can do so, if operands are either -// truncates from VT types, the second operand is a vector of constants or can -// be recursively promoted. +// truncates from VT types, the second operand is a vector of constants, can +// be recursively promoted or is an existing extension we can extend further. static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, - SelectionDAG &DAG, unsigned Depth) { + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + unsigned Depth) { // Limit recursion to avoid excessive compile times. 
if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); @@ -50904,28 +50956,32 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT)) return SDValue(); - if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1)) + if (SDValue NN0 = + PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1)) N0 = NN0; else { - // The left side has to be a trunc. - if (N0.getOpcode() != ISD::TRUNCATE) - return SDValue(); - - // The type of the truncated inputs. - if (N0.getOperand(0).getValueType() != VT) + // The left side has to be a 'trunc'. + bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE && + N0.getOperand(0).getValueType() == VT; + if (LHSTrunc) + N0 = N0.getOperand(0); + else return SDValue(); - - N0 = N0.getOperand(0); } - if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1)) + if (SDValue NN1 = + PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1)) N1 = NN1; else { - // The right side has to be a 'trunc' or a (foldable) constant. + // The right side has to be a 'trunc', a (foldable) constant or an + // existing extension we can extend further. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getValueType() == VT; if (RHSTrunc) N1 = N1.getOperand(0); + else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() && + Subtarget.hasInt256() && N1.hasOneUse()) + N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0)); else if (SDValue Cst = DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1})) N1 = Cst; @@ -50955,7 +51011,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT NarrowVT = Narrow.getValueType(); // Generate the wide operation. 
- SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0); + SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0); if (!Op) return SDValue(); switch (N.getOpcode()) { @@ -51804,6 +51860,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SDValue X, Y; EVT CondVT = VT.changeVectorElementType(MVT::i1); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) && + (VT.is512BitVector() || Subtarget.hasVLX()) && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && sd_match(N, m_And(m_Value(X), m_OneUse(m_SExt(m_AllOf( m_Value(Y), m_SpecificVT(CondVT), @@ -54135,10 +54193,10 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL) { assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode"); - std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N); + std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N); if (!ValidSrlConst) return SDValue(); - uint64_t SrlConstVal = *ValidSrlConst; + unsigned SrlConstVal = *ValidSrlConst; SDValue Op = N.getOperand(0); unsigned Opcode = Op.getOpcode(); @@ -55368,6 +55426,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 && + (VT.is512BitVector() || Subtarget.hasVLX()) && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse()) return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1, getZeroVector(VT, Subtarget, DAG, DL)); @@ -56247,7 +56307,13 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, SDValue Masked = BroadcastOp; if (N != 0) { - APInt Mask = APInt::getLowBitsSet(BroadcastOpVT.getSizeInBits(), Len); + unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits(); + unsigned NumDefinedElts = UndefElts.countTrailingZeros(); + + if 
(NumDefinedElts > BroadcastOpBitWidth) + return SDValue(); + + APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts); SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp, DAG.getConstant(N, DL, BroadcastOpVT)); Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue, @@ -57904,6 +57970,51 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, Cmov.getOperand(3)); } +// Attempt to turn ADD(MUL(x, y), acc)) -> VPMADD52L +// When upper 12 bits of x, y and MUL(x, y) are known to be 0 +static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, + EVT VT, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; + if (!VT.isVector() || VT.getScalarSizeInBits() != 64 || + (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA())) + return SDValue(); + + // Need AVX-512VL vector length extensions if operating on XMM/YMM registers + if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() && + VT.getSizeInBits() < 512) + return SDValue(); + + const auto TotalSize = VT.getSizeInBits(); + if (TotalSize < 128 || !isPowerOf2_64(TotalSize)) + return SDValue(); + + SDValue X, Y, Acc; + if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc)))) + return SDValue(); + + KnownBits KnownX = DAG.computeKnownBits(X); + if (KnownX.countMinLeadingZeros() < 12) + return SDValue(); + KnownBits KnownY = DAG.computeKnownBits(Y); + if (KnownY.countMinLeadingZeros() < 12) + return SDValue(); + KnownBits KnownMul = KnownBits::mul(KnownX, KnownY); + if (KnownMul.countMinLeadingZeros() < 12) + return SDValue(); + + auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL, + ArrayRef<SDValue> SubOps) { + EVT SubVT = SubOps[0].getValueType(); + assert(SubVT.getScalarSizeInBits() == 64 && + "Unexpected element size, only supports 64bit size"); + return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/, + SubOps[2] /*Y*/, SubOps[0] /*Acc*/); + }; + + return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, 
VPMADD52Builder, + /*CheckBWI*/ false); +} + static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -58007,6 +58118,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, Op0.getOperand(0), Op0.getOperand(2)); } + if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget)) + return IFMA52; + return combineAddOrSubToADCOrSBB(N, DL, DAG); } @@ -60068,6 +60182,19 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Simplify VPMADD52L/VPMADD52H operations. +static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + MVT VT = N->getSimpleValueType(0); + unsigned NumEltBits = VT.getScalarSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits), + DCI)) + return SDValue(N, 0); + + return SDValue(); +} + static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -60705,6 +60832,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI); + case X86ISD::VPMADD52L: + case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI); case X86ISD::KSHIFTL: case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); @@ -60932,117 +61061,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support //===----------------------------------------------------------------------===// -// Helper to match a string separated by whitespace. -static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { - S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. 
- - for (StringRef Piece : Pieces) { - if (!S.starts_with(Piece)) // Check if the piece matches. - return false; - - S = S.substr(Piece.size()); - StringRef::size_type Pos = S.find_first_not_of(" \t"); - if (Pos == 0) // We matched a prefix. - return false; - - S = S.substr(Pos); - } - - return S.empty(); -} - -static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { - - if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { - if (llvm::is_contained(AsmPieces, "~{cc}") && - llvm::is_contained(AsmPieces, "~{flags}") && - llvm::is_contained(AsmPieces, "~{fpsr}")) { - - if (AsmPieces.size() == 3) - return true; - else if (llvm::is_contained(AsmPieces, "~{dirflag}")) - return true; - } - } - return false; -} - -bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { - InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); - - StringRef AsmStr = IA->getAsmString(); - - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - - // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" - SmallVector<StringRef, 4> AsmPieces; - SplitString(AsmStr, AsmPieces, ";\n"); - - switch (AsmPieces.size()) { - default: return false; - case 1: - // FIXME: this should verify that we are targeting a 486 or better. If not, - // we will turn this bswap into something that will be lowered to logical - // ops instead of emitting the bswap asm. For now, we don't support 486 or - // lower so don't worry about this. - // bswap $0 - if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || - matchAsm(AsmPieces[0], {"bswapl", "$0"}) || - matchAsm(AsmPieces[0], {"bswapq", "$0"}) || - matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || - matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || - matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { - // No need to check constraints, nothing other than the equivalent of - // "=r,0" would be valid here. 
- return IntrinsicLowering::LowerToByteSwap(CI); - } - - // rorw $$8, ${0:w} --> llvm.bswap.i16 - if (CI->getType()->isIntegerTy(16) && - IA->getConstraintString().starts_with("=r,0,") && - (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || - matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { - AsmPieces.clear(); - StringRef ConstraintsStr = IA->getConstraintString(); - SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - array_pod_sort(AsmPieces.begin(), AsmPieces.end()); - if (clobbersFlagRegisters(AsmPieces)) - return IntrinsicLowering::LowerToByteSwap(CI); - } - break; - case 3: - if (CI->getType()->isIntegerTy(32) && - IA->getConstraintString().starts_with("=r,0,") && - matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && - matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && - matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { - AsmPieces.clear(); - StringRef ConstraintsStr = IA->getConstraintString(); - SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - array_pod_sort(AsmPieces.begin(), AsmPieces.end()); - if (clobbersFlagRegisters(AsmPieces)) - return IntrinsicLowering::LowerToByteSwap(CI); - } - - if (CI->getType()->isIntegerTy(64)) { - InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); - if (Constraints.size() >= 2 && - Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && - Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { - // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 - if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && - matchAsm(AsmPieces[1], {"bswap", "%edx"}) && - matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) - return IntrinsicLowering::LowerToByteSwap(CI); - } - } - break; - } - return false; -} - static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint) .Case("{@cca}", X86::COND_A) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h 
b/llvm/lib/Target/X86/X86ISelLowering.h index 97d3b6e2420d..0c9ba591b03e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1364,8 +1364,6 @@ namespace llvm { SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; - bool ExpandInlineAsm(CallInst *CI) const override; - ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. @@ -1668,8 +1666,8 @@ namespace llvm { /// Lower interleaved store(s) into target specific /// instructions/intrinsics. bool lowerInterleavedStore(Instruction *Store, Value *Mask, - ShuffleVectorInst *SVI, - unsigned Factor) const override; + ShuffleVectorInst *SVI, unsigned Factor, + const APInt &GapMask) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 1c745a338a61..3bc46af4d130 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -302,7 +302,7 @@ EVT X86TargetLowering::getOptimalMemOpType( if (Op.size() >= 16 && (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { // FIXME: Check if unaligned 64-byte accesses are slow. - if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() && + if (Op.size() >= 64 && Subtarget.hasAVX512() && (Subtarget.getPreferVectorWidth() >= 512)) { return Subtarget.hasBWI() ? 
MVT::v64i8 : MVT::v16i32; } @@ -416,7 +416,7 @@ bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context, return true; return false; case 512: - if (Subtarget.hasAVX512() && Subtarget.hasEVEX512()) + if (Subtarget.hasAVX512()) return true; return false; default: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 1beaaafb159e..69a5115201ef 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -550,7 +550,7 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { } // HasAMXMOVRS, In64BitMode multiclass m_tcvtrowd2ps { - let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { + let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), (ins TILE:$src1, i32u8imm:$src2), @@ -561,12 +561,12 @@ multiclass m_tcvtrowd2ps { "tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, T8,XS, EVEX, VVVV, EVEX_V512; } - } // HasAMXAVX512, HasAVX10_2_512, In64BitMode + } // HasAMXAVX512, HasAVX10_2, In64BitMode } defm TCVTROWD2PS : m_tcvtrowd2ps; -let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { +let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), @@ -630,7 +630,7 @@ let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr, Prefix P1, Prefix P2> { - let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode], SchedRW = [WriteSystem] in { + let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode], SchedRW = [WriteSystem] in { let OpPrefix = P1 in def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst), (ins TILE:$src1, GR32:$src2), @@ -658,7 +658,7 @@ defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>; 
defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>; multiclass m_tilemovrow { - let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { + let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), (ins TILE:$src1, u8imm:$src2), @@ -669,12 +669,12 @@ multiclass m_tilemovrow { "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, T8,PD, EVEX, VVVV, EVEX_V512; } - } // HasAMXAVX512, HasAVX10_2_512, In64BitMode + } // HasAMXAVX512, HasAVX10_2, In64BitMode } defm TILEMOVROW : m_tilemovrow; -let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { +let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 2d2bf1f6c725..764ff998bb56 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -15,36 +15,36 @@ // VNNI FP16 let ExeDomain = SSEPackedSingle in defm VDPPHPS : avx512_dpf16ps_sizes<0x52, "vdpphps", X86dpfp16ps, avx512vl_f16_info, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T8, PS, EVEX_CD8<32, CD8VF>; // VNNI INT8 defm VPDPBSSD : VNNI_common<0x50, "vpdpbssd", X86vpdpbssd, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, XD; + [HasAVX10_2], [HasAVX10_2]>, XD; defm VPDPBSSDS : VNNI_common<0x51, "vpdpbssds", X86vpdpbssds, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, XD; + [HasAVX10_2], [HasAVX10_2]>, XD; defm VPDPBSUD : VNNI_common<0x50, "vpdpbsud", X86vpdpbsud, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, XS; + [HasAVX10_2], [HasAVX10_2]>, XS; defm VPDPBSUDS : VNNI_common<0x51, "vpdpbsuds", X86vpdpbsuds, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, XS; + [HasAVX10_2], [HasAVX10_2]>, XS; 
defm VPDPBUUD : VNNI_common<0x50, "vpdpbuud", X86vpdpbuud, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, PS; + [HasAVX10_2], [HasAVX10_2]>, PS; defm VPDPBUUDS : VNNI_common<0x51, "vpdpbuuds", X86vpdpbuuds, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, PS; + [HasAVX10_2], [HasAVX10_2]>, PS; // VNNI INT16 defm VPDPWSUD : VNNI_common<0xd2, "vpdpwsud", X86vpdpwsud, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, XS; + [HasAVX10_2], [HasAVX10_2]>, XS; defm VPDPWSUDS : VNNI_common<0xd3, "vpdpwsuds", X86vpdpwsuds, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, XS; + [HasAVX10_2], [HasAVX10_2]>, XS; defm VPDPWUSD : VNNI_common<0xd2, "vpdpwusd", X86vpdpwusd, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, PD; + [HasAVX10_2], [HasAVX10_2]>, PD; defm VPDPWUSDS : VNNI_common<0xd3, "vpdpwusds", X86vpdpwusds, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, PD; + [HasAVX10_2], [HasAVX10_2]>, PD; defm VPDPWUUD : VNNI_common<0xd2, "vpdpwuud", X86vpdpwuud, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, PS; + [HasAVX10_2], [HasAVX10_2]>, PS; defm VPDPWUUDS : VNNI_common<0xd3, "vpdpwuuds", X86vpdpwuuds, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, PS; + [HasAVX10_2], [HasAVX10_2]>, PS; // VMPSADBW defm VMPSADBW : avx512_common_3Op_rm_imm8<0x42, X86Vmpsadbw, "vmpsadbw", SchedWritePSADBW, @@ -94,9 +94,8 @@ multiclass avx10_minmax_packed_sae<string OpStr, AVX512VLVectorVTInfo VTI, SDNod } multiclass avx10_minmax_packed<string OpStr, AVX512VLVectorVTInfo VTI, SDNode OpNode> { - let Predicates = [HasAVX10_2_512] in - defm Z : avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512; let Predicates = [HasAVX10_2] in { + defm Z : avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512; defm Z256 : avx10_minmax_packed_base<OpStr, VTI.info256, OpNode>, EVEX_V256; defm Z128 : avx10_minmax_packed_base<OpStr, VTI.info128, OpNode>, EVEX_V128; } @@ -201,7 +200,7 @@ multiclass avx10_sat_cvt_rmb<bits<8> 
Opc, string OpStr, X86FoldableSchedWrite sc multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo, SDNode MaskNode> { - let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in + let Predicates = [HasAVX10_2], Uses = [MXCSR] in defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512, (outs DestInfo.info512.RC:$dst), (ins SrcInfo.info512.RC:$src, AVX512RC:$rc), @@ -216,7 +215,7 @@ multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo, SDNode Node> { - let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in + let Predicates = [HasAVX10_2], Uses = [MXCSR] in defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512, (outs DestInfo.info512.RC:$dst), (ins SrcInfo.info512.RC:$src), @@ -229,12 +228,11 @@ multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sche multiclass avx10_sat_cvt_base<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, SDNode MaskNode, AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo> { - let Predicates = [HasAVX10_2_512] in - defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM, - DestInfo.info512, SrcInfo.info512, - MaskNode>, - EVEX, EVEX_V512; let Predicates = [HasAVX10_2] in { + defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM, + DestInfo.info512, SrcInfo.info512, + MaskNode>, + EVEX, EVEX_V512; defm Z256 : avx10_sat_cvt_rmb<Opc, OpStr, sched.YMM, DestInfo.info256, SrcInfo.info256, @@ -334,13 +332,11 @@ defm VCVTTPS2IUBS : avx10_sat_cvt_base<0x6a, "vcvttps2iubs", SchedWriteVecIMul, multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, 
v8f64_info, OpNode, MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNodeSAE, sched.ZMM>, EVEX_V512; - } - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>, EVEX_V128; @@ -410,13 +406,11 @@ multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode, MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, OpNodeRnd, sched.ZMM>, EVEX_V512; - } - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode, MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode, @@ -432,13 +426,11 @@ multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNodeRnd, sched.ZMM>, EVEX_V512; - } - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM, (v2i64 (OpNode (bc_v4f32 (v2f64 @@ -460,14 +452,11 @@ multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN multiclass avx10_cvttps2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode 
MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, OpNodeSAE, sched.ZMM>, EVEX_V512; - } - - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode, @@ -719,7 +708,7 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo, SDNode OpNode, SDNode OpNodeRnd> { - let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode, _SrcVTInfo.info512, _DstVTInfo.info512, _SrcVTInfo.info512>, @@ -727,8 +716,6 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr, _SrcVTInfo.info512, _DstVTInfo.info512, OpNodeRnd>, EVEX_V512, EVEX_CD8<32, CD8VF>; - } - let Predicates = [HasAVX10_2] in { defm Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode, _SrcVTInfo.info256, _DstVTInfo.info256, _SrcVTInfo.info256>, @@ -747,19 +734,19 @@ defm VCVT2PS2PHX : avx10_cvt2ps2ph<0x67, "vcvt2ps2phx", defm VCVT2PH2BF8 : avx512_binop_all<0x74, "vcvt2ph2bf8", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvt2ph2bf8, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2bf8, [HasAVX10_2], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T8, XD; defm VCVT2PH2BF8S : avx512_binop_all<0x74, "vcvt2ph2bf8s", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvt2ph2bf8s, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2bf8s, [HasAVX10_2], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; defm VCVT2PH2HF8 : avx512_binop_all<0x18, "vcvt2ph2hf8", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvt2ph2hf8, 
[HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2hf8, [HasAVX10_2], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; defm VCVT2PH2HF8S : avx512_binop_all<0x1b, "vcvt2ph2hf8s", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvt2ph2hf8s, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2hf8s, [HasAVX10_2], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; //TODO: Merge into avx512_vcvt_fp, diffrence is one more source register here. @@ -836,11 +823,10 @@ multiclass avx10_convert_3op<bits<8> OpCode, string OpcodeStr, PatFrag bcast128 = vt_src.info128.BroadcastLdFrag, PatFrag loadVT128 = vt_src.info128.LdFrag, RegisterClass maskRC128 = vt_src.info128.KRCWM> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info256, vt_dst.info512, vt_src.info512, OpNode, OpNode, sched.ZMM>, EVEX_V512, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z256 : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info128, vt_dst.info256, vt_src.info256, OpNode, OpNode, sched.YMM>, EVEX_V256, EVEX_CD8<16, CD8VF>; @@ -920,25 +906,25 @@ defm VCVTBIASPH2HF8S : avx10_convert_3op<0x1b, "vcvtbiasph2hf8s", defm VCVTPH2BF8 : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, X86vcvtph2bf8, X86vmcvtph2bf8, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T8, XS, EVEX_CD8<16, CD8VF>; defm VCVTPH2BF8S : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8s", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, X86vcvtph2bf8s, X86vmcvtph2bf8s, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; defm VCVTPH2HF8 : avx512_cvt_trunc_ne<0x18, "vcvtph2hf8", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, X86vcvtph2hf8, X86vmcvtph2hf8, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; defm VCVTPH2HF8S : avx512_cvt_trunc_ne<0x1b, "vcvtph2hf8s", 
avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, X86vcvtph2hf8s, X86vmcvtph2hf8s, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr, @@ -962,10 +948,9 @@ multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr, multiclass avx10_convert_2op_nomb<string OpcodeStr, AVX512VLVectorVTInfo _dest, AVX512VLVectorVTInfo _src, bits<8> opc, SDNode OpNode> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info512, _src.info256, OpNode, f256mem, WriteCvtPH2PSZ>, EVEX_V512; - let Predicates = [HasAVX10_2] in { defm Z128 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info128, _src.info128, OpNode, f64mem, WriteCvtPH2PSZ>, EVEX_V128; defm Z256 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info256, _src.info128, @@ -985,13 +970,12 @@ defm VCVTHF82PH : avx10_convert_2op_nomb<"vcvthf82ph", avx512vl_f16_info, multiclass avx10_fp_binop_int_bf16<bits<8> opc, string OpcodeStr, X86SchedWriteSizes sched, bit IsCommutable = 0> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fp_packed<opc, OpcodeStr, !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"), !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"), v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5, PD, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fp_packed<opc, OpcodeStr, !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"), !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"), @@ -1009,11 +993,10 @@ multiclass avx10_fp_binop_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator X86SchedWriteSizes sched, bit IsCommutable = 0, SDPatternOperator MaskOpNode = OpNode> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fp_packed<opc, OpcodeStr, OpNode, 
MaskOpNode, v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5, PD, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8bf16x_info, sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5, PD, EVEX_CD8<16, CD8VF>; @@ -1086,9 +1069,8 @@ multiclass avx10_vcmp_common_bf16<X86FoldableSchedWrite sched, X86VectorVTInfo _ } multiclass avx10_vcmp_bf16<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { - let Predicates = [HasAVX10_2_512] in - defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512; let Predicates = [HasAVX10_2] in { + defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512; defm Z128 : avx10_vcmp_common_bf16<sched.XMM, _.info128>, EVEX_V128; defm Z256 : avx10_vcmp_common_bf16<sched.YMM, _.info256>, EVEX_V256; } @@ -1102,11 +1084,10 @@ defm VCMPBF16 : avx10_vcmp_bf16<SchedWriteFCmp, avx512vl_bf16_info>, // VSQRTBF16 multiclass avx10_sqrt_packed_bf16<bits<8> opc, string OpcodeStr, X86SchedWriteSizes sched> { - let Predicates = [HasAVX10_2_512] in - defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"), - sched.PH.ZMM, v32bf16_info>, - EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>; let Predicates = [HasAVX10_2] in { + defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"), + sched.PH.ZMM, v32bf16_info>, + EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>; defm Z128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"), sched.PH.XMM, v8bf16x_info>, EVEX_V128, PD, T_MAP5, EVEX_CD8<16, CD8VF>; @@ -1122,11 +1103,10 @@ defm VSQRTBF16 : avx10_sqrt_packed_bf16<0x51, "vsqrt", SchedWriteFSqrtSizes>; // VRSQRTBF16, VRCPBF16, VSRQTBF16, VGETEXPBF16 multiclass avx10_fp14_bf16<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in - defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"), - OpNode, sched.ZMM, v32bf16_info>, - EVEX_V512; let Predicates = [HasAVX10_2] in { + defm BF16Z : avx512_fp14_p<opc, 
!strconcat(OpcodeStr, "bf16"), + OpNode, sched.ZMM, v32bf16_info>, + EVEX_V512; defm BF16Z128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"), OpNode, sched.XMM, v8bf16x_info>, EVEX_V128; @@ -1146,10 +1126,9 @@ defm VGETEXP : avx10_fp14_bf16<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>, // VSCALEFBF16 multiclass avx10_fp_scalef_bf16<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8bf16x_info>, EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PS; defm Z256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16bf16x_info>, @@ -1164,10 +1143,9 @@ defm VSCALEFBF16 : avx10_fp_scalef_bf16<0x2C, "vscalef", SchedWriteFAdd>; multiclass avx10_common_unary_fp_packed_imm_bf16<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDPatternOperator OpNode, SDPatternOperator MaskOpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, sched.ZMM, _.info512>, EVEX_V512; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, sched.XMM, _.info128>, EVEX_V128; defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, @@ -1190,11 +1168,10 @@ defm VGETMANTBF16 : avx10_common_unary_fp_packed_imm_bf16<"vgetmant", avx512vl_b // VFPCLASSBF16 multiclass avx10_fp_fpclass_bf16<string OpcodeStr, bits<8> opcVec, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_vector_fpclass<opcVec, OpcodeStr, sched.ZMM, avx512vl_bf16_info.info512, "z", []<Register>>, EVEX_V512; - let Predicates = [HasAVX10_2] in { defm Z128 : 
avx512_vector_fpclass<opcVec, OpcodeStr, sched.XMM, avx512vl_bf16_info.info128, "x", []<Register>>, EVEX_V128; @@ -1211,11 +1188,10 @@ defm VFPCLASSBF16 : avx10_fp_fpclass_bf16<"vfpclass", 0x66, SchedWriteFCmp>, multiclass avx10_fma3p_213_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS, EVEX_CD8<16, CD8VF>; @@ -1239,11 +1215,10 @@ defm VFNMSUB213BF16 : avx10_fma3p_213_bf16<0xAE, "vfnmsub213bf16", X86any_Fnmsub multiclass avx10_fma3p_231_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS, EVEX_CD8<16, CD8VF>; @@ -1267,11 +1242,10 @@ defm VFNMSUB231BF16 : avx10_fma3p_231_bf16<0xBE, "vfnmsub231bf16", X86any_Fnmsub multiclass avx10_fma3p_132_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS, EVEX_CD8<16, CD8VF>; @@ 
-1440,9 +1414,8 @@ multiclass vmovrs_p<bits<8> opc, string OpStr, X86VectorVTInfo _> { } multiclass vmovrs_p_vl<bits<8> opc, string OpStr, AVX512VLVectorVTInfo _Vec> { - let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in - defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512; let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in { + defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512; defm Z128 : vmovrs_p<opc, OpStr, _Vec.info128>, EVEX_V128; defm Z256 : vmovrs_p<opc, OpStr, _Vec.info256>, EVEX_V256; } @@ -1464,7 +1437,7 @@ multiclass avx10_sm4_base<string OpStr> { defm Z128 : SM4_Base<OpStr, VR128X, "128", loadv4i32, i128mem>, EVEX_V128; defm Z256 : SM4_Base<OpStr, VR256X, "256", loadv8i32, i256mem>, EVEX_V256; } - let Predicates = [HasSM4, HasAVX10_2_512] in + let Predicates = [HasSM4, HasAVX10_2] in defm Z : SM4_Base<OpStr, VR512, "512", loadv16i32, i512mem>, EVEX_V512; } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 0ab94cca4142..3401f6f04800 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -834,7 +834,7 @@ defm : vextract_for_size_lowering<"VEXTRACTF64X4Z", v32bf16_info, v16bf16x_info, // A 128-bit extract from bits [255:128] of a 512-bit vector should use a // smaller extract to enable EVEX->VEX. 
-let Predicates = [NoVLX, HasEVEX512] in { +let Predicates = [NoVLX] in { def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))), (v2i64 (VEXTRACTI128rri (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)), @@ -3088,7 +3088,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>; } -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>; @@ -3119,7 +3119,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>; } -let Predicates = [HasBWI, NoVLX, HasEVEX512] in { +let Predicates = [HasBWI, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>; @@ -3513,7 +3513,7 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow, // Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't // available. Use a 512-bit operation and extract. 
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>; defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>; defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>; @@ -3525,7 +3525,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>; } -let Predicates = [HasBWI, NoVLX, HasEVEX512] in { +let Predicates = [HasBWI, NoVLX] in { defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>; defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>; @@ -5021,8 +5021,8 @@ defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin, defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin, SchedWriteVecALU, HasAVX512, 1>, T8; -// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX, HasEVEX512. -let Predicates = [HasDQI, NoVLX, HasEVEX512] in { +// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. +let Predicates = [HasDQI, NoVLX] in { def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), (EXTRACT_SUBREG (VPMULLQZrr @@ -5078,7 +5078,7 @@ multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> { sub_xmm)>; } -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { defm : avx512_min_max_lowering<"VPMAXUQZ", umax>; defm : avx512_min_max_lowering<"VPMINUQZ", umin>; defm : avx512_min_max_lowering<"VPMAXSQZ", smax>; @@ -6055,7 +6055,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SchedWriteVecShift>; // Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX. 
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))), (EXTRACT_SUBREG (v8i64 (VPSRAQZrr @@ -6184,14 +6184,14 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecS defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>; -defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX, HasEVEX512]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX, HasEVEX512]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX, HasEVEX512]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX, HasEVEX512]>; +defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>; // Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLVQZrr @@ -6242,7 +6242,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { } // Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORVQZrr @@ -9933,7 +9933,7 @@ defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, truncstore_us_vi8, masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>; -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))), (v8i16 (EXTRACT_SUBREG (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), @@ -9944,7 +9944,7 @@ def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))), VR256X:$src, sub_ymm)))), sub_xmm))>; } -let Predicates = [HasBWI, NoVLX, HasEVEX512] in { +let Predicates = [HasBWI, NoVLX] in { def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src, sub_ymm))), sub_xmm))>; @@ -10487,7 +10487,7 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; } - let Predicates = [prd, NoVLX, HasEVEX512] in { + let Predicates = [prd, NoVLX] in { defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>; defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>; } @@ -11283,7 +11283,7 @@ defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs, SchedWriteVecALU>; // VPABS: Use 512bit version to implement 128/256 bit in case NoVLX. -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v4i64 (abs VR256X:$src)), (EXTRACT_SUBREG (VPABSQZrr @@ -11299,7 +11299,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { // Use 512bit version to implement 128/256 bit. 
multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> { - let Predicates = [prd, NoVLX, HasEVEX512] in { + let Predicates = [prd, NoVLX] in { def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))), (EXTRACT_SUBREG (!cast<Instruction>(InstrStr # "Zrr") @@ -11918,7 +11918,7 @@ let Predicates = [HasAVX512] in { (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; } -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v16i8 (vnot VR128X:$src)), (EXTRACT_SUBREG (VPTERNLOGQZrri diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 927b2c8b22f0..5a0df058b27f 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1326,7 +1326,11 @@ def : Pat<(X86imp_call (i64 tglobaladdr:$dst)), // Match an X86tcret that uses less than 7 volatile registers. def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>, - Requires<[Not64BitMode, NotUseIndirectThunkCalls]>; + Requires<[Not64BitMode, IsNotHiPECCFunc, NotUseIndirectThunkCalls]>; + +def : Pat<(X86tcret GR32:$dst, timm:$off), + (TCRETURN_HIPE32ri GR32:$dst, timm:$off)>, + Requires<[Not64BitMode, IsHiPECCFunc, NotUseIndirectThunkCalls]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a @@ -1346,7 +1350,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>, - Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>; + Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>; + +def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off), + (TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>, + Requires<[IsWin64CCFunc, 
NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>; def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>, diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index 22253bf0413a..139aedd473eb 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -282,6 +282,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, []>, Sched<[WriteJump]>; def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, Sched<[WriteJump]>; + + def TCRETURN_HIPE32ri : PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>; + let mayLoad = 1 in def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset), []>, Sched<[WriteJumpLd]>; @@ -357,6 +361,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, def TCRETURNri64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, Sched<[WriteJump]>; + def TCRETURN_WIN64ri : PseudoI<(outs), (ins GR64_TCW64:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>; + def TCRETURNri64_ImpCall : PseudoI<(outs), (ins GR64_A:$dst, i32imm:$offset), []>, Sched<[WriteJump]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index abf365eedec3..a68edf4d2b7e 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -83,8 +83,9 @@ static cl::opt<unsigned> UndefRegClearance( // Pin the vtable to this file. void X86InstrInfo::anchor() {} -X86InstrInfo::X86InstrInfo(X86Subtarget &STI) - : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 +X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) + : X86GenInstrInfo(STI, + (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32), @@ -4399,13 +4400,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) { if (STI.hasFP16()) return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; if (Load) - return STI.hasAVX512() ? X86::VMOVSSZrm - : STI.hasAVX() ? X86::VMOVSSrm - : X86::MOVSSrm; - else - return STI.hasAVX512() ? X86::VMOVSSZmr - : STI.hasAVX() ? X86::VMOVSSmr - : X86::MOVSSmr; + return X86::MOVSHPrm; + return X86::MOVSHPmr; } static unsigned getLoadStoreRegOpcode(Register Reg, @@ -4903,6 +4899,16 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, CmpMask = ~0; CmpValue = 0; return true; + case X86::TEST64ri32: + case X86::TEST32ri: + case X86::TEST16ri: + case X86::TEST8ri: + SrcReg = MI.getOperand(0).getReg(); + SrcReg2 = 0; + // Force identical compare. + CmpMask = 0; + CmpValue = 0; + return true; } return false; } @@ -4942,6 +4948,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, case X86::CMP32ri: case X86::CMP16ri: case X86::CMP8ri: + case X86::TEST64ri32: + case X86::TEST32ri: + case X86::TEST16ri: + case X86::TEST8ri: CASE_ND(SUB64ri32) CASE_ND(SUB32ri) CASE_ND(SUB16ri) @@ -6131,6 +6141,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { return true; } +static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI, + const TargetInstrInfo &TII, bool HasAVX) { + unsigned NewOpc; + if (MI.getOpcode() == X86::MOVSHPrm) { + NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; + Register Reg = MI.getOperand(0).getReg(); + if (Reg > X86::XMM15) + NewOpc = X86::VMOVSSZrm; + } else { + NewOpc = HasAVX ? 
X86::VMOVSSmr : X86::MOVSSmr; + Register Reg = MI.getOperand(5).getReg(); + if (Reg > X86::XMM15) + NewOpc = X86::VMOVSSZmr; + } + + MIB->setDesc(TII.get(NewOpc)); + return true; +} + bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); @@ -6203,6 +6232,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } + case X86::MOVSHPmr: + case X86::MOVSHPrm: + return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX()); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 9dc5f4b0e086..f087b7f20ff6 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -222,7 +222,7 @@ inline static bool isMemInstrWithGOTPCREL(const MachineInstr &MI) { } class X86InstrInfo final : public X86GenInstrInfo { - X86Subtarget &Subtarget; + const X86Subtarget &Subtarget; const X86RegisterInfo RI; LLVM_DECLARE_VIRTUAL_ANCHOR_FUNCTION(); @@ -238,7 +238,7 @@ class X86InstrInfo final : public X86GenInstrInfo { bool MakeChange) const; public: - explicit X86InstrInfo(X86Subtarget &STI); + explicit X86InstrInfo(const X86Subtarget &STI); /// Given a machine instruction descriptor, returns the register /// class constraint for OpNum, or NULL. 
Returned register class diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index df1541e9085b..8339c2081842 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -69,11 +69,8 @@ def NoAVX : Predicate<"!Subtarget->hasAVX()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; -def HasEVEX512 : Predicate<"Subtarget->hasEVEX512()">; def HasAVX10_1 : Predicate<"Subtarget->hasAVX10_1()">; -def HasAVX10_1_512 : Predicate<"Subtarget->hasAVX10_1_512()">; def HasAVX10_2 : Predicate<"Subtarget->hasAVX10_2()">; -def HasAVX10_2_512 : Predicate<"Subtarget->hasAVX10_2_512()">; def NoAVX10_2 : Predicate<"!Subtarget->hasAVX10_2()">; def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; @@ -233,6 +230,13 @@ let RecomputePerFunction = 1 in { "!Subtarget->hasSSE41()">; def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">; def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">; + + def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">; + def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">; + def IsHiPECCFunc : Predicate<"MF->getFunction().getCallingConv() == CallingConv::HiPE">; + + def IsNotHiPECCFunc : Predicate< + "MF->getFunction().getCallingConv() != CallingConv::HiPE">; } def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 1acc0cd8da20..b7926497c92b 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -267,6 
+267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, } } +// pseudo instruction for fp16 spilling. +let isPseudo = 1, Predicates = [HasSSE2] in { + let mayStore = 1 in + def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "", + [], SSEPackedSingle>, + Sched<[WriteFStore]>; + let mayLoad = 1 in + def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "", + [], SSEPackedSingle>, + Sched<[WriteFLoad]>; +} + defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", SSEPackedSingle, UseSSE1>, TB, XS; defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 632db7e4326e..4188487d7591 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -825,7 +825,8 @@ bool X86TargetLowering::lowerInterleavedLoad( bool X86TargetLowering::lowerInterleavedStore(Instruction *Store, Value *LaneMask, ShuffleVectorInst *SVI, - unsigned Factor) const { + unsigned Factor, + const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -836,7 +837,8 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *Store, auto *SI = dyn_cast<StoreInst>(Store); if (!SI) return false; - assert(!LaneMask && "Unexpected mask on store"); + assert(!LaneMask && GapMask.popcount() == Factor && + "Unexpected mask on store"); // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. 
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 595ad3290eed..9ec04e740a08 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -204,15 +204,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, // we can still use 64-bit register as long as we know the high bits // are zeros. // Reflect that in the returned register class. - if (Is64Bit) { - // When the target also allows 64-bit frame pointer and we do have a - // frame, this is fine to use it for the address accesses as well. - const X86FrameLowering *TFI = getFrameLowering(MF); - return TFI->hasFP(MF) && TFI->Uses64BitFramePtr - ? &X86::LOW32_ADDR_ACCESS_RBPRegClass - : &X86::LOW32_ADDR_ACCESSRegClass; - } - return &X86::GR32RegClass; + return Is64Bit ? &X86::LOW32_ADDR_ACCESSRegClass : &X86::GR32RegClass; case 1: // Normal GPRs except the stack pointer (for encoding reasons). if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; @@ -228,25 +220,11 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, // NOSP does not contain RIP, so no special case here. return &X86::GR32_NOREX_NOSPRegClass; case 4: // Available for tailcall (not callee-saved GPRs). - return getGPRsForTailCall(MF); + return Is64Bit ? 
&X86::GR64_TCRegClass : &X86::GR32_TCRegClass; } } const TargetRegisterClass * -X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - if (IsWin64 || IsUEFI64 || (F.getCallingConv() == CallingConv::Win64)) - return &X86::GR64_TCW64RegClass; - else if (Is64Bit) - return &X86::GR64_TCRegClass; - - bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE); - if (hasHipeCC) - return &X86::GR32RegClass; - return &X86::GR32_TCRegClass; -} - -const TargetRegisterClass * X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &X86::CCRRegClass) { if (Is64Bit) @@ -1007,11 +985,10 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned X86RegisterInfo::findDeadCallerSavedReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const { const MachineFunction *MF = MBB.getParent(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); if (MF->callsEHReturn()) return 0; - const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF); - if (MBBI == MBB.end()) return 0; @@ -1026,6 +1003,8 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg( case X86::RETI64: case X86::TCRETURNdi: case X86::TCRETURNri: + case X86::TCRETURN_WIN64ri: + case X86::TCRETURN_HIPE32ri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: @@ -1033,20 +1012,16 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg( case X86::TCRETURNmi64: case X86::EH_RETURN: case X86::EH_RETURN64: { - SmallSet<uint16_t, 8> Uses; - for (MachineOperand &MO : MBBI->operands()) { - if (!MO.isReg() || MO.isDef()) - continue; - Register Reg = MO.getReg(); - if (!Reg) - continue; - for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) - Uses.insert(*AI); + LiveRegUnits LRU(*this); + LRU.addLiveOuts(MBB); + LRU.stepBackward(*MBBI); + + const TargetRegisterClass &RC = + Is64Bit ? 
X86::GR64_NOSPRegClass : X86::GR32_NOSPRegClass; + for (MCRegister Reg : RC) { + if (LRU.available(Reg) && !MRI.isReserved(Reg)) + return Reg; } - - for (auto CS : AvailableRegs) - if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP) - return CS; } } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index 2f4c55cfad6d..d022e5ab8794 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -87,11 +87,6 @@ public: const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - /// getGPRsForTailCall - Returns a register class with registers that can be - /// used in forming tail calls. - const TargetRegisterClass * - getGPRsForTailCall(const MachineFunction &MF) const; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index e9ca25d808a5..99b7910131dc 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -716,10 +716,7 @@ def GR64_NOREX2_NOSP : RegisterClass<"X86", [i64], 64, // which we do not have right now. def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>; -// When RBP is used as a base pointer in a 32-bit addresses environment, -// this is also safe to use the full register to access addresses. -// Since RBP will never be spilled, stick to a 32 alignment to save -// on memory consumption. 
+// FIXME: This is unused, but deleting it results in codegen changes def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32, (add LOW32_ADDR_ACCESS, RBP)>; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td index 9e271c1ee370..044b77f7aacf 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -992,14 +992,14 @@ def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> { def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>; def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1); } def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>; def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0); } @@ -1221,7 +1221,7 @@ def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> { def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>; def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG1rr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0); } @@ -1235,7 +1235,7 @@ def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> { def : 
InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>; def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0); } @@ -1249,7 +1249,7 @@ def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> { def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>; def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG1rr.Latency); let ReleaseAtCycles = [1, 1, 3]; let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0); } @@ -1263,7 +1263,7 @@ def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> { def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>; def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG2rr.Latency); let ReleaseAtCycles = [1, 1, 8]; let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1); } @@ -1338,14 +1338,14 @@ def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> { def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>; def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0); } def : InstRW<[Zn3WriteVPERM2F128rm], (instrs 
VPERM2F128rmi)>; def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> { - let Latency = !add(Znver3Model.LoadLatency, 7); + let Latency = !add(Znver3Model.VecLoadLatency, 7); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = 3; } @@ -1359,14 +1359,14 @@ def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> { def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>; def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERMYri.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1); } def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>; def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> { - let Latency = !add(Znver3Model.LoadLatency, 5); + let Latency = !add(Znver3Model.VecLoadLatency, 5); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = 2; } diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td index 74d916d41f83..a93c7e3a82f1 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver4.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td @@ -1005,14 +1005,14 @@ def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> { def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>; def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1); } def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>; def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> { - let Latency = 
!add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0); } @@ -1262,7 +1262,7 @@ def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> { def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>; def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG1rr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0); } @@ -1276,7 +1276,7 @@ def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> { def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>; def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0); } @@ -1290,7 +1290,7 @@ def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> { def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>; def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG1rr.Latency); let ReleaseAtCycles = [1, 1, 3]; let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0); } @@ -1304,7 +1304,7 @@ def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> { def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>; def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { - let Latency = 
!add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG2rr.Latency); let ReleaseAtCycles = [1, 1, 8]; let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1); } @@ -1379,7 +1379,7 @@ def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>; def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0); } @@ -1393,7 +1393,7 @@ def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>; def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1); } @@ -1407,7 +1407,7 @@ def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>; def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1); } @@ -1421,7 +1421,7 @@ def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>; def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency); + let Latency = 
!add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0); } @@ -1534,9 +1534,9 @@ def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> { let NumMicroOps = 1; } def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex - "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", + "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)", - "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz" + "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz" )>; // SCALE & REDUCE instructions @@ -1567,7 +1567,7 @@ def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> { let NumMicroOps = 1; } def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex - "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)", + "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)", "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)" )>; @@ -1586,7 +1586,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)", "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)", "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)", - "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int" + "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int" )>; def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> { @@ -1598,24 +1598,40 @@ def : InstRW<[Zn4WriteSHIFTri], (instregex "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)" )>; -// ALIGN Instructions -def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> { +// ALIGNR Instructions +def Zn4WriteALIGNR: SchedWriteRes<[Zn4FPFMisc12]> { + let Latency = 2; + let ReleaseAtCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteALIGNR], (instregex + "(V?)PALIGNR(Y?|Z128?|Z256?)(rri|rrik|rrikz)" + )>; +def Zn4WriteALIGNRZ: SchedWriteRes<[Zn4FPFMisc12]> { let Latency = 2; let 
ReleaseAtCycles = [2]; let NumMicroOps = 1; } -def : InstRW<[Zn4WriteALIGN], (instregex - "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)" +def : InstRW<[Zn4WriteALIGNRZ], (instregex + "(V?)PALIGNRZ(rri|rrik|rrikz)" )>; -//PACK Instructions +// PACK Instructions def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> { let Latency = 2; - let ReleaseAtCycles = [2]; + let ReleaseAtCycles = [1]; let NumMicroOps = 1; } def : InstRW<[Zn4WritePACK], (instregex - "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)" + "(V?)PACK(SS|US)(DW|WB)(Y?|Z128?|Z256?)(rr|rrk|rrkz)" + )>; +def Zn4WritePACKZ: SchedWriteRes<[Zn4FPFMisc12]> { + let Latency = 2; + let ReleaseAtCycles = [2]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WritePACKZ], (instregex + "(V?)PACK(SS|US)(DW|WB)Z(rr|rrk|rrkz)" )>; // MAX and MIN Instructions diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 8ad8d423d10c..fd5f34b60efb 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -261,26 +261,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, if (!FS.empty()) FullFS = (Twine(FullFS) + "," + FS).str(); - // Attach EVEX512 feature when we have AVX512 features with a default CPU. - // "pentium4" is default CPU for 32-bit targets. - // "x86-64" is default CPU for 64-bit targets. - if (CPU == "generic" || CPU == "pentium4" || CPU == "x86-64") { - size_t posNoEVEX512 = FS.rfind("-evex512"); - // Make sure we won't be cheated by "-avx512fp16". - size_t posNoAVX512F = - FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,"); - size_t posEVEX512 = FS.rfind("+evex512"); - // Any AVX512XXX will enable AVX512F. 
- size_t posAVX512F = FS.rfind("+avx512"); - - if (posAVX512F != StringRef::npos && - (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F)) - if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos) - FullFS += ",+evex512"; - } - // Disable 64-bit only features in non-64-bit mode. - SmallVector<StringRef, 9> FeaturesIn64BitOnly = { + StringRef FeaturesIn64BitOnly[] = { "egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"}; if (FullFS.find("-64bit-mode") != std::string::npos) for (StringRef F : FeaturesIn64BitOnly) diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index be49214e041e..fa3f3b59741d 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -226,8 +226,7 @@ public: // TODO: Currently we're always allowing widening on CPUs without VLX, // because for many cases we don't have a better option. bool canExtendTo512DQ() const { - return hasAVX512() && hasEVEX512() && - (!hasVLX() || getPreferVectorWidth() >= 512); + return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512); } bool canExtendTo512BW() const { return hasBWI() && canExtendTo512DQ(); @@ -247,8 +246,7 @@ public: // If there are no 512-bit vectors and we prefer not to use 512-bit registers, // disable them in the legalizer. bool useAVX512Regs() const { - return hasAVX512() && hasEVEX512() && - (canExtendTo512DQ() || RequiredVectorWidth > 256); + return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256); } bool useLight256BitInstructions() const { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 62f95277d016..3d8d0a236a3c 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -213,7 +213,7 @@ X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { case TargetTransformInfo::RGK_Scalar: return TypeSize::getFixed(ST->is64Bit() ? 
64 : 32); case TargetTransformInfo::RGK_FixedWidthVector: - if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512) + if (ST->hasAVX512() && PreferVectorWidth >= 512) return TypeSize::getFixed(512); if (ST->hasAVX() && PreferVectorWidth >= 256) return TypeSize::getFixed(256); @@ -1206,6 +1206,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } }, + { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split + { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps @@ -6591,7 +6593,7 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { // Only enable vector loads for equality comparison. Right now the vector // version is not as fast for three way compare (see #33329). const unsigned PreferredWidth = ST->getPreferVectorWidth(); - if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512()) + if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp index ea8b88f41bb8..9bf0abb018c9 100644 --- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp +++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp @@ -105,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { // Prolog information. SmallVector<int64_t> PushedRegs; bool HasStackAlloc = false; + bool HasSetFrame = false; unsigned ApproximatePrologCodeCount = 0; // Requested changes. 
@@ -130,15 +131,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { break; case X86::SEH_StackAlloc: - case X86::SEH_SetFrame: if (State != FunctionState::InProlog) - llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog"); + llvm_unreachable("SEH_StackAlloc outside of prolog"); // Assume a large alloc... - ApproximatePrologCodeCount += - (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1; + ApproximatePrologCodeCount += 3; HasStackAlloc = true; break; + case X86::SEH_SetFrame: + if (State != FunctionState::InProlog) + llvm_unreachable("SEH_SetFrame outside of prolog"); + ApproximatePrologCodeCount++; + HasSetFrame = true; + break; + case X86::SEH_SaveReg: case X86::SEH_SaveXMM: if (State != FunctionState::InProlog) @@ -190,8 +196,30 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { State = FunctionState::FinishedEpilog; break; - case X86::LEA64r: case X86::MOV64rr: + if (State == FunctionState::InEpilog) { + // If the prolog contains a stack allocation, then the first + // instruction in the epilog must be to adjust the stack pointer. 
+ if (!HasSetFrame) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is setting frame back, but prolog did not set it"); + if (PoppedRegCount > 0) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is setting the frame back after popping " + "registers"); + if (HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "Cannot set the frame back after the stack " + "allocation has been deallocated"); + } else if (State == FunctionState::FinishedEpilog) + return rejectCurrentFunctionInternalError( + MF, Mode, "Unexpected mov instruction after the epilog"); + break; + + case X86::LEA64r: case X86::ADD64ri32: if (State == FunctionState::InEpilog) { // If the prolog contains a stack allocation, then the first @@ -211,8 +239,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { HasStackDealloc = true; } else if (State == FunctionState::FinishedEpilog) return rejectCurrentFunctionInternalError( - MF, Mode, - "Unexpected lea, mov or add instruction after the epilog"); + MF, Mode, "Unexpected lea or add instruction after the epilog"); break; case X86::POP64r: @@ -278,11 +305,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { } } - if (UnwindV2StartLocations.empty()) { - assert(State == FunctionState::InProlog && - "If there are no epilogs, then there should be no prolog"); + if (UnwindV2StartLocations.empty()) return false; - } MachineBasicBlock &FirstMBB = MF.front(); // Assume +1 for the "header" UOP_Epilog that contains the epilog size, and diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index 6921f44b700c..096ad08d8a3c 100644 --- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -71,113 +71,11 @@ static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address, static unsigned getReg(const MCDisassembler *D, 
unsigned RC, unsigned RegNo) { const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo(); - return *(RegInfo->getRegClass(RC).begin() + RegNo); + return RegInfo->getRegClass(RC).getRegister(RegNo); } static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus Decode2RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus Decode2RImmInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeR2RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeRUSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus -DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeL2RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler 
*Decoder); - -static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeL6RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeL5RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus -DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const MCDisassembler *Decoder); - -#include "XCoreGenDisassemblerTables.inc" - -static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const MCDisassembler *Decoder) { if (RegNo > 11) return MCDisassembler::Fail; @@ -249,6 +147,116 @@ Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2, return MCDisassembler::Success; } +static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, 
Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + Inst.addOperand(MCOperand::createImm(Op1)); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + Inst.addOperand(MCOperand::createImm(Op3)); + } + return S; +} + +static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeBitpOperand(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, 
Address, Decoder); + } + return S; +} + +static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + Inst.addOperand(MCOperand::createImm(Op3)); + } + return S; +} + +static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeBitpOperand(Inst, Op3, Address, Decoder); + } + return S; +} + + static DecodeStatus Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -511,115 +519,6 @@ static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Op1, Op2, Op3; - DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); - if (S == 
MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); - } - return S; -} - -static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Op1, Op2, Op3; - DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); - if (S == MCDisassembler::Success) { - Inst.addOperand(MCOperand::createImm(Op1)); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); - } - return S; -} - -static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Op1, Op2, Op3; - DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - Inst.addOperand(MCOperand::createImm(Op3)); - } - return S; -} - -static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Op1, Op2, Op3; - DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - DecodeBitpOperand(Inst, Op3, Address, Decoder); - } - return S; -} - -static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Op1, Op2, Op3; - DecodeStatus S = - Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); - } 
- return S; -} - -static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Op1, Op2, Op3; - DecodeStatus S = - Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); - } - return S; -} - -static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Op1, Op2, Op3; - DecodeStatus S = - Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - Inst.addOperand(MCOperand::createImm(Op3)); - } - return S; -} - -static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - unsigned Op1, Op2, Op3; - DecodeStatus S = - Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - DecodeBitpOperand(Inst, Op3, Address, Decoder); - } - return S; -} - static DecodeStatus DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -713,6 +612,8 @@ DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, return S; } +#include "XCoreGenDisassemblerTables.inc" + MCDisassembler::DecodeStatus XCoreDisassembler::getInstruction(MCInst &instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp 
b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index 0a86588b6bdb..1a9133aad4dd 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -12,6 +12,7 @@ #include "XCoreInstrInfo.h" #include "XCore.h" +#include "XCoreSubtarget.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -41,10 +42,9 @@ namespace XCore { // Pin the vtable to this file. void XCoreInstrInfo::anchor() {} -XCoreInstrInfo::XCoreInstrInfo() - : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), - RI() { -} +XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST) + : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), + RI() {} static bool isZeroImm(const MachineOperand &op) { return op.isImm() && op.getImm() == 0; diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h index 5026671616fa..354339265378 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.h +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h @@ -20,12 +20,13 @@ #include "XCoreGenInstrInfo.inc" namespace llvm { +class XCoreSubtarget; class XCoreInstrInfo : public XCoreGenInstrInfo { const XCoreRegisterInfo RI; virtual void anchor(); public: - XCoreInstrInfo(); + explicit XCoreInstrInfo(const XCoreSubtarget &ST); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. 
As /// such, whenever a client has an instance of instruction info, it should diff --git a/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/llvm/lib/Target/XCore/XCoreSubtarget.cpp index d4b777ef447f..2f6517ec9e7a 100644 --- a/llvm/lib/Target/XCore/XCoreSubtarget.cpp +++ b/llvm/lib/Target/XCore/XCoreSubtarget.cpp @@ -26,5 +26,5 @@ void XCoreSubtarget::anchor() { } XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) - : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(*this), - TLInfo(TM, *this) {} + : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(*this), + FrameLowering(*this), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index f1367037bdf4..c211777e6989 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -232,12 +232,6 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - - setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); - setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); } else { setOperationAction(ISD::BITCAST, MVT::i32, Expand); setOperationAction(ISD::BITCAST, MVT::f32, Expand); @@ -887,6 +881,16 @@ static std::pair<unsigned, unsigned> getFPBranchKind(ISD::CondCode Cond) { return std::make_pair(Xtensa::BF, Xtensa::OLT_S); case ISD::SETGT: return std::make_pair(Xtensa::BF, Xtensa::OLE_S); + case ISD::SETOGT: + return std::make_pair(Xtensa::BF, Xtensa::ULE_S); + case ISD::SETOGE: + return std::make_pair(Xtensa::BF, Xtensa::ULT_S); + case ISD::SETONE: + return std::make_pair(Xtensa::BF, 
Xtensa::UEQ_S); + case ISD::SETUGT: + return std::make_pair(Xtensa::BF, Xtensa::OLE_S); + case ISD::SETUGE: + return std::make_pair(Xtensa::BF, Xtensa::OLT_S); default: llvm_unreachable("Invalid condition!"); } diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index 55c0729a0c9e..b0f924f2cd58 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -48,7 +48,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) { } XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI) - : XtensaGenInstrInfo(Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP), + : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP), RI(STI), STI(STI) {} Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 24827537eb19..63848160636a 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1396,7 +1396,6 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(X86::FEATURE_BMI2); if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) { setFeature(X86::FEATURE_AVX512F); - setFeature(X86::FEATURE_EVEX512); } if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save) setFeature(X86::FEATURE_AVX512DQ); @@ -2063,8 +2062,6 @@ StringMap<bool> sys::getHostCPUFeatures() { Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1); // AVX512 is only supported if the OS supports the context save for it. 
Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save; - if (Features["avx512f"]) - Features["evex512"] = true; Features["avx512dq"] = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save; Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1); Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1); @@ -2176,11 +2173,8 @@ StringMap<bool> sys::getHostCPUFeatures() { MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX); int AVX10Ver = HasLeaf24 && (EBX & 0xff); - int Has512Len = HasLeaf24 && ((EBX >> 18) & 1); - Features["avx10.1-256"] = HasAVX10 && AVX10Ver >= 1; - Features["avx10.1-512"] = HasAVX10 && AVX10Ver >= 1 && Has512Len; - Features["avx10.2-256"] = HasAVX10 && AVX10Ver >= 2; - Features["avx10.2-512"] = HasAVX10 && AVX10Ver >= 2 && Has512Len; + Features["avx10.1"] = HasAVX10 && AVX10Ver >= 1; + Features["avx10.2"] = HasAVX10 && AVX10Ver >= 2; return Features; } diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp index 9957ec0c28d8..b53a1b95431a 100644 --- a/llvm/lib/TargetParser/RISCVTargetParser.cpp +++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp @@ -153,12 +153,13 @@ namespace RISCVVType { // // Bits | Name | Description // -----+------------+------------------------------------------------ +// 8 | altfmt | Alternative format for bf16 // 7 | vma | Vector mask agnostic // 6 | vta | Vector tail agnostic // 5:3 | vsew[2:0] | Standard element width (SEW) setting // 2:0 | vlmul[2:0] | Vector register group multiplier (LMUL) setting unsigned encodeVTYPE(VLMUL VLMul, unsigned SEW, bool TailAgnostic, - bool MaskAgnostic) { + bool MaskAgnostic, bool AltFmt) { assert(isValidSEW(SEW) && "Invalid SEW"); unsigned VLMulBits = static_cast<unsigned>(VLMul); unsigned VSEWBits = encodeSEW(SEW); @@ -167,6 +168,8 @@ unsigned encodeVTYPE(VLMUL VLMul, unsigned SEW, bool TailAgnostic, VTypeI |= 0x40; if (MaskAgnostic) VTypeI |= 0x80; + if (AltFmt) + VTypeI |= 0x100; return VTypeI; } @@ -200,6 +203,10 @@ void 
printVType(unsigned VType, raw_ostream &OS) { unsigned Sew = getSEW(VType); OS << "e" << Sew; + bool AltFmt = RISCVVType::isAltFmt(VType); + if (AltFmt) + OS << "alt"; + unsigned LMul; bool Fractional; std::tie(LMul, Fractional) = decodeVLMUL(getVLMUL(VType)); diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 50b97d325754..2194ef4df14d 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -364,8 +364,326 @@ StringRef AMDGPU::getCanonicalArchName(const Triple &T, StringRef Arch) { return T.isAMDGCN() ? getArchNameAMDGCN(ProcKind) : getArchNameR600(ProcKind); } -void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, - StringMap<bool> &Features) { +static std::pair<FeatureError, StringRef> +insertWaveSizeFeature(StringRef GPU, const Triple &T, + const StringMap<bool> &DefaultFeatures, + StringMap<bool> &Features) { + const bool IsNullGPU = GPU.empty(); + const bool TargetHasWave32 = DefaultFeatures.count("wavefrontsize32"); + const bool TargetHasWave64 = DefaultFeatures.count("wavefrontsize64"); + const bool HaveWave32 = Features.count("wavefrontsize32"); + const bool HaveWave64 = Features.count("wavefrontsize64"); + if (HaveWave32 && HaveWave64) + return {AMDGPU::INVALID_FEATURE_COMBINATION, + "'wavefrontsize32' and 'wavefrontsize64' are mutually exclusive"}; + + if (HaveWave32 && !IsNullGPU && TargetHasWave64) + return {AMDGPU::UNSUPPORTED_TARGET_FEATURE, "wavefrontsize32"}; + + if (HaveWave64 && !IsNullGPU && TargetHasWave32) + return {AMDGPU::UNSUPPORTED_TARGET_FEATURE, "wavefrontsize64"}; + + // Don't assume any wavesize with an unknown subtarget. + // Default to wave32 if target supports both. 
+ if (!IsNullGPU && !HaveWave32 && !HaveWave64 && !TargetHasWave32 && + !TargetHasWave64) + Features.insert(std::make_pair("wavefrontsize32", true)); + + for (const auto &Entry : DefaultFeatures) { + if (!Features.count(Entry.getKey())) + Features[Entry.getKey()] = Entry.getValue(); + } + + return {NO_ERROR, StringRef()}; +} + +/// Fills Features map with default values for given target GPU. +/// \p Features contains overriding target features and this function returns +/// default target features with entries overridden by \p Features. +static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T, + StringMap<bool> &Features) { + AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU); + switch (Kind) { + case GK_GFX1250: + Features["ci-insts"] = true; + Features["dot7-insts"] = true; + Features["dot8-insts"] = true; + Features["dl-insts"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["gfx8-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx10-insts"] = true; + Features["gfx10-3-insts"] = true; + Features["gfx11-insts"] = true; + Features["gfx12-insts"] = true; + Features["gfx1250-insts"] = true; + Features["bitop3-insts"] = true; + Features["prng-inst"] = true; + Features["tanh-insts"] = true; + Features["tensor-cvt-lut-insts"] = true; + Features["transpose-load-f4f6-insts"] = true; + Features["bf16-trans-insts"] = true; + Features["bf16-cvt-insts"] = true; + Features["fp8-conversion-insts"] = true; + Features["fp8e5m3-insts"] = true; + Features["permlane16-swap"] = true; + Features["ashr-pk-insts"] = true; + Features["atomic-buffer-pk-add-bf16-inst"] = true; + Features["vmem-pref-insts"] = true; + Features["atomic-fadd-rtn-insts"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; + Features["atomic-flat-pk-add-16-insts"] = true; + Features["atomic-global-pk-add-bf16-inst"] = true; + Features["atomic-ds-pk-add-16-insts"] = true; + Features["setprio-inc-wg-inst"] = true; + 
Features["atomic-fmin-fmax-global-f32"] = true; + Features["atomic-fmin-fmax-global-f64"] = true; + Features["wavefrontsize32"] = true; + break; + case GK_GFX1201: + case GK_GFX1200: + case GK_GFX12_GENERIC: + Features["ci-insts"] = true; + Features["dot7-insts"] = true; + Features["dot8-insts"] = true; + Features["dot9-insts"] = true; + Features["dot10-insts"] = true; + Features["dot11-insts"] = true; + Features["dot12-insts"] = true; + Features["dl-insts"] = true; + Features["atomic-ds-pk-add-16-insts"] = true; + Features["atomic-flat-pk-add-16-insts"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; + Features["atomic-buffer-pk-add-bf16-inst"] = true; + Features["atomic-global-pk-add-bf16-inst"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["gfx8-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx10-insts"] = true; + Features["gfx10-3-insts"] = true; + Features["gfx11-insts"] = true; + Features["gfx12-insts"] = true; + Features["atomic-fadd-rtn-insts"] = true; + Features["image-insts"] = true; + Features["fp8-conversion-insts"] = true; + Features["atomic-fmin-fmax-global-f32"] = true; + break; + case GK_GFX1153: + case GK_GFX1152: + case GK_GFX1151: + case GK_GFX1150: + case GK_GFX1103: + case GK_GFX1102: + case GK_GFX1101: + case GK_GFX1100: + case GK_GFX11_GENERIC: + Features["ci-insts"] = true; + Features["dot5-insts"] = true; + Features["dot7-insts"] = true; + Features["dot8-insts"] = true; + Features["dot9-insts"] = true; + Features["dot10-insts"] = true; + Features["dot12-insts"] = true; + Features["dl-insts"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["gfx8-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx10-insts"] = true; + Features["gfx10-3-insts"] = true; + Features["gfx11-insts"] = true; + Features["atomic-fadd-rtn-insts"] = true; + Features["image-insts"] = true; + Features["gws"] = true; + Features["atomic-fmin-fmax-global-f32"] = 
true; + break; + case GK_GFX1036: + case GK_GFX1035: + case GK_GFX1034: + case GK_GFX1033: + case GK_GFX1032: + case GK_GFX1031: + case GK_GFX1030: + case GK_GFX10_3_GENERIC: + Features["ci-insts"] = true; + Features["dot1-insts"] = true; + Features["dot2-insts"] = true; + Features["dot5-insts"] = true; + Features["dot6-insts"] = true; + Features["dot7-insts"] = true; + Features["dot10-insts"] = true; + Features["dl-insts"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["gfx8-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx10-insts"] = true; + Features["gfx10-3-insts"] = true; + Features["image-insts"] = true; + Features["s-memrealtime"] = true; + Features["s-memtime-inst"] = true; + Features["gws"] = true; + Features["vmem-to-lds-load-insts"] = true; + Features["atomic-fmin-fmax-global-f32"] = true; + Features["atomic-fmin-fmax-global-f64"] = true; + break; + case GK_GFX1012: + case GK_GFX1011: + Features["dot1-insts"] = true; + Features["dot2-insts"] = true; + Features["dot5-insts"] = true; + Features["dot6-insts"] = true; + Features["dot7-insts"] = true; + Features["dot10-insts"] = true; + [[fallthrough]]; + case GK_GFX1013: + case GK_GFX1010: + case GK_GFX10_1_GENERIC: + Features["dl-insts"] = true; + Features["ci-insts"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["gfx8-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx10-insts"] = true; + Features["image-insts"] = true; + Features["s-memrealtime"] = true; + Features["s-memtime-inst"] = true; + Features["gws"] = true; + Features["vmem-to-lds-load-insts"] = true; + Features["atomic-fmin-fmax-global-f32"] = true; + Features["atomic-fmin-fmax-global-f64"] = true; + break; + case GK_GFX950: + Features["bitop3-insts"] = true; + Features["fp6bf6-cvt-scale-insts"] = true; + Features["fp4-cvt-scale-insts"] = true; + Features["bf8-cvt-scale-insts"] = true; + Features["fp8-cvt-scale-insts"] = true; + 
Features["f16bf16-to-fp6bf6-cvt-scale-insts"] = true; + Features["f32-to-f16bf16-cvt-sr-insts"] = true; + Features["prng-inst"] = true; + Features["permlane16-swap"] = true; + Features["permlane32-swap"] = true; + Features["ashr-pk-insts"] = true; + Features["dot12-insts"] = true; + Features["dot13-insts"] = true; + Features["atomic-buffer-pk-add-bf16-inst"] = true; + Features["gfx950-insts"] = true; + [[fallthrough]]; + case GK_GFX942: + Features["fp8-insts"] = true; + Features["fp8-conversion-insts"] = true; + if (Kind != GK_GFX950) + Features["xf32-insts"] = true; + [[fallthrough]]; + case GK_GFX9_4_GENERIC: + Features["gfx940-insts"] = true; + Features["atomic-ds-pk-add-16-insts"] = true; + Features["atomic-flat-pk-add-16-insts"] = true; + Features["atomic-global-pk-add-bf16-inst"] = true; + Features["gfx90a-insts"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; + Features["atomic-fadd-rtn-insts"] = true; + Features["dot3-insts"] = true; + Features["dot4-insts"] = true; + Features["dot5-insts"] = true; + Features["dot6-insts"] = true; + Features["mai-insts"] = true; + Features["dl-insts"] = true; + Features["dot1-insts"] = true; + Features["dot2-insts"] = true; + Features["dot7-insts"] = true; + Features["dot10-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx8-insts"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["s-memrealtime"] = true; + Features["ci-insts"] = true; + Features["s-memtime-inst"] = true; + Features["gws"] = true; + Features["vmem-to-lds-load-insts"] = true; + Features["atomic-fmin-fmax-global-f64"] = true; + Features["wavefrontsize64"] = true; + break; + case GK_GFX90A: + Features["gfx90a-insts"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; + Features["atomic-fadd-rtn-insts"] = true; + Features["atomic-fmin-fmax-global-f64"] = true; + [[fallthrough]]; + case GK_GFX908: + Features["dot3-insts"] = true; + Features["dot4-insts"] = true; + 
Features["dot5-insts"] = true; + Features["dot6-insts"] = true; + Features["mai-insts"] = true; + [[fallthrough]]; + case GK_GFX906: + Features["dl-insts"] = true; + Features["dot1-insts"] = true; + Features["dot2-insts"] = true; + Features["dot7-insts"] = true; + Features["dot10-insts"] = true; + [[fallthrough]]; + case GK_GFX90C: + case GK_GFX909: + case GK_GFX904: + case GK_GFX902: + case GK_GFX900: + case GK_GFX9_GENERIC: + Features["gfx9-insts"] = true; + Features["vmem-to-lds-load-insts"] = true; + [[fallthrough]]; + case GK_GFX810: + case GK_GFX805: + case GK_GFX803: + case GK_GFX802: + case GK_GFX801: + Features["gfx8-insts"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["s-memrealtime"] = true; + Features["ci-insts"] = true; + Features["image-insts"] = true; + Features["s-memtime-inst"] = true; + Features["gws"] = true; + Features["wavefrontsize64"] = true; + break; + case GK_GFX705: + case GK_GFX704: + case GK_GFX703: + case GK_GFX702: + case GK_GFX701: + case GK_GFX700: + Features["ci-insts"] = true; + [[fallthrough]]; + case GK_GFX602: + case GK_GFX601: + case GK_GFX600: + Features["image-insts"] = true; + Features["s-memtime-inst"] = true; + Features["gws"] = true; + Features["atomic-fmin-fmax-global-f32"] = true; + Features["atomic-fmin-fmax-global-f64"] = true; + Features["wavefrontsize64"] = true; + break; + case GK_NONE: + break; + default: + llvm_unreachable("Unhandled GPU!"); + } +} + +/// Fills Features map with default values for given target GPU. +/// \p Features contains overriding target features and this function returns +/// default target features with entries overridden by \p Features. +std::pair<FeatureError, StringRef> +AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, + StringMap<bool> &Features) { // XXX - What does the member GPU mean if device name string passed here? if (T.isSPIRV() && T.getOS() == Triple::OSType::AMDHSA) { // AMDGCN SPIRV must support the union of all AMDGCN features. 
This list @@ -434,276 +752,9 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["wavefrontsize32"] = true; Features["wavefrontsize64"] = true; } else if (T.isAMDGCN()) { - AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU); - switch (Kind) { - case GK_GFX1250: - Features["ci-insts"] = true; - Features["dot7-insts"] = true; - Features["dot8-insts"] = true; - Features["dl-insts"] = true; - Features["16-bit-insts"] = true; - Features["dpp"] = true; - Features["gfx8-insts"] = true; - Features["gfx9-insts"] = true; - Features["gfx10-insts"] = true; - Features["gfx10-3-insts"] = true; - Features["gfx11-insts"] = true; - Features["gfx12-insts"] = true; - Features["gfx1250-insts"] = true; - Features["bitop3-insts"] = true; - Features["prng-inst"] = true; - Features["tanh-insts"] = true; - Features["tensor-cvt-lut-insts"] = true; - Features["transpose-load-f4f6-insts"] = true; - Features["bf16-trans-insts"] = true; - Features["bf16-cvt-insts"] = true; - Features["fp8-conversion-insts"] = true; - Features["fp8e5m3-insts"] = true; - Features["permlane16-swap"] = true; - Features["ashr-pk-insts"] = true; - Features["atomic-buffer-pk-add-bf16-inst"] = true; - Features["vmem-pref-insts"] = true; - Features["atomic-fadd-rtn-insts"] = true; - Features["atomic-buffer-global-pk-add-f16-insts"] = true; - Features["atomic-flat-pk-add-16-insts"] = true; - Features["atomic-global-pk-add-bf16-inst"] = true; - Features["atomic-ds-pk-add-16-insts"] = true; - Features["setprio-inc-wg-inst"] = true; - Features["atomic-fmin-fmax-global-f32"] = true; - Features["atomic-fmin-fmax-global-f64"] = true; - break; - case GK_GFX1201: - case GK_GFX1200: - case GK_GFX12_GENERIC: - Features["ci-insts"] = true; - Features["dot7-insts"] = true; - Features["dot8-insts"] = true; - Features["dot9-insts"] = true; - Features["dot10-insts"] = true; - Features["dot11-insts"] = true; - Features["dot12-insts"] = true; - Features["dl-insts"] = true; - Features["atomic-ds-pk-add-16-insts"] = 
true; - Features["atomic-flat-pk-add-16-insts"] = true; - Features["atomic-buffer-global-pk-add-f16-insts"] = true; - Features["atomic-buffer-pk-add-bf16-inst"] = true; - Features["atomic-global-pk-add-bf16-inst"] = true; - Features["16-bit-insts"] = true; - Features["dpp"] = true; - Features["gfx8-insts"] = true; - Features["gfx9-insts"] = true; - Features["gfx10-insts"] = true; - Features["gfx10-3-insts"] = true; - Features["gfx11-insts"] = true; - Features["gfx12-insts"] = true; - Features["atomic-fadd-rtn-insts"] = true; - Features["image-insts"] = true; - Features["fp8-conversion-insts"] = true; - Features["atomic-fmin-fmax-global-f32"] = true; - break; - case GK_GFX1153: - case GK_GFX1152: - case GK_GFX1151: - case GK_GFX1150: - case GK_GFX1103: - case GK_GFX1102: - case GK_GFX1101: - case GK_GFX1100: - case GK_GFX11_GENERIC: - Features["ci-insts"] = true; - Features["dot5-insts"] = true; - Features["dot7-insts"] = true; - Features["dot8-insts"] = true; - Features["dot9-insts"] = true; - Features["dot10-insts"] = true; - Features["dot12-insts"] = true; - Features["dl-insts"] = true; - Features["16-bit-insts"] = true; - Features["dpp"] = true; - Features["gfx8-insts"] = true; - Features["gfx9-insts"] = true; - Features["gfx10-insts"] = true; - Features["gfx10-3-insts"] = true; - Features["gfx11-insts"] = true; - Features["atomic-fadd-rtn-insts"] = true; - Features["image-insts"] = true; - Features["gws"] = true; - Features["atomic-fmin-fmax-global-f32"] = true; - break; - case GK_GFX1036: - case GK_GFX1035: - case GK_GFX1034: - case GK_GFX1033: - case GK_GFX1032: - case GK_GFX1031: - case GK_GFX1030: - case GK_GFX10_3_GENERIC: - Features["ci-insts"] = true; - Features["dot1-insts"] = true; - Features["dot2-insts"] = true; - Features["dot5-insts"] = true; - Features["dot6-insts"] = true; - Features["dot7-insts"] = true; - Features["dot10-insts"] = true; - Features["dl-insts"] = true; - Features["16-bit-insts"] = true; - Features["dpp"] = true; - 
Features["gfx8-insts"] = true; - Features["gfx9-insts"] = true; - Features["gfx10-insts"] = true; - Features["gfx10-3-insts"] = true; - Features["image-insts"] = true; - Features["s-memrealtime"] = true; - Features["s-memtime-inst"] = true; - Features["gws"] = true; - Features["vmem-to-lds-load-insts"] = true; - Features["atomic-fmin-fmax-global-f32"] = true; - Features["atomic-fmin-fmax-global-f64"] = true; - break; - case GK_GFX1012: - case GK_GFX1011: - Features["dot1-insts"] = true; - Features["dot2-insts"] = true; - Features["dot5-insts"] = true; - Features["dot6-insts"] = true; - Features["dot7-insts"] = true; - Features["dot10-insts"] = true; - [[fallthrough]]; - case GK_GFX1013: - case GK_GFX1010: - case GK_GFX10_1_GENERIC: - Features["dl-insts"] = true; - Features["ci-insts"] = true; - Features["16-bit-insts"] = true; - Features["dpp"] = true; - Features["gfx8-insts"] = true; - Features["gfx9-insts"] = true; - Features["gfx10-insts"] = true; - Features["image-insts"] = true; - Features["s-memrealtime"] = true; - Features["s-memtime-inst"] = true; - Features["gws"] = true; - Features["vmem-to-lds-load-insts"] = true; - Features["atomic-fmin-fmax-global-f32"] = true; - Features["atomic-fmin-fmax-global-f64"] = true; - break; - case GK_GFX950: - Features["bitop3-insts"] = true; - Features["fp6bf6-cvt-scale-insts"] = true; - Features["fp4-cvt-scale-insts"] = true; - Features["bf8-cvt-scale-insts"] = true; - Features["fp8-cvt-scale-insts"] = true; - Features["f16bf16-to-fp6bf6-cvt-scale-insts"] = true; - Features["f32-to-f16bf16-cvt-sr-insts"] = true; - Features["prng-inst"] = true; - Features["permlane16-swap"] = true; - Features["permlane32-swap"] = true; - Features["ashr-pk-insts"] = true; - Features["dot12-insts"] = true; - Features["dot13-insts"] = true; - Features["atomic-buffer-pk-add-bf16-inst"] = true; - Features["gfx950-insts"] = true; - [[fallthrough]]; - case GK_GFX942: - Features["fp8-insts"] = true; - Features["fp8-conversion-insts"] = true; - if 
(Kind != GK_GFX950) - Features["xf32-insts"] = true; - [[fallthrough]]; - case GK_GFX9_4_GENERIC: - Features["gfx940-insts"] = true; - Features["atomic-ds-pk-add-16-insts"] = true; - Features["atomic-flat-pk-add-16-insts"] = true; - Features["atomic-global-pk-add-bf16-inst"] = true; - Features["gfx90a-insts"] = true; - Features["atomic-buffer-global-pk-add-f16-insts"] = true; - Features["atomic-fadd-rtn-insts"] = true; - Features["dot3-insts"] = true; - Features["dot4-insts"] = true; - Features["dot5-insts"] = true; - Features["dot6-insts"] = true; - Features["mai-insts"] = true; - Features["dl-insts"] = true; - Features["dot1-insts"] = true; - Features["dot2-insts"] = true; - Features["dot7-insts"] = true; - Features["dot10-insts"] = true; - Features["gfx9-insts"] = true; - Features["gfx8-insts"] = true; - Features["16-bit-insts"] = true; - Features["dpp"] = true; - Features["s-memrealtime"] = true; - Features["ci-insts"] = true; - Features["s-memtime-inst"] = true; - Features["gws"] = true; - Features["vmem-to-lds-load-insts"] = true; - Features["atomic-fmin-fmax-global-f64"] = true; - break; - case GK_GFX90A: - Features["gfx90a-insts"] = true; - Features["atomic-buffer-global-pk-add-f16-insts"] = true; - Features["atomic-fadd-rtn-insts"] = true; - Features["atomic-fmin-fmax-global-f64"] = true; - [[fallthrough]]; - case GK_GFX908: - Features["dot3-insts"] = true; - Features["dot4-insts"] = true; - Features["dot5-insts"] = true; - Features["dot6-insts"] = true; - Features["mai-insts"] = true; - [[fallthrough]]; - case GK_GFX906: - Features["dl-insts"] = true; - Features["dot1-insts"] = true; - Features["dot2-insts"] = true; - Features["dot7-insts"] = true; - Features["dot10-insts"] = true; - [[fallthrough]]; - case GK_GFX90C: - case GK_GFX909: - case GK_GFX904: - case GK_GFX902: - case GK_GFX900: - case GK_GFX9_GENERIC: - Features["gfx9-insts"] = true; - Features["vmem-to-lds-load-insts"] = true; - [[fallthrough]]; - case GK_GFX810: - case GK_GFX805: - case 
GK_GFX803: - case GK_GFX802: - case GK_GFX801: - Features["gfx8-insts"] = true; - Features["16-bit-insts"] = true; - Features["dpp"] = true; - Features["s-memrealtime"] = true; - Features["ci-insts"] = true; - Features["image-insts"] = true; - Features["s-memtime-inst"] = true; - Features["gws"] = true; - break; - case GK_GFX705: - case GK_GFX704: - case GK_GFX703: - case GK_GFX702: - case GK_GFX701: - case GK_GFX700: - Features["ci-insts"] = true; - [[fallthrough]]; - case GK_GFX602: - case GK_GFX601: - case GK_GFX600: - Features["image-insts"] = true; - Features["s-memtime-inst"] = true; - Features["gws"] = true; - Features["atomic-fmin-fmax-global-f32"] = true; - Features["atomic-fmin-fmax-global-f64"] = true; - break; - case GK_NONE: - break; - default: - llvm_unreachable("Unhandled GPU!"); - } + StringMap<bool> DefaultFeatures; + fillAMDGCNFeatureMap(GPU, T, DefaultFeatures); + return insertWaveSizeFeature(GPU, T, DefaultFeatures, Features); } else { if (GPU.empty()) GPU = "r600"; @@ -732,70 +783,5 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, llvm_unreachable("Unhandled GPU!"); } } -} - -static bool isWave32Capable(StringRef GPU, const Triple &T) { - bool IsWave32Capable = false; - // XXX - What does the member GPU mean if device name string passed here? 
- if (T.isAMDGCN()) { - switch (parseArchAMDGCN(GPU)) { - case GK_GFX1250: - case GK_GFX1201: - case GK_GFX1200: - case GK_GFX1153: - case GK_GFX1152: - case GK_GFX1151: - case GK_GFX1150: - case GK_GFX1103: - case GK_GFX1102: - case GK_GFX1101: - case GK_GFX1100: - case GK_GFX1036: - case GK_GFX1035: - case GK_GFX1034: - case GK_GFX1033: - case GK_GFX1032: - case GK_GFX1031: - case GK_GFX1030: - case GK_GFX1012: - case GK_GFX1011: - case GK_GFX1013: - case GK_GFX1010: - case GK_GFX12_GENERIC: - case GK_GFX11_GENERIC: - case GK_GFX10_3_GENERIC: - case GK_GFX10_1_GENERIC: - IsWave32Capable = true; - break; - default: - break; - } - } - return IsWave32Capable; -} - -std::pair<FeatureError, StringRef> -AMDGPU::insertWaveSizeFeature(StringRef GPU, const Triple &T, - StringMap<bool> &Features) { - bool IsWave32Capable = isWave32Capable(GPU, T); - const bool IsNullGPU = GPU.empty(); - const bool HaveWave32 = Features.count("wavefrontsize32"); - const bool HaveWave64 = Features.count("wavefrontsize64"); - if (HaveWave32 && HaveWave64) { - return {AMDGPU::INVALID_FEATURE_COMBINATION, - "'wavefrontsize32' and 'wavefrontsize64' are mutually exclusive"}; - } - if (HaveWave32 && !IsNullGPU && !IsWave32Capable) { - return {AMDGPU::UNSUPPORTED_TARGET_FEATURE, "wavefrontsize32"}; - } - // Don't assume any wavesize with an unknown subtarget. - if (!IsNullGPU) { - // Default to wave32 if available, or wave64 if not - if (!HaveWave32 && !HaveWave64) { - StringRef DefaultWaveSizeFeature = - IsWave32Capable ? 
"wavefrontsize32" : "wavefrontsize64"; - Features.insert(std::make_pair(DefaultWaveSizeFeature, true)); - } - } return {NO_ERROR, StringRef()}; } diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 6acb0bc49ecf..ac3626db46ea 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -158,6 +158,8 @@ StringRef Triple::getArchName(ArchType Kind, SubArchType SubArch) { return "dxilv1.7"; case Triple::DXILSubArch_v1_8: return "dxilv1.8"; + case Triple::DXILSubArch_v1_9: + return "dxilv1.9"; default: break; } @@ -329,6 +331,8 @@ StringRef Triple::getOSTypeName(OSType Kind) { case LiteOS: return "liteos"; case XROS: return "xros"; case Vulkan: return "vulkan"; + case CheriotRTOS: + return "cheriotrtos"; } llvm_unreachable("Invalid OSType"); @@ -387,6 +391,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) { case Callable: return "callable"; case Mesh: return "mesh"; case Amplification: return "amplification"; + case RootSignature: + return "rootsignature"; case OpenCL: return "opencl"; case OpenHOS: return "ohos"; @@ -648,6 +654,8 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", "dxilv1.8", Triple::dxil) + // Note: Cases has max limit of 10. 
+ .Case("dxilv1.9", Triple::dxil) .Case("xtensa", Triple::xtensa) .Default(Triple::UnknownArch); @@ -687,49 +695,50 @@ static Triple::VendorType parseVendor(StringRef VendorName) { static Triple::OSType parseOS(StringRef OSName) { return StringSwitch<Triple::OSType>(OSName) - .StartsWith("darwin", Triple::Darwin) - .StartsWith("dragonfly", Triple::DragonFly) - .StartsWith("freebsd", Triple::FreeBSD) - .StartsWith("fuchsia", Triple::Fuchsia) - .StartsWith("ios", Triple::IOS) - .StartsWith("kfreebsd", Triple::KFreeBSD) - .StartsWith("linux", Triple::Linux) - .StartsWith("lv2", Triple::Lv2) - .StartsWith("macos", Triple::MacOSX) - .StartsWith("managarm", Triple::Managarm) - .StartsWith("netbsd", Triple::NetBSD) - .StartsWith("openbsd", Triple::OpenBSD) - .StartsWith("solaris", Triple::Solaris) - .StartsWith("uefi", Triple::UEFI) - .StartsWith("win32", Triple::Win32) - .StartsWith("windows", Triple::Win32) - .StartsWith("zos", Triple::ZOS) - .StartsWith("haiku", Triple::Haiku) - .StartsWith("rtems", Triple::RTEMS) - .StartsWith("aix", Triple::AIX) - .StartsWith("cuda", Triple::CUDA) - .StartsWith("nvcl", Triple::NVCL) - .StartsWith("amdhsa", Triple::AMDHSA) - .StartsWith("ps4", Triple::PS4) - .StartsWith("ps5", Triple::PS5) - .StartsWith("elfiamcu", Triple::ELFIAMCU) - .StartsWith("tvos", Triple::TvOS) - .StartsWith("watchos", Triple::WatchOS) - .StartsWith("bridgeos", Triple::BridgeOS) - .StartsWith("driverkit", Triple::DriverKit) - .StartsWith("xros", Triple::XROS) - .StartsWith("visionos", Triple::XROS) - .StartsWith("mesa3d", Triple::Mesa3D) - .StartsWith("amdpal", Triple::AMDPAL) - .StartsWith("hermit", Triple::HermitCore) - .StartsWith("hurd", Triple::Hurd) - .StartsWith("wasi", Triple::WASI) - .StartsWith("emscripten", Triple::Emscripten) - .StartsWith("shadermodel", Triple::ShaderModel) - .StartsWith("liteos", Triple::LiteOS) - .StartsWith("serenity", Triple::Serenity) - .StartsWith("vulkan", Triple::Vulkan) - .Default(Triple::UnknownOS); + 
.StartsWith("darwin", Triple::Darwin) + .StartsWith("dragonfly", Triple::DragonFly) + .StartsWith("freebsd", Triple::FreeBSD) + .StartsWith("fuchsia", Triple::Fuchsia) + .StartsWith("ios", Triple::IOS) + .StartsWith("kfreebsd", Triple::KFreeBSD) + .StartsWith("linux", Triple::Linux) + .StartsWith("lv2", Triple::Lv2) + .StartsWith("macos", Triple::MacOSX) + .StartsWith("managarm", Triple::Managarm) + .StartsWith("netbsd", Triple::NetBSD) + .StartsWith("openbsd", Triple::OpenBSD) + .StartsWith("solaris", Triple::Solaris) + .StartsWith("uefi", Triple::UEFI) + .StartsWith("win32", Triple::Win32) + .StartsWith("windows", Triple::Win32) + .StartsWith("zos", Triple::ZOS) + .StartsWith("haiku", Triple::Haiku) + .StartsWith("rtems", Triple::RTEMS) + .StartsWith("aix", Triple::AIX) + .StartsWith("cuda", Triple::CUDA) + .StartsWith("nvcl", Triple::NVCL) + .StartsWith("amdhsa", Triple::AMDHSA) + .StartsWith("ps4", Triple::PS4) + .StartsWith("ps5", Triple::PS5) + .StartsWith("elfiamcu", Triple::ELFIAMCU) + .StartsWith("tvos", Triple::TvOS) + .StartsWith("watchos", Triple::WatchOS) + .StartsWith("bridgeos", Triple::BridgeOS) + .StartsWith("driverkit", Triple::DriverKit) + .StartsWith("xros", Triple::XROS) + .StartsWith("visionos", Triple::XROS) + .StartsWith("mesa3d", Triple::Mesa3D) + .StartsWith("amdpal", Triple::AMDPAL) + .StartsWith("hermit", Triple::HermitCore) + .StartsWith("hurd", Triple::Hurd) + .StartsWith("wasi", Triple::WASI) + .StartsWith("emscripten", Triple::Emscripten) + .StartsWith("shadermodel", Triple::ShaderModel) + .StartsWith("liteos", Triple::LiteOS) + .StartsWith("serenity", Triple::Serenity) + .StartsWith("vulkan", Triple::Vulkan) + .StartsWith("cheriotrtos", Triple::CheriotRTOS) + .Default(Triple::UnknownOS); } static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { @@ -780,6 +789,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("callable", Triple::Callable) .StartsWith("mesh", 
Triple::Mesh) .StartsWith("amplification", Triple::Amplification) + .StartsWith("rootsignature", Triple::RootSignature) .StartsWith("opencl", Triple::OpenCL) .StartsWith("ohos", Triple::OpenHOS) .StartsWith("pauthtest", Triple::PAuthTest) @@ -839,6 +849,7 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { .EndsWith("v1.6", Triple::DXILSubArch_v1_6) .EndsWith("v1.7", Triple::DXILSubArch_v1_7) .EndsWith("v1.8", Triple::DXILSubArch_v1_8) + .EndsWith("v1.9", Triple::DXILSubArch_v1_9) .Default(Triple::NoSubArch); StringRef ARMSubArch = ARM::getCanonicalArchName(SubArchName); @@ -1108,7 +1119,7 @@ static StringRef getDXILArchNameFromShaderModel(StringRef ShaderModelStr) { VersionTuple Ver = parseVersionFromName(ShaderModelStr.drop_front(strlen("shadermodel"))); // Default DXIL minor version when Shader Model version is anything other - // than 6.[0...8] or 6.x (which translates to latest current SM version) + // than 6.[0...9] or 6.x (which translates to latest current SM version) const unsigned SMMajor = 6; if (!Ver.empty()) { if (Ver.getMajor() == SMMajor) { @@ -1132,6 +1143,8 @@ static StringRef getDXILArchNameFromShaderModel(StringRef ShaderModelStr) { return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_7); case 8: return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_8); + case 9: + return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_9); default: report_fatal_error("Unsupported Shader Model version", false); } diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index b72096553ad9..edca7c18062a 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -72,7 +72,7 @@ constexpr FeatureBitset FeaturesX86_64_V2 = FeaturesX86_64 | FeatureSAHF | constexpr FeatureBitset FeaturesX86_64_V3 = FeaturesX86_64_V2 | FeatureAVX2 | FeatureBMI | FeatureBMI2 | FeatureF16C | FeatureFMA | FeatureLZCNT | FeatureMOVBE | FeatureXSAVE; -constexpr 
FeatureBitset FeaturesX86_64_V4 = FeaturesX86_64_V3 | FeatureEVEX512 | +constexpr FeatureBitset FeaturesX86_64_V4 = FeaturesX86_64_V3 | FeatureAVX512BW | FeatureAVX512CD | FeatureAVX512DQ | FeatureAVX512VL; @@ -95,9 +95,8 @@ constexpr FeatureBitset FeaturesBroadwell = // Intel Knights Landing and Knights Mill // Knights Landing has feature parity with Broadwell. -constexpr FeatureBitset FeaturesKNL = FeaturesBroadwell | FeatureAES | - FeatureAVX512F | FeatureEVEX512 | - FeatureAVX512CD; +constexpr FeatureBitset FeaturesKNL = + FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureAVX512CD; constexpr FeatureBitset FeaturesKNM = FeaturesKNL | FeatureAVX512VPOPCNTDQ; // Intel Skylake processors. @@ -107,9 +106,9 @@ constexpr FeatureBitset FeaturesSkylakeClient = // SkylakeServer inherits all SkylakeClient features except SGX. // FIXME: That doesn't match gcc. constexpr FeatureBitset FeaturesSkylakeServer = - (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureEVEX512 | - FeatureAVX512CD | FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | - FeatureCLWB | FeaturePKU; + (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureAVX512CD | + FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureCLWB | + FeaturePKU; constexpr FeatureBitset FeaturesCascadeLake = FeaturesSkylakeServer | FeatureAVX512VNNI; constexpr FeatureBitset FeaturesCooperLake = @@ -117,9 +116,9 @@ constexpr FeatureBitset FeaturesCooperLake = // Intel 10nm processors. 
constexpr FeatureBitset FeaturesCannonlake = - FeaturesSkylakeClient | FeatureAVX512F | FeatureEVEX512 | FeatureAVX512CD | - FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | - FeatureAVX512VBMI | FeaturePKU | FeatureSHA; + FeaturesSkylakeClient | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ | + FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI | + FeaturePKU | FeatureSHA; constexpr FeatureBitset FeaturesICLClient = FeaturesCannonlake | FeatureAVX512BITALG | FeatureAVX512VBMI2 | FeatureAVX512VNNI | FeatureAVX512VPOPCNTDQ | FeatureGFNI | FeatureRDPID | @@ -139,7 +138,7 @@ constexpr FeatureBitset FeaturesSapphireRapids = constexpr FeatureBitset FeaturesGraniteRapids = FeaturesSapphireRapids | FeatureAMX_FP16 | FeaturePREFETCHI; constexpr FeatureBitset FeaturesDiamondRapids = - FeaturesGraniteRapids | FeatureAMX_COMPLEX | FeatureAVX10_2_512 | + FeaturesGraniteRapids | FeatureAMX_COMPLEX | FeatureAVX10_2 | FeatureCMPCCXADD | FeatureAVXIFMA | FeatureAVXNECONVERT | FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 | @@ -244,11 +243,10 @@ static constexpr FeatureBitset FeaturesZNVER3 = FeaturesZNVER2 | FeatureINVPCID | FeaturePKU | FeatureVAES | FeatureVPCLMULQDQ; static constexpr FeatureBitset FeaturesZNVER4 = - FeaturesZNVER3 | FeatureAVX512F | FeatureEVEX512 | FeatureAVX512CD | - FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | - FeatureAVX512VBMI | FeatureAVX512VBMI2 | FeatureAVX512VNNI | - FeatureAVX512BITALG | FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 | - FeatureGFNI | FeatureSHSTK; + FeaturesZNVER3 | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ | + FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI | + FeatureAVX512VBMI2 | FeatureAVX512VNNI | FeatureAVX512BITALG | + FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 | FeatureGFNI | FeatureSHSTK; static constexpr FeatureBitset 
FeaturesZNVER5 = FeaturesZNVER4 | FeatureAVXVNNI | FeatureMOVDIRI | FeatureMOVDIR64B | @@ -394,7 +392,7 @@ constexpr ProcInfo Processors[] = { // Clearwaterforest microarchitecture based processors. { {"clearwaterforest"}, CK_Lunarlake, FEATURE_AVX2, FeaturesClearwaterforest, 'p', false }, // Diamond Rapids microarchitecture based processors. - { {"diamondrapids"}, CK_Diamondrapids, FEATURE_AVX10_2_512, FeaturesDiamondRapids, 'z', false }, + { {"diamondrapids"}, CK_Diamondrapids, FEATURE_AVX10_2, FeaturesDiamondRapids, 'z', false }, // Knights Landing processor. { {"knl"}, CK_KNL, FEATURE_AVX512F, FeaturesKNL, 'Z', false }, { {"mic_avx512"}, CK_KNL, FEATURE_AVX512F, FeaturesKNL, 'Z', true }, @@ -616,7 +614,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 = - FeatureAMX_TILE | FeatureAVX10_2_512; + FeatureAMX_TILE | FeatureAVX10_2; constexpr FeatureBitset ImpliedFeaturesAMX_TF32 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; @@ -642,11 +640,9 @@ constexpr FeatureBitset ImpliedFeaturesAVX10_1 = FeatureAVX512VNNI | FeatureAVX512BF16 | FeatureAVX512VPOPCNTDQ | FeatureAVX512VBMI2 | FeatureAVX512BITALG | FeatureAVX512FP16 | FeatureAVX512DQ | FeatureAVX512VL; -constexpr FeatureBitset ImpliedFeaturesAVX10_1_512 = - FeatureAVX10_1 | FeatureEVEX512; constexpr FeatureBitset ImpliedFeaturesAVX10_2 = FeatureAVX10_1; -constexpr FeatureBitset ImpliedFeaturesAVX10_2_512 = - FeatureAVX10_2 | FeatureAVX10_1_512; +constexpr FeatureBitset ImpliedFeaturesAVX10_1_512 = FeatureAVX10_1; +constexpr FeatureBitset ImpliedFeaturesAVX10_2_512 = FeatureAVX10_2; // APX Features constexpr FeatureBitset ImpliedFeaturesEGPR = {}; diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp 
b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 40a7f8043034..40de36d81ddd 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -83,8 +83,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { // == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt)) if (match(V, m_OneUse(m_c_Or( m_Shl(m_Value(ShVal0), m_Value(ShAmt)), - m_LShr(m_Value(ShVal1), - m_Sub(m_SpecificInt(Width), m_Deferred(ShAmt))))))) { + m_LShr(m_Value(ShVal1), m_Sub(m_SpecificInt(Width), + m_Deferred(ShAmt))))))) { return Intrinsic::fshl; } @@ -617,7 +617,7 @@ struct LoadOps { LoadInst *RootInsert = nullptr; bool FoundRoot = false; uint64_t LoadSize = 0; - const APInt *Shift = nullptr; + uint64_t Shift = 0; Type *ZextType; AAMDNodes AATags; }; @@ -627,17 +627,15 @@ struct LoadOps { // (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3) static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, AliasAnalysis &AA) { - const APInt *ShAmt2 = nullptr; + uint64_t ShAmt2; Value *X; Instruction *L1, *L2; // Go to the last node with loads. - if (match(V, m_OneUse(m_c_Or( - m_Value(X), - m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))), - m_APInt(ShAmt2)))))) || - match(V, m_OneUse(m_Or(m_Value(X), - m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))))))) { + if (match(V, + m_OneUse(m_c_Or(m_Value(X), m_OneUse(m_ShlOrSelf( + m_OneUse(m_ZExt(m_Instruction(L2))), + ShAmt2)))))) { if (!foldLoadsRecursive(X, LOps, DL, AA) && LOps.FoundRoot) // Avoid Partial chain merge. 
return false; @@ -646,11 +644,10 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, // Check if the pattern has loads LoadInst *LI1 = LOps.Root; - const APInt *ShAmt1 = LOps.Shift; + uint64_t ShAmt1 = LOps.Shift; if (LOps.FoundRoot == false && - (match(X, m_OneUse(m_ZExt(m_Instruction(L1)))) || - match(X, m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L1)))), - m_APInt(ShAmt1)))))) { + match(X, m_OneUse( + m_ShlOrSelf(m_OneUse(m_ZExt(m_Instruction(L1))), ShAmt1)))) { LI1 = dyn_cast<LoadInst>(L1); } LoadInst *LI2 = dyn_cast<LoadInst>(L2); @@ -726,13 +723,6 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, if (IsBigEndian) std::swap(ShAmt1, ShAmt2); - // Find Shifts values. - uint64_t Shift1 = 0, Shift2 = 0; - if (ShAmt1) - Shift1 = ShAmt1->getZExtValue(); - if (ShAmt2) - Shift2 = ShAmt2->getZExtValue(); - // First load is always LI1. This is where we put the new load. // Use the merged load size available from LI1 for forward loads. if (LOps.FoundRoot) { @@ -747,7 +737,7 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1; uint64_t PrevSize = DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1)); - if ((Shift2 - Shift1) != ShiftDiff || (Offset2 - Offset1) != PrevSize) + if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize) return false; // Update LOps @@ -824,7 +814,7 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL, // Check if shift needed. We need to shift with the amount of load1 // shift if not zero. 
if (LOps.Shift) - NewOp = Builder.CreateShl(NewOp, ConstantInt::get(I.getContext(), *LOps.Shift)); + NewOp = Builder.CreateShl(NewOp, LOps.Shift); I.replaceAllUsesWith(NewOp); return true; @@ -860,11 +850,9 @@ static std::optional<PartStore> matchPartStore(Instruction &I, return std::nullopt; uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits(); - uint64_t ValOffset = 0; + uint64_t ValOffset; Value *Val; - if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val), - m_ConstantInt(ValOffset))), - m_Trunc(m_Value(Val))))) + if (!match(StoredVal, m_Trunc(m_LShrOrSelf(m_Value(Val), ValOffset)))) return std::nullopt; Value *Ptr = Store->getPointerOperand(); diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index b775c4346019..08f03aa45255 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -700,9 +700,6 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false); - assert(Shape.getPromiseAlloca() && - "Coroutine with switch ABI should own Promise alloca"); - DIFile *DFile = DIS->getFile(); unsigned LineNum = DIS->getLine(); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 180ac9c61e7d..02c38d02cff6 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1568,14 +1568,22 @@ private: if (DebugLoc SuspendLoc = S->getDebugLoc()) { std::string LabelName = ("__coro_resume_" + Twine(SuspendIndex)).str(); - DILocation &DILoc = *SuspendLoc; + // Take the "inlined at" location recursively, if present. This is + // mandatory as the DILabel insertion checks that the scopes of label + // and the attached location match. This is not the case when the + // suspend location has been inlined due to pointing to the original + // scope. 
+ DILocation *DILoc = SuspendLoc; + while (DILocation *InlinedAt = DILoc->getInlinedAt()) + DILoc = InlinedAt; + DILabel *ResumeLabel = - DBuilder.createLabel(DIS, LabelName, DILoc.getFile(), + DBuilder.createLabel(DIS, LabelName, DILoc->getFile(), SuspendLoc.getLine(), SuspendLoc.getCol(), /*IsArtificial=*/true, /*CoroSuspendIdx=*/SuspendIndex, /*AlwaysPreserve=*/false); - DBuilder.insertLabel(ResumeLabel, &DILoc, ResumeBB->begin()); + DBuilder.insertLabel(ResumeLabel, DILoc, ResumeBB->begin()); } } diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 7bcb20de46ff..83aa7de5400f 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -40,6 +40,7 @@ #include "llvm/Support/JSON.h" #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -1550,6 +1551,7 @@ void llvm::computeDeadSymbolsWithConstProp( const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols, function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing, bool ImportEnabled) { + llvm::TimeTraceScope timeScope("Drop dead symbols and propagate attributes"); computeDeadSymbolsAndUpdateIndirectCalls(Index, GUIDPreservedSymbols, isPrevailing); if (ImportEnabled) @@ -1664,6 +1666,7 @@ bool llvm::convertToDeclaration(GlobalValue &GV) { void llvm::thinLTOFinalizeInModule(Module &TheModule, const GVSummaryMapTy &DefinedGlobals, bool PropagateAttrs) { + llvm::TimeTraceScope timeScope("ThinLTO finalize in module"); DenseSet<Comdat *> NonPrevailingComdats; auto FinalizeInModule = [&](GlobalValue &GV, bool Propagate = false) { // See if the global summary analysis computed a new resolved linkage. @@ -1791,6 +1794,7 @@ void llvm::thinLTOFinalizeInModule(Module &TheModule, /// Run internalization on \p TheModule based on symmary analysis. 
void llvm::thinLTOInternalizeModule(Module &TheModule, const GVSummaryMapTy &DefinedGlobals) { + llvm::TimeTraceScope timeScope("ThinLTO internalize module"); // Declare a callback for the internalize pass that will ask for every // candidate GlobalValue if it can be internalized or not. auto MustPreserveGV = [&](const GlobalValue &GV) -> bool { @@ -1885,6 +1889,7 @@ Expected<bool> FunctionImporter::importFunctions( // Do the actual import of functions now, one Module at a time for (const auto &ModName : ImportList.getSourceModules()) { + llvm::TimeTraceScope timeScope("Import", ModName); // Get the module for the import Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(ModName); if (!SrcModuleOrErr) @@ -1900,102 +1905,114 @@ Expected<bool> FunctionImporter::importFunctions( // Find the globals to import SetVector<GlobalValue *> GlobalsToImport; - for (Function &F : *SrcModule) { - if (!F.hasName()) - continue; - auto GUID = F.getGUID(); - auto MaybeImportType = ImportList.getImportType(ModName, GUID); - bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; - - LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") - << " importing function" - << (ImportDefinition - ? " definition " - : (MaybeImportType ? " declaration " : " ")) - << GUID << " " << F.getName() << " from " - << SrcModule->getSourceFileName() << "\n"); - if (ImportDefinition) { - if (Error Err = F.materialize()) - return std::move(Err); - // MemProf should match function's definition and summary, - // 'thinlto_src_module' is needed. - if (EnableImportMetadata || EnableMemProfContextDisambiguation) { - // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for - // statistics and debugging. 
- F.setMetadata( - "thinlto_src_module", - MDNode::get(DestModule.getContext(), - {MDString::get(DestModule.getContext(), - SrcModule->getModuleIdentifier())})); - F.setMetadata( - "thinlto_src_file", - MDNode::get(DestModule.getContext(), - {MDString::get(DestModule.getContext(), - SrcModule->getSourceFileName())})); + { + llvm::TimeTraceScope functionsScope("Functions"); + for (Function &F : *SrcModule) { + if (!F.hasName()) + continue; + auto GUID = F.getGUID(); + auto MaybeImportType = ImportList.getImportType(ModName, GUID); + bool ImportDefinition = + MaybeImportType == GlobalValueSummary::Definition; + + LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") + << " importing function" + << (ImportDefinition + ? " definition " + : (MaybeImportType ? " declaration " : " ")) + << GUID << " " << F.getName() << " from " + << SrcModule->getSourceFileName() << "\n"); + if (ImportDefinition) { + if (Error Err = F.materialize()) + return std::move(Err); + // MemProf should match function's definition and summary, + // 'thinlto_src_module' is needed. + if (EnableImportMetadata || EnableMemProfContextDisambiguation) { + // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for + // statistics and debugging. + F.setMetadata( + "thinlto_src_module", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getModuleIdentifier())})); + F.setMetadata( + "thinlto_src_file", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getSourceFileName())})); + } + GlobalsToImport.insert(&F); } - GlobalsToImport.insert(&F); } } - for (GlobalVariable &GV : SrcModule->globals()) { - if (!GV.hasName()) - continue; - auto GUID = GV.getGUID(); - auto MaybeImportType = ImportList.getImportType(ModName, GUID); - bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; - - LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") - << " importing global" - << (ImportDefinition - ? 
" definition " - : (MaybeImportType ? " declaration " : " ")) - << GUID << " " << GV.getName() << " from " - << SrcModule->getSourceFileName() << "\n"); - if (ImportDefinition) { - if (Error Err = GV.materialize()) - return std::move(Err); - ImportedGVCount += GlobalsToImport.insert(&GV); + { + llvm::TimeTraceScope globalsScope("Globals"); + for (GlobalVariable &GV : SrcModule->globals()) { + if (!GV.hasName()) + continue; + auto GUID = GV.getGUID(); + auto MaybeImportType = ImportList.getImportType(ModName, GUID); + bool ImportDefinition = + MaybeImportType == GlobalValueSummary::Definition; + + LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") + << " importing global" + << (ImportDefinition + ? " definition " + : (MaybeImportType ? " declaration " : " ")) + << GUID << " " << GV.getName() << " from " + << SrcModule->getSourceFileName() << "\n"); + if (ImportDefinition) { + if (Error Err = GV.materialize()) + return std::move(Err); + ImportedGVCount += GlobalsToImport.insert(&GV); + } } } - for (GlobalAlias &GA : SrcModule->aliases()) { - if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject())) - continue; - auto GUID = GA.getGUID(); - auto MaybeImportType = ImportList.getImportType(ModName, GUID); - bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; - - LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") - << " importing alias" - << (ImportDefinition - ? " definition " - : (MaybeImportType ? " declaration " : " ")) - << GUID << " " << GA.getName() << " from " - << SrcModule->getSourceFileName() << "\n"); - if (ImportDefinition) { - if (Error Err = GA.materialize()) - return std::move(Err); - // Import alias as a copy of its aliasee. 
- GlobalObject *GO = GA.getAliaseeObject(); - if (Error Err = GO->materialize()) - return std::move(Err); - auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA); - LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " " - << GO->getName() << " from " + { + llvm::TimeTraceScope aliasesScope("Aliases"); + for (GlobalAlias &GA : SrcModule->aliases()) { + if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject())) + continue; + auto GUID = GA.getGUID(); + auto MaybeImportType = ImportList.getImportType(ModName, GUID); + bool ImportDefinition = + MaybeImportType == GlobalValueSummary::Definition; + + LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") + << " importing alias" + << (ImportDefinition + ? " definition " + : (MaybeImportType ? " declaration " : " ")) + << GUID << " " << GA.getName() << " from " << SrcModule->getSourceFileName() << "\n"); - if (EnableImportMetadata || EnableMemProfContextDisambiguation) { - // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for - // statistics and debugging. - Fn->setMetadata( - "thinlto_src_module", - MDNode::get(DestModule.getContext(), - {MDString::get(DestModule.getContext(), - SrcModule->getModuleIdentifier())})); - Fn->setMetadata( - "thinlto_src_file", - MDNode::get(DestModule.getContext(), - {MDString::get(DestModule.getContext(), - SrcModule->getSourceFileName())})); + if (ImportDefinition) { + if (Error Err = GA.materialize()) + return std::move(Err); + // Import alias as a copy of its aliasee. + GlobalObject *GO = GA.getAliaseeObject(); + if (Error Err = GO->materialize()) + return std::move(Err); + auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA); + LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() + << " " << GO->getName() << " from " + << SrcModule->getSourceFileName() << "\n"); + if (EnableImportMetadata || EnableMemProfContextDisambiguation) { + // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for + // statistics and debugging. 
+ Fn->setMetadata( + "thinlto_src_module", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getModuleIdentifier())})); + Fn->setMetadata( + "thinlto_src_file", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getSourceFileName())})); + } + GlobalsToImport.insert(Fn); } - GlobalsToImport.insert(Fn); } } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 9196a0147c43..30459caee160 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -89,6 +89,8 @@ static cl::opt<bool> SpecializeLiteralConstant( "Enable specialization of functions that take a literal constant as an " "argument")); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ) const { unsigned I = 0; @@ -784,9 +786,31 @@ bool FunctionSpecializer::run() { // Update the known call sites to call the clone. 
for (CallBase *Call : S.CallSites) { + Function *Clone = S.Clone; LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call - << " to call " << S.Clone->getName() << "\n"); + << " to call " << Clone->getName() << "\n"); Call->setCalledFunction(S.Clone); + auto &BFI = GetBFI(*Call->getFunction()); + std::optional<uint64_t> Count = + BFI.getBlockProfileCount(Call->getParent()); + if (Count && !ProfcheckDisableMetadataFixes) { + std::optional<llvm::Function::ProfileCount> MaybeCloneCount = + Clone->getEntryCount(); + assert(MaybeCloneCount && "Clone entry count was not set!"); + uint64_t CallCount = *Count + MaybeCloneCount->getCount(); + Clone->setEntryCount(CallCount); + if (std::optional<llvm::Function::ProfileCount> MaybeOriginalCount = + S.F->getEntryCount()) { + uint64_t OriginalCount = MaybeOriginalCount->getCount(); + if (OriginalCount >= CallCount) { + S.F->setEntryCount(OriginalCount - CallCount); + } else { + // This should generally not happen as that would mean there are + // more computed calls to the function than what was recorded. + LLVM_DEBUG(S.F->setEntryCount(0)); + } + } + } } Clones.push_back(S.Clone); @@ -838,14 +862,24 @@ bool FunctionSpecializer::run() { } void FunctionSpecializer::removeDeadFunctions() { - for (Function *F : FullySpecialized) { + for (Function *F : DeadFunctions) { LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function " << F->getName() << "\n"); if (FAM) FAM->clear(*F, F->getName()); + + // Remove all the callsites that were proven unreachable once, and replace + // them with poison. 
+ for (User *U : make_early_inc_range(F->users())) { + assert((isa<CallInst>(U) || isa<InvokeInst>(U)) && + "User of dead function must be call or invoke"); + Instruction *CS = cast<Instruction>(U); + CS->replaceAllUsesWith(PoisonValue::get(CS->getType())); + CS->eraseFromParent(); + } F->eraseFromParent(); } - FullySpecialized.clear(); + DeadFunctions.clear(); } /// Clone the function \p F and remove the ssa_copy intrinsics added by @@ -1033,6 +1067,9 @@ Function *FunctionSpecializer::createSpecialization(Function *F, // clone must. Clone->setLinkage(GlobalValue::InternalLinkage); + if (F->getEntryCount() && !ProfcheckDisableMetadataFixes) + Clone->setEntryCount(0); + // Initialize the lattice state of the arguments of the function clone, // marking the argument on which we specialized the function constant // with the given value. @@ -1206,8 +1243,11 @@ void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin, // If the function has been completely specialized, the original function // is no longer needed. Mark it unreachable. - if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) { + // NOTE: If the address of a function is taken, we cannot treat it as dead + // function. 
+ if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F) && + !F->hasAddressTaken()) { Solver.markFunctionUnreachable(F); - FullySpecialized.insert(F); + DeadFunctions.insert(F); } } diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index d7edd1288309..f88d51f443bc 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2551,7 +2551,8 @@ static bool OptimizeNonTrivialIFuncs( })) continue; - assert(!Callees.empty() && "Expecting successful collection of versions"); + if (Callees.empty()) + continue; LLVM_DEBUG(dbgs() << "Statically resolving calls to function " << Resolver->getName() << "\n"); diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index c57981ae4ca0..fdf0c3ac8007 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -686,9 +686,6 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group, /* Outlined code is optimized code by definition. */ DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized); - // Don't add any new variables to the subprogram. - DB.finalizeSubprogram(OutlinedSP); - // Attach subprogram to the function. F->setSubprogram(OutlinedSP); // We're done with the DIBuilder. 
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 57844a10aa9c..821a9d82ddb0 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -504,10 +504,7 @@ class LowerTypeTestsModule { void importTypeTest(CallInst *CI); void importFunction(Function *F, bool isJumpTableCanonical); - BitSetInfo - buildBitSet(Metadata *TypeId, - const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout); - ByteArrayInfo *createByteArray(BitSetInfo &BSI); + ByteArrayInfo *createByteArray(const BitSetInfo &BSI); void allocateByteArrays(); Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL, Value *BitOffset); @@ -578,9 +575,9 @@ public: /// Build a bit set for TypeId using the object layouts in /// GlobalLayout. -BitSetInfo LowerTypeTestsModule::buildBitSet( - Metadata *TypeId, - const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) { +static BitSetInfo +buildBitSet(Metadata *TypeId, + const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) { BitSetBuilder BSB; // Compute the byte offset of each address associated with this type @@ -615,7 +612,7 @@ static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits, return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0)); } -ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) { +ByteArrayInfo *LowerTypeTestsModule::createByteArray(const BitSetInfo &BSI) { // Create globals to stand in for byte arrays and masks. These never actually // get initialized, we RAUW and erase them later in allocateByteArrays() once // we know the offset and mask to use. 
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index b8c99f1f3389..7f9693169af0 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -3965,6 +3965,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( void ModuleCallsiteContextGraph::updateAllocationCall( CallInfo &Call, AllocationType AllocType) { std::string AllocTypeString = getAllocTypeAttributeString(AllocType); + removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call())); auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), "memprof", AllocTypeString); cast<CallBase>(Call.call())->addFnAttr(A); @@ -5501,6 +5502,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { // clone J-1 (J==0 is the original clone and does not have a VMaps // entry). CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); + removeAnyExistingAmbiguousAttribute(CBClone); CBClone->addFnAttr(A); ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) << ore::NV("AllocationCall", CBClone) << " in clone " diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp index d50de34dfa48..2ecadd529170 100644 --- a/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/llvm/lib/Transforms/IPO/SCCP.cpp @@ -169,6 +169,13 @@ static bool runIPSCCP( for (Function &F : M) { if (F.isDeclaration()) continue; + // Skip the dead functions marked by FunctionSpecializer, avoiding removing + // blocks in dead functions. Set MadeChanges if there is any dead function + // that will be removed later. 
+ if (IsFuncSpecEnabled && Specializer.isDeadFunction(&F)) { + MadeChanges = true; + continue; + } SmallVector<BasicBlock *, 512> BlocksToErase; @@ -326,12 +333,15 @@ static bool runIPSCCP( LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n"); for (User *U : make_early_inc_range(GV->users())) { - // We can remove LoadInst here, because we already replaced its users - // with a constant. + // We can remove LoadInst here. The LoadInsts in dead functions marked by + // FuncSpec are not simplified to constants, thus poison them. assert((isa<StoreInst>(U) || isa<LoadInst>(U)) && "Only Store|Load Instruction can be user of GlobalVariable at " "reaching here."); - cast<Instruction>(U)->eraseFromParent(); + Instruction *I = cast<Instruction>(U); + if (isa<LoadInst>(I)) + I->replaceAllUsesWith(PoisonValue::get(I->getType())); + I->eraseFromParent(); } // Try to create a debug constant expression for the global variable diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 838f97c8f49a..2340fe556538 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -269,6 +269,12 @@ static bool enableUnifiedLTO(Module &M) { } #endif +bool mustEmitToMergedModule(const GlobalValue *GV) { + // The __cfi_check definition is filled in by the CrossDSOCFI pass which + // runs only in the merged module. + return GV->getName() == "__cfi_check"; +} + // If it's possible to split M into regular and thin LTO parts, do so and write // a multi-module bitcode file with the two parts to OS. Otherwise, write only a // regular LTO bitcode file to OS. @@ -350,19 +356,13 @@ void splitAndWriteThinLTOBitcode( }); } - auto MustEmitToMergedModule = [](const GlobalValue *GV) { - // The __cfi_check definition is filled in by the CrossDSOCFI pass which - // runs only in the merged module. 
- return GV->getName() == "__cfi_check"; - }; - ValueToValueMapTy VMap; std::unique_ptr<Module> MergedM( CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool { if (const auto *C = GV->getComdat()) if (MergedMComdats.count(C)) return true; - if (MustEmitToMergedModule(GV)) + if (mustEmitToMergedModule(GV)) return true; if (auto *F = dyn_cast<Function>(GV)) return EligibleVirtualFns.count(F); @@ -380,7 +380,7 @@ void splitAndWriteThinLTOBitcode( cloneUsedGlobalVariables(M, *MergedM, /*CompilerUsed*/ true); for (Function &F : *MergedM) - if (!F.isDeclaration() && !MustEmitToMergedModule(&F)) { + if (!F.isDeclaration() && !mustEmitToMergedModule(&F)) { // Reset the linkage of all functions eligible for virtual constant // propagation. The canonical definitions live in the thin LTO module so // that they can be imported. @@ -406,7 +406,7 @@ void splitAndWriteThinLTOBitcode( if (const auto *C = GV->getComdat()) if (MergedMComdats.count(C)) return false; - if (MustEmitToMergedModule(GV)) + if (mustEmitToMergedModule(GV)) return false; return true; }); @@ -529,11 +529,13 @@ bool enableSplitLTOUnit(Module &M) { return EnableSplitLTOUnit; } -// Returns whether this module needs to be split because it uses type metadata. -bool hasTypeMetadata(Module &M) { +// Returns whether this module needs to be split (if splitting is enabled). +bool requiresSplit(Module &M) { for (auto &GO : M.global_objects()) { if (GO.hasMetadata(LLVMContext::MD_type)) return true; + if (mustEmitToMergedModule(&GO)) + return true; } return false; } @@ -543,9 +545,9 @@ bool writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, Module &M, const ModuleSummaryIndex *Index, const bool ShouldPreserveUseListOrder) { std::unique_ptr<ModuleSummaryIndex> NewIndex = nullptr; - // See if this module has any type metadata. If so, we try to split it + // See if this module needs to be split. If so, we try to split it // or at least promote type ids to enable WPD. 
- if (hasTypeMetadata(M)) { + if (requiresSplit(M)) { if (enableSplitLTOUnit(M)) { splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M, ShouldPreserveUseListOrder); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index aec484f8a18f..bfb25c806e53 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -60,6 +60,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -68,6 +69,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" @@ -82,12 +84,15 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/GlobPattern.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" @@ -95,6 +100,7 @@ #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include <algorithm> +#include <cmath> #include <cstddef> #include <map> #include <set> @@ -167,6 +173,8 @@ static cl::list<std::string> cl::desc("Prevent function(s) from being devirtualized"), cl::Hidden, cl::CommaSeparated); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + /// With Clang, a pure virtual class's 
deleting destructor is emitted as a /// `llvm.trap` intrinsic followed by an unreachable IR instruction. In the /// context of whole program devirtualization, the deleting destructor of a pure @@ -451,21 +459,21 @@ struct VirtualCallSite { void emitRemark(const StringRef OptName, const StringRef TargetName, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { + function_ref<OptimizationRemarkEmitter &(Function &)> OREGetter) { Function *F = CB.getCaller(); DebugLoc DLoc = CB.getDebugLoc(); BasicBlock *Block = CB.getParent(); using namespace ore; - OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block) - << NV("Optimization", OptName) - << ": devirtualized a call to " - << NV("FunctionName", TargetName)); + OREGetter(*F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block) + << NV("Optimization", OptName) + << ": devirtualized a call to " + << NV("FunctionName", TargetName)); } void replaceAndErase( const StringRef OptName, const StringRef TargetName, bool RemarksEnabled, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, + function_ref<OptimizationRemarkEmitter &(Function &)> OREGetter, Value *New) { if (RemarksEnabled) emitRemark(OptName, TargetName, OREGetter); @@ -570,25 +578,24 @@ void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB, struct DevirtModule { Module &M; - function_ref<AAResults &(Function &)> AARGetter; - function_ref<DominatorTree &(Function &)> LookupDomTree; + ModuleAnalysisManager &MAM; + FunctionAnalysisManager &FAM; - ModuleSummaryIndex *ExportSummary; - const ModuleSummaryIndex *ImportSummary; + ModuleSummaryIndex *const ExportSummary; + const ModuleSummaryIndex *const ImportSummary; - IntegerType *Int8Ty; - PointerType *Int8PtrTy; - IntegerType *Int32Ty; - IntegerType *Int64Ty; - IntegerType *IntPtrTy; + IntegerType *const Int8Ty; + PointerType *const Int8PtrTy; + IntegerType *const Int32Ty; + IntegerType *const Int64Ty; + IntegerType *const IntPtrTy; /// Sizeless array type, used 
for imported vtables. This provides a signal /// to analyzers that these imports may alias, as they do for example /// when multiple unique return values occur in the same vtable. - ArrayType *Int8Arr0Ty; - - bool RemarksEnabled; - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; + ArrayType *const Int8Arr0Ty; + const bool RemarksEnabled; + std::function<OptimizationRemarkEmitter &(Function &)> OREGetter; MapVector<VTableSlot, VTableSlotInfo> CallSlots; // Calls that have already been optimized. We may add a call to multiple @@ -611,12 +618,11 @@ struct DevirtModule { std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest; PatternList FunctionsToSkip; - DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, - function_ref<DominatorTree &(Function &)> LookupDomTree, + DevirtModule(Module &M, ModuleAnalysisManager &MAM, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary) - : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree), + : M(M), MAM(MAM), + FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()), ExportSummary(ExportSummary), ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())), Int8PtrTy(PointerType::getUnqual(M.getContext())), @@ -624,7 +630,10 @@ struct DevirtModule { Int64Ty(Type::getInt64Ty(M.getContext())), IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)), Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)), - RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) { + RemarksEnabled(areRemarksEnabled()), + OREGetter([&](Function &F) -> OptimizationRemarkEmitter & { + return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + }) { assert(!(ExportSummary && ImportSummary)); FunctionsToSkip.init(SkipFunctionNames); } @@ -653,7 +662,7 @@ struct DevirtModule { VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res); - void applyICallBranchFunnel(VTableSlotInfo 
&SlotInfo, Constant *JT, + void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Function &JT, bool &IsExported); void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, @@ -738,10 +747,7 @@ struct DevirtModule { // Lower the module using the action and summary passed as command line // arguments. For testing purposes only. - static bool - runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, - function_ref<DominatorTree &(Function &)> LookupDomTree); + static bool runForTesting(Module &M, ModuleAnalysisManager &MAM); }; struct DevirtIndex { @@ -782,25 +788,13 @@ struct DevirtIndex { } // end anonymous namespace PreservedAnalyses WholeProgramDevirtPass::run(Module &M, - ModuleAnalysisManager &AM) { - auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - auto AARGetter = [&](Function &F) -> AAResults & { - return FAM.getResult<AAManager>(F); - }; - auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { - return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); - }; - auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & { - return FAM.getResult<DominatorTreeAnalysis>(F); - }; + ModuleAnalysisManager &MAM) { if (UseCommandLine) { - if (!DevirtModule::runForTesting(M, AARGetter, OREGetter, LookupDomTree)) + if (!DevirtModule::runForTesting(M, MAM)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } - if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary, - ImportSummary) - .run()) + if (!DevirtModule(M, MAM, ExportSummary, ImportSummary).run()) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -832,8 +826,8 @@ typeIDVisibleToRegularObj(StringRef TypeID, // function for the base type and thus only contains a reference to the // type info (_ZTI). To catch this case we query using the type info // symbol corresponding to the TypeID. 
- std::string typeInfo = ("_ZTI" + TypeID).str(); - return IsVisibleToRegularObj(typeInfo); + std::string TypeInfo = ("_ZTI" + TypeID).str(); + return IsVisibleToRegularObj(TypeInfo); } static bool @@ -842,7 +836,7 @@ skipUpdateDueToValidation(GlobalVariable &GV, SmallVector<MDNode *, 2> Types; GV.getMetadata(LLVMContext::MD_type, Types); - for (auto Type : Types) + for (auto *Type : Types) if (auto *TypeID = dyn_cast<MDString>(Type->getOperand(1).get())) return typeIDVisibleToRegularObj(TypeID->getString(), IsVisibleToRegularObj); @@ -881,6 +875,7 @@ void llvm::updateVCallVisibilityInModule( void llvm::updatePublicTypeTestCalls(Module &M, bool WholeProgramVisibilityEnabledInLTO) { + llvm::TimeTraceScope timeScope("Update public type test calls"); Function *PublicTypeTestFunc = Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); if (!PublicTypeTestFunc) @@ -912,9 +907,9 @@ void llvm::getVisibleToRegularObjVtableGUIDs( ModuleSummaryIndex &Index, DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols, function_ref<bool(StringRef)> IsVisibleToRegularObj) { - for (const auto &typeID : Index.typeIdCompatibleVtableMap()) { - if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj)) - for (const TypeIdOffsetVtableInfo &P : typeID.second) + for (const auto &TypeID : Index.typeIdCompatibleVtableMap()) { + if (typeIDVisibleToRegularObj(TypeID.first, IsVisibleToRegularObj)) + for (const TypeIdOffsetVtableInfo &P : TypeID.second) VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID()); } } @@ -957,7 +952,7 @@ void llvm::runWholeProgramDevirtOnIndex( void llvm::updateIndexWPDForExports( ModuleSummaryIndex &Summary, - function_ref<bool(StringRef, ValueInfo)> isExported, + function_ref<bool(StringRef, ValueInfo)> IsExported, std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) { for (auto &T : LocalWPDTargetsMap) { auto &VI = T.first; @@ -965,7 +960,7 @@ void llvm::updateIndexWPDForExports( assert(VI.getSummaryList().size() == 1 && 
"Devirt of local target has more than one copy"); auto &S = VI.getSummaryList()[0]; - if (!isExported(S->modulePath(), VI)) + if (!IsExported(S->modulePath(), VI)) continue; // It's been exported by a cross module import. @@ -995,10 +990,7 @@ static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) { return ErrorSuccess(); } -bool DevirtModule::runForTesting( - Module &M, function_ref<AAResults &(Function &)> AARGetter, - function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, - function_ref<DominatorTree &(Function &)> LookupDomTree) { +bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) { std::unique_ptr<ModuleSummaryIndex> Summary = std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false); @@ -1023,7 +1015,7 @@ bool DevirtModule::runForTesting( } bool Changed = - DevirtModule(M, AARGetter, OREGetter, LookupDomTree, + DevirtModule(M, MAM, ClSummaryAction == PassSummaryAction::Export ? Summary.get() : nullptr, ClSummaryAction == PassSummaryAction::Import ? Summary.get() @@ -1071,7 +1063,7 @@ void DevirtModule::buildTypeIdentifierMap( } for (MDNode *Type : Types) { - auto TypeID = Type->getOperand(1).get(); + auto *TypeID = Type->getOperand(1).get(); uint64_t Offset = cast<ConstantInt>( @@ -1120,7 +1112,7 @@ bool DevirtModule::tryFindVirtualCallTargets( // Save the symbol used in the vtable to use as the devirtualization // target. 
- auto GV = dyn_cast<GlobalValue>(C); + auto *GV = dyn_cast<GlobalValue>(C); assert(GV); TargetsForSlot.push_back({GV, &TM}); } @@ -1284,7 +1276,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Apply(P.second); } -static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) { +static bool addCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) { // We can't add calls if we haven't seen a definition if (Callee.getSummaryList().empty()) return false; @@ -1359,7 +1351,7 @@ bool DevirtModule::trySingleImplDevirt( if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID())) // Any needed promotion of 'TheFn' has already been done during // LTO unit split, so we can ignore return value of AddCalls. - AddCalls(SlotInfo, TheFnVI); + addCalls(SlotInfo, TheFnVI); Res->TheKind = WholeProgramDevirtResolution::SingleImpl; Res->SingleImplName = std::string(TheFn->getName()); @@ -1400,7 +1392,7 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot, DevirtTargets.insert(TheFn); auto &S = TheFn.getSummaryList()[0]; - bool IsExported = AddCalls(SlotInfo, TheFn); + bool IsExported = addCalls(SlotInfo, TheFn); if (IsExported) ExportedGUIDs.insert(TheFn.getGUID()); @@ -1497,13 +1489,19 @@ void DevirtModule::tryICallBranchFunnel( ReturnInst::Create(M.getContext(), nullptr, BB); bool IsExported = false; - applyICallBranchFunnel(SlotInfo, JT, IsExported); + applyICallBranchFunnel(SlotInfo, *JT, IsExported); if (IsExported) Res->TheKind = WholeProgramDevirtResolution::BranchFunnel; + + if (!JT->getEntryCount().has_value()) { + // FIXME: we could pass through thinlto the necessary information. 
+ setExplicitlyUnknownFunctionEntryCount(*JT); + } } void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, - Constant *JT, bool &IsExported) { + Function &JT, bool &IsExported) { + DenseMap<Function *, double> FunctionEntryCounts; auto Apply = [&](CallSiteInfo &CSInfo) { if (CSInfo.isExported()) IsExported = true; @@ -1531,8 +1529,7 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, NumBranchFunnel++; if (RemarksEnabled) - VCallSite.emitRemark("branch-funnel", - JT->stripPointerCasts()->getName(), OREGetter); + VCallSite.emitRemark("branch-funnel", JT.getName(), OREGetter); // Pass the address of the vtable in the nest register, which is r10 on // x86_64. @@ -1548,11 +1545,28 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, llvm::append_range(Args, CB.args()); CallBase *NewCS = nullptr; + if (!JT.isDeclaration() && !ProfcheckDisableMetadataFixes) { + // Accumulate the call frequencies of the original call site, and use + // that as total entry count for the funnel function. 
+ auto &F = *CB.getCaller(); + auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); + auto EC = BFI.getBlockFreq(&F.getEntryBlock()); + auto CC = F.getEntryCount(/*AllowSynthetic=*/true); + double CallCount = 0.0; + if (EC.getFrequency() != 0 && CC && CC->getCount() != 0) { + double CallFreq = + static_cast<double>( + BFI.getBlockFreq(CB.getParent()).getFrequency()) / + EC.getFrequency(); + CallCount = CallFreq * CC->getCount(); + } + FunctionEntryCounts[&JT] += CallCount; + } if (isa<CallInst>(CB)) - NewCS = IRB.CreateCall(NewFT, JT, Args); + NewCS = IRB.CreateCall(NewFT, &JT, Args); else NewCS = - IRB.CreateInvoke(NewFT, JT, cast<InvokeInst>(CB).getNormalDest(), + IRB.CreateInvoke(NewFT, &JT, cast<InvokeInst>(CB).getNormalDest(), cast<InvokeInst>(CB).getUnwindDest(), Args); NewCS->setCallingConv(CB.getCallingConv()); @@ -1586,6 +1600,11 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Apply(SlotInfo.CSInfo); for (auto &P : SlotInfo.ConstCSInfo) Apply(P.second); + for (auto &[F, C] : FunctionEntryCounts) { + assert(!F->getEntryCount(/*AllowSynthetic=*/true) && + "Unexpected entry count for funnel that was freshly synthesized"); + F->setEntryCount(static_cast<uint64_t>(std::round(C))); + } } bool DevirtModule::tryEvaluateFunctionsWithArgs( @@ -1597,7 +1616,7 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs( // TODO: Skip for now if the vtable symbol was an alias to a function, // need to evaluate whether it would be correct to analyze the aliasee // function for this optimization. - auto Fn = dyn_cast<Function>(Target.Fn); + auto *Fn = dyn_cast<Function>(Target.Fn); if (!Fn) return false; @@ -1836,11 +1855,11 @@ bool DevirtModule::tryVirtualConstProp( // TODO: Skip for now if the vtable symbol was an alias to a function, // need to evaluate whether it would be correct to analyze the aliasee // function for this optimization. 
- auto Fn = dyn_cast<Function>(TargetsForSlot[0].Fn); + auto *Fn = dyn_cast<Function>(TargetsForSlot[0].Fn); if (!Fn) return false; // This only works if the function returns an integer. - auto RetType = dyn_cast<IntegerType>(Fn->getReturnType()); + auto *RetType = dyn_cast<IntegerType>(Fn->getReturnType()); if (!RetType) return false; unsigned BitWidth = RetType->getBitWidth(); @@ -1871,12 +1890,12 @@ bool DevirtModule::tryVirtualConstProp( // TODO: Skip for now if the vtable symbol was an alias to a function, // need to evaluate whether it would be correct to analyze the aliasee // function for this optimization. - auto Fn = dyn_cast<Function>(Target.Fn); + auto *Fn = dyn_cast<Function>(Target.Fn); if (!Fn) return false; if (Fn->isDeclaration() || - !computeFunctionBodyMemoryAccess(*Fn, AARGetter(*Fn)) + !computeFunctionBodyMemoryAccess(*Fn, FAM.getResult<AAManager>(*Fn)) .doesNotAccessMemory() || Fn->arg_empty() || !Fn->arg_begin()->use_empty() || Fn->getReturnType() != RetType) @@ -1992,11 +2011,11 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { // Build an anonymous global containing the before bytes, followed by the // original initializer, followed by the after bytes. - auto NewInit = ConstantStruct::getAnon( + auto *NewInit = ConstantStruct::getAnon( {ConstantDataArray::get(M.getContext(), B.Before.Bytes), B.GV->getInitializer(), ConstantDataArray::get(M.getContext(), B.After.Bytes)}); - auto NewGV = + auto *NewGV = new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(), GlobalVariable::PrivateLinkage, NewInit, "", B.GV); NewGV->setSection(B.GV->getSection()); @@ -2009,7 +2028,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { // Build an alias named after the original global, pointing at the second // element (the original initializer). 
- auto Alias = GlobalAlias::create( + auto *Alias = GlobalAlias::create( B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "", ConstantExpr::getInBoundsGetElementPtr( NewInit->getType(), NewGV, @@ -2050,7 +2069,7 @@ void DevirtModule::scanTypeTestUsers( // Search for virtual calls based on %p and add them to DevirtCalls. SmallVector<DevirtCallSite, 1> DevirtCalls; SmallVector<CallInst *, 1> Assumes; - auto &DT = LookupDomTree(*CI->getFunction()); + auto &DT = FAM.getResult<DominatorTreeAnalysis>(*CI->getFunction()); findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT); Metadata *TypeId = @@ -2127,7 +2146,7 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { SmallVector<Instruction *, 1> LoadedPtrs; SmallVector<Instruction *, 1> Preds; bool HasNonCallUses = false; - auto &DT = LookupDomTree(*CI->getFunction()); + auto &DT = FAM.getResult<DominatorTreeAnalysis>(*CI->getFunction()); findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds, HasNonCallUses, CI, DT); @@ -2259,18 +2278,18 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) { // The type of the function is irrelevant, because it's bitcast at calls // anyhow. - Constant *JT = cast<Constant>( + auto *JT = cast<Function>( M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"), Type::getVoidTy(M.getContext())) .getCallee()); bool IsExported = false; - applyICallBranchFunnel(SlotInfo, JT, IsExported); + applyICallBranchFunnel(SlotInfo, *JT, IsExported); assert(!IsExported); } } void DevirtModule::removeRedundantTypeTests() { - auto True = ConstantInt::getTrue(M.getContext()); + auto *True = ConstantInt::getTrue(M.getContext()); for (auto &&U : NumUnsafeUsesForTypeTest) { if (U.second == 0) { U.first->replaceAllUsesWith(True); @@ -2490,18 +2509,17 @@ bool DevirtModule::run() { // Generate remarks for each devirtualized function. 
for (const auto &DT : DevirtTargets) { GlobalValue *GV = DT.second; - auto F = dyn_cast<Function>(GV); + auto *F = dyn_cast<Function>(GV); if (!F) { - auto A = dyn_cast<GlobalAlias>(GV); + auto *A = dyn_cast<GlobalAlias>(GV); assert(A && isa<Function>(A->getAliasee())); F = dyn_cast<Function>(A->getAliasee()); assert(F); } using namespace ore; - OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) - << "devirtualized " - << NV("FunctionName", DT.first)); + OREGetter(*F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) + << "devirtualized " << NV("FunctionName", DT.first)); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index d934638c15e7..f9155cc66031 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -2115,6 +2115,7 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) { } // Find common base and collect RHS GEPs. + bool First = true; while (true) { if (Ptrs.contains(RHS)) { Base.Ptr = RHS; @@ -2123,7 +2124,12 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) { if (auto *GEP = dyn_cast<GEPOperator>(RHS)) { Base.RHSGEPs.push_back(GEP); - Base.RHSNW &= GEP->getNoWrapFlags(); + if (First) { + First = false; + Base.RHSNW = GEP->getNoWrapFlags(); + } else { + Base.RHSNW = Base.RHSNW.intersectForOffsetAdd(GEP->getNoWrapFlags()); + } RHS = GEP->getPointerOperand(); } else { // No common base. @@ -2132,13 +2138,19 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) { } // Collect LHS GEPs. 
+ First = true; while (true) { if (LHS == Base.Ptr) break; auto *GEP = cast<GEPOperator>(LHS); Base.LHSGEPs.push_back(GEP); - Base.LHSNW &= GEP->getNoWrapFlags(); + if (First) { + First = false; + Base.LHSNW = GEP->getNoWrapFlags(); + } else { + Base.LHSNW = Base.LHSNW.intersectForOffsetAdd(GEP->getNoWrapFlags()); + } LHS = GEP->getPointerOperand(); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index a13d3ceb6132..2d7524e8018b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1799,16 +1799,21 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast, // type may provide more information to later folds, and the smaller logic // instruction may be cheaper (particularly in the case of vectors). Value *X; + auto &DL = IC.getDataLayout(); if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) { - if (Constant *TruncC = IC.getLosslessUnsignedTrunc(C, SrcTy)) { + PreservedCastFlags Flags; + if (Constant *TruncC = getLosslessUnsignedTrunc(C, SrcTy, DL, &Flags)) { // LogicOpc (zext X), C --> zext (LogicOpc X, C) Value *NewOp = IC.Builder.CreateBinOp(LogicOpc, X, TruncC); - return new ZExtInst(NewOp, DestTy); + auto *ZExt = new ZExtInst(NewOp, DestTy); + ZExt->setNonNeg(Flags.NNeg); + ZExt->andIRFlags(Cast); + return ZExt; } } if (match(Cast, m_OneUse(m_SExtLike(m_Value(X))))) { - if (Constant *TruncC = IC.getLosslessSignedTrunc(C, SrcTy)) { + if (Constant *TruncC = getLosslessSignedTrunc(C, SrcTy, DL)) { // LogicOpc (sext X), C --> sext (LogicOpc X, C) Value *NewOp = IC.Builder.CreateBinOp(LogicOpc, X, TruncC); return new SExtInst(NewOp, DestTy); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 42b65dde6725..33b66aeaffe6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ 
b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1956,7 +1956,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Constant *C; if (match(I0, m_ZExt(m_Value(X))) && match(I1, m_Constant(C)) && I0->hasOneUse()) { - if (Constant *NarrowC = getLosslessUnsignedTrunc(C, X->getType())) { + if (Constant *NarrowC = getLosslessUnsignedTrunc(C, X->getType(), DL)) { Value *NarrowMaxMin = Builder.CreateBinaryIntrinsic(IID, X, NarrowC); return CastInst::Create(Instruction::ZExt, NarrowMaxMin, II->getType()); } @@ -2006,7 +2006,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Constant *C; if (match(I0, m_SExt(m_Value(X))) && match(I1, m_Constant(C)) && I0->hasOneUse()) { - if (Constant *NarrowC = getLosslessSignedTrunc(C, X->getType())) { + if (Constant *NarrowC = getLosslessSignedTrunc(C, X->getType(), DL)) { Value *NarrowMaxMin = Builder.CreateBinaryIntrinsic(IID, X, NarrowC); return CastInst::Create(Instruction::SExt, NarrowMaxMin, II->getType()); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index fdef49e310f8..ccf918f0b6db 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -11,11 +11,13 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Value.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include <optional> @@ -969,6 +971,25 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { Changed = true; } + const APInt *C1; + Value *V1; + // OP = { lshr, ashr } + // trunc ( OP i8 C1, V1) to i1 -> icmp eq V1, log_2(C1) iff C1 is power of 2 + if (DestWidth 
== 1 && match(Src, m_Shr(m_Power2(C1), m_Value(V1)))) { + Value *Right = ConstantInt::get(V1->getType(), C1->countr_zero()); + Value *Icmp = Builder.CreateICmpEQ(V1, Right); + return replaceInstUsesWith(Trunc, Icmp); + } + + // OP = { lshr, ashr } + // trunc ( OP i8 C1, V1) to i1 -> icmp ult V1, log_2(C1 + 1) iff (C1 + 1) is + // power of 2 + if (DestWidth == 1 && match(Src, m_Shr(m_LowBitMask(C1), m_Value(V1)))) { + Value *Right = ConstantInt::get(V1->getType(), C1->countr_one()); + Value *Icmp = Builder.CreateICmpULT(V1, Right); + return replaceInstUsesWith(Trunc, Icmp); + } + return Changed ? &Trunc : nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 3a8e04303815..99ea04816681 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/ConstantRange.h" @@ -110,75 +111,41 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) { /// If AndCst is non-null, then the loaded value is masked with that constant /// before doing the comparison. This handles cases like "A[i]&4 == 0". 
Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( - LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, - ConstantInt *AndCst) { - if (LI->isVolatile() || LI->getType() != GEP->getResultElementType() || - !GV->getValueType()->isArrayTy() || !GV->isConstant() || + LoadInst *LI, GetElementPtrInst *GEP, CmpInst &ICI, ConstantInt *AndCst) { + auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(GEP)); + if (LI->isVolatile() || !GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) return nullptr; - Type *GEPSrcEltTy = GEP->getSourceElementType(); - if (GEPSrcEltTy->isArrayTy()) - GEPSrcEltTy = GEPSrcEltTy->getArrayElementType(); - if (GV->getValueType()->getArrayElementType() != GEPSrcEltTy) + Type *EltTy = LI->getType(); + TypeSize EltSize = DL.getTypeStoreSize(EltTy); + if (EltSize.isScalable()) return nullptr; - Constant *Init = GV->getInitializer(); - if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) + LinearExpression Expr = decomposeLinearExpression(DL, GEP); + if (!Expr.Index || Expr.BasePtr != GV || Expr.Offset.getBitWidth() > 64) return nullptr; - uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); - // Don't blow up on huge arrays. - if (ArrayElementCount > MaxArraySizeForCombine) - return nullptr; + Constant *Init = GV->getInitializer(); + TypeSize GlobalSize = DL.getTypeAllocSize(Init->getType()); - // There are many forms of this optimization we can handle, for now, just do - // the simple index into a single-dimensional array or elements of equal size. 
- // - // Require: GEP [n x i8] GV, 0, Idx {{, constant indices}} - // Or: GEP i8 GV, Idx + Value *Idx = Expr.Index; + const APInt &Stride = Expr.Scale; + const APInt &ConstOffset = Expr.Offset; - unsigned GEPIdxOp = 1; - if (GEP->getSourceElementType()->isArrayTy()) { - GEPIdxOp = 2; - if (!match(GEP->getOperand(1), m_ZeroInt())) - return nullptr; - } - if (GEP->getNumOperands() < GEPIdxOp + 1 || - isa<Constant>(GEP->getOperand(GEPIdxOp))) + // Allow an additional context offset, but only within the stride. + if (!ConstOffset.ult(Stride)) return nullptr; - // Check that indices after the variable are constants and in-range for the - // type they index. Collect the indices. This is typically for arrays of - // structs. - SmallVector<unsigned, 4> LaterIndices; - - Type *EltTy = Init->getType()->getArrayElementType(); - for (unsigned i = GEPIdxOp + 1, e = GEP->getNumOperands(); i != e; ++i) { - ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (!Idx) - return nullptr; // Variable index. - - uint64_t IdxVal = Idx->getZExtValue(); - if ((unsigned)IdxVal != IdxVal) - return nullptr; // Too large array index. - - if (StructType *STy = dyn_cast<StructType>(EltTy)) - EltTy = STy->getElementType(IdxVal); - else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) { - if (IdxVal >= ATy->getNumElements()) - return nullptr; - EltTy = ATy->getElementType(); - } else { - return nullptr; // Unknown type. - } - - LaterIndices.push_back(IdxVal); - } + // Don't handle overlapping loads for now. + if (!Stride.uge(EltSize.getFixedValue())) + return nullptr; - Value *Idx = GEP->getOperand(GEPIdxOp); - // If the index type is non-canonical, wait for it to be canonicalized. - if (Idx->getType() != DL.getIndexType(GEP->getType())) + // Don't blow up on huge arrays. 
+ uint64_t ArrayElementCount = + divideCeil((GlobalSize.getFixedValue() - ConstOffset.getZExtValue()), + Stride.getZExtValue()); + if (ArrayElementCount > MaxArraySizeForCombine) return nullptr; enum { Overdefined = -3, Undefined = -2 }; @@ -211,18 +178,12 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( // Scan the array and see if one of our patterns matches. Constant *CompareRHS = cast<Constant>(ICI.getOperand(1)); - for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) { - Constant *Elt = Init->getAggregateElement(i); + APInt Offset = ConstOffset; + for (unsigned i = 0, e = ArrayElementCount; i != e; ++i, Offset += Stride) { + Constant *Elt = ConstantFoldLoadFromConst(Init, EltTy, Offset, DL); if (!Elt) return nullptr; - // If this is indexing an array of structures, get the structure element. - if (!LaterIndices.empty()) { - Elt = ConstantFoldExtractValueInstruction(Elt, LaterIndices); - if (!Elt) - return nullptr; - } - // If the element is masked, handle it. if (AndCst) { Elt = ConstantFoldBinaryOpOperands(Instruction::And, Elt, AndCst, DL); @@ -309,19 +270,17 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( // Now that we've scanned the entire array, emit our new comparison(s). We // order the state machines in complexity of the generated code. - // If inbounds keyword is not present, Idx * ElementSize can overflow. - // Let's assume that ElementSize is 2 and the wanted value is at offset 0. + // If inbounds keyword is not present, Idx * Stride can overflow. + // Let's assume that Stride is 2 and the wanted value is at offset 0. // Then, there are two possible values for Idx to match offset 0: // 0x00..00, 0x80..00. // Emitting 'icmp eq Idx, 0' isn't correct in this case because the // comparison is false if Idx was 0x80..00. // We need to erase the highest countTrailingZeros(ElementSize) bits of Idx. 
- unsigned ElementSize = - DL.getTypeAllocSize(Init->getType()->getArrayElementType()); auto MaskIdx = [&](Value *Idx) { - if (!GEP->isInBounds() && llvm::countr_zero(ElementSize) != 0) { + if (!Expr.Flags.isInBounds() && Stride.countr_zero() != 0) { Value *Mask = Constant::getAllOnesValue(Idx->getType()); - Mask = Builder.CreateLShr(Mask, llvm::countr_zero(ElementSize)); + Mask = Builder.CreateLShr(Mask, Stride.countr_zero()); Idx = Builder.CreateAnd(Idx, Mask); } return Idx; @@ -1997,10 +1956,8 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp, if (auto *C2 = dyn_cast<ConstantInt>(Y)) if (auto *LI = dyn_cast<LoadInst>(X)) if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) - if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) - if (Instruction *Res = - foldCmpLoadFromIndexedGlobal(LI, GEP, GV, Cmp, C2)) - return Res; + if (Instruction *Res = foldCmpLoadFromIndexedGlobal(LI, GEP, Cmp, C2)) + return Res; if (!Cmp.isEquality()) return nullptr; @@ -4353,10 +4310,9 @@ Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) { // Try to optimize things like "A[i] > 4" to index computations. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) - if (Instruction *Res = - foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, GV, I)) - return Res; + if (Instruction *Res = + foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, I)) + return Res; break; } @@ -6375,7 +6331,7 @@ Instruction *InstCombinerImpl::foldICmpWithZextOrSext(ICmpInst &ICmp) { // If a lossless truncate is possible... 
Type *SrcTy = CastOp0->getSrcTy(); - Constant *Res = getLosslessTrunc(C, SrcTy, CastOp0->getOpcode()); + Constant *Res = getLosslessInvCast(C, SrcTy, CastOp0->getOpcode(), DL); if (Res) { if (ICmp.isEquality()) return new ICmpInst(ICmp.getPredicate(), X, Res); @@ -8837,10 +8793,9 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { break; case Instruction::Load: if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) - if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) - if (Instruction *Res = foldCmpLoadFromIndexedGlobal( - cast<LoadInst>(LHSI), GEP, GV, I)) - return Res; + if (Instruction *Res = + foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, I)) + return Res; break; case Instruction::FPTrunc: if (Instruction *NV = foldFCmpFpTrunc(I, *LHSI, *RHSC)) @@ -8944,14 +8899,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { } { - Value *CanonLHS = nullptr, *CanonRHS = nullptr; + Value *CanonLHS = nullptr; match(Op0, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonLHS))); - match(Op1, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonRHS))); - // (canonicalize(x) == x) => (x == x) if (CanonLHS == Op1) return new FCmpInst(Pred, Op1, Op1, "", &I); + Value *CanonRHS = nullptr; + match(Op1, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonRHS))); // (x == canonicalize(x)) => (x == x) if (CanonRHS == Op0) return new FCmpInst(Pred, Op0, Op0, "", &I); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 2340028ce93d..7a979c16da50 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -222,23 +222,6 @@ public: bool fmulByZeroIsZero(Value *MulVal, FastMathFlags FMF, const Instruction *CtxI) const; - Constant *getLosslessTrunc(Constant *C, Type *TruncTy, unsigned ExtOp) { - Constant *TruncC = ConstantExpr::getTrunc(C, TruncTy); - Constant *ExtTruncC = - 
ConstantFoldCastOperand(ExtOp, TruncC, C->getType(), DL); - if (ExtTruncC && ExtTruncC == C) - return TruncC; - return nullptr; - } - - Constant *getLosslessUnsignedTrunc(Constant *C, Type *TruncTy) { - return getLosslessTrunc(C, TruncTy, Instruction::ZExt); - } - - Constant *getLosslessSignedTrunc(Constant *C, Type *TruncTy) { - return getLosslessTrunc(C, TruncTy, Instruction::SExt); - } - std::optional<std::pair<Intrinsic::ID, SmallVector<Value *, 3>>> convertOrOfShiftsToFunnelShift(Instruction &Or); @@ -710,7 +693,7 @@ public: bool foldAllocaCmp(AllocaInst *Alloca); Instruction *foldCmpLoadFromIndexedGlobal(LoadInst *LI, GetElementPtrInst *GEP, - GlobalVariable *GV, CmpInst &ICI, + CmpInst &ICI, ConstantInt *AndCst = nullptr); Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI, Constant *RHSC); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d7310b1c741c..a9aacc707cc2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1642,10 +1642,11 @@ static Instruction *narrowUDivURem(BinaryOperator &I, } Constant *C; + auto &DL = IC.getDataLayout(); if (isa<Instruction>(N) && match(N, m_OneUse(m_ZExt(m_Value(X)))) && match(D, m_Constant(C))) { // If the constant is the same in the smaller type, use the narrow version. - Constant *TruncC = IC.getLosslessUnsignedTrunc(C, X->getType()); + Constant *TruncC = getLosslessUnsignedTrunc(C, X->getType(), DL); if (!TruncC) return nullptr; @@ -1656,7 +1657,7 @@ static Instruction *narrowUDivURem(BinaryOperator &I, if (isa<Instruction>(D) && match(D, m_OneUse(m_ZExt(m_Value(X)))) && match(N, m_Constant(C))) { // If the constant is the same in the smaller type, use the narrow version. 
- Constant *TruncC = IC.getLosslessUnsignedTrunc(C, X->getType()); + Constant *TruncC = getLosslessUnsignedTrunc(C, X->getType(), DL); if (!TruncC) return nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 6477141ab095..ed9a0be6981f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -841,7 +841,7 @@ Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) { NumZexts++; } else if (auto *C = dyn_cast<Constant>(V)) { // Make sure that constants can fit in the new type. - Constant *Trunc = getLosslessUnsignedTrunc(C, NarrowType); + Constant *Trunc = getLosslessUnsignedTrunc(C, NarrowType, DL); if (!Trunc) return nullptr; NewIncoming.push_back(Trunc); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index eb4332fbc095..9467463d39c0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1993,6 +1993,63 @@ Value *InstCombinerImpl::foldSelectWithConstOpToBinOp(ICmpInst *Cmp, return BinOp; } +/// Folds: +/// %a_sub = call @llvm.usub.sat(x, IntConst1) +/// %b_sub = call @llvm.usub.sat(y, IntConst2) +/// %or = or %a_sub, %b_sub +/// %cmp = icmp eq %or, 0 +/// %sel = select %cmp, 0, MostSignificantBit +/// into: +/// %a_sub' = usub.sat(x, IntConst1 - MostSignificantBit) +/// %b_sub' = usub.sat(y, IntConst2 - MostSignificantBit) +/// %or = or %a_sub', %b_sub' +/// %and = and %or, MostSignificantBit +/// Likewise, for vector arguments as well. 
+static Instruction *foldICmpUSubSatWithAndForMostSignificantBitCmp( + SelectInst &SI, ICmpInst *ICI, InstCombiner::BuilderTy &Builder) { + if (!SI.hasOneUse() || !ICI->hasOneUse()) + return nullptr; + CmpPredicate Pred; + Value *A, *B; + const APInt *Constant1, *Constant2; + if (!match(SI.getCondition(), + m_ICmp(Pred, + m_OneUse(m_Or(m_OneUse(m_Intrinsic<Intrinsic::usub_sat>( + m_Value(A), m_APInt(Constant1))), + m_OneUse(m_Intrinsic<Intrinsic::usub_sat>( + m_Value(B), m_APInt(Constant2))))), + m_Zero()))) + return nullptr; + + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); + if (!(Pred == ICmpInst::ICMP_EQ && + (match(TrueVal, m_Zero()) && match(FalseVal, m_SignMask()))) || + (Pred == ICmpInst::ICMP_NE && + (match(TrueVal, m_SignMask()) && match(FalseVal, m_Zero())))) + return nullptr; + + auto *Ty = A->getType(); + unsigned BW = Constant1->getBitWidth(); + APInt MostSignificantBit = APInt::getSignMask(BW); + + // Anything over MSB is negative + if (Constant1->isNonNegative() || Constant2->isNonNegative()) + return nullptr; + + APInt AdjAP1 = *Constant1 - MostSignificantBit + 1; + APInt AdjAP2 = *Constant2 - MostSignificantBit + 1; + + auto *Adj1 = ConstantInt::get(Ty, AdjAP1); + auto *Adj2 = ConstantInt::get(Ty, AdjAP2); + + Value *NewA = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, Adj1); + Value *NewB = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, B, Adj2); + Value *Or = Builder.CreateOr(NewA, NewB); + Constant *MSBConst = ConstantInt::get(Ty, MostSignificantBit); + return BinaryOperator::CreateAnd(Or, MSBConst); +} + /// Visit a SelectInst that has an ICmpInst as its first operand. 
Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { @@ -2009,6 +2066,9 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, if (Instruction *NewSel = tryToReuseConstantFromSelectInComparison(SI, *ICI, *this)) return NewSel; + if (Instruction *Folded = + foldICmpUSubSatWithAndForMostSignificantBitCmp(SI, ICI, Builder)) + return Folded; // NOTE: if we wanted to, this is where to detect integer MIN/MAX bool Changed = false; @@ -2315,7 +2375,7 @@ Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) { // If the constant is the same after truncation to the smaller type and // extension to the original type, we can narrow the select. Type *SelType = Sel.getType(); - Constant *TruncC = getLosslessTrunc(C, SmallType, ExtOpcode); + Constant *TruncC = getLosslessInvCast(C, SmallType, ExtOpcode, DL); if (TruncC && ExtInst->hasOneUse()) { Value *TruncCVal = cast<Value>(TruncC); if (ExtInst == Sel.getFalseValue()) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index f17fecd430a6..aa030294ff1e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -795,8 +795,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, I->dropPoisonGeneratingFlags(); return I; } - Known.Zero.lshrInPlace(ShiftAmt); - Known.One.lshrInPlace(ShiftAmt); + Known >>= ShiftAmt; if (ShiftAmt) Known.Zero.setHighBits(ShiftAmt); // high bits known zero. 
} else { @@ -1066,10 +1065,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, } } - Known.Zero = LHSKnown.Zero.shl(ShiftAmt) | - RHSKnown.Zero.lshr(BitWidth - ShiftAmt); - Known.One = LHSKnown.One.shl(ShiftAmt) | - RHSKnown.One.lshr(BitWidth - ShiftAmt); + LHSKnown <<= ShiftAmt; + RHSKnown >>= BitWidth - ShiftAmt; + Known = LHSKnown.unionWith(RHSKnown); KnownBitsComputed = true; break; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 5ee3bb1abe86..c2f045a2ab02 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2027,9 +2027,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, } if (OneUse) { - replaceAllDbgUsesWith(const_cast<PHINode &>(*PN), - const_cast<PHINode &>(*NewPN), - const_cast<PHINode &>(*PN), DT); + replaceAllDbgUsesWith(*PN, *NewPN, *PN, DT); } return replaceInstUsesWith(I, NewPN); } @@ -2570,7 +2568,7 @@ Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) { Constant *WideC; if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC))) return nullptr; - Constant *NarrowC = getLosslessTrunc(WideC, X->getType(), CastOpc); + Constant *NarrowC = getLosslessInvCast(WideC, X->getType(), CastOpc, DL); if (!NarrowC) return nullptr; Y = NarrowC; @@ -2676,6 +2674,62 @@ static Instruction *canonicalizeGEPOfConstGEPI8(GetElementPtrInst &GEP, return nullptr; } +/// Combine constant offsets separated by variable offsets. 
+/// ptradd (ptradd (ptradd p, C1), x), C2 -> ptradd (ptradd p, x), C1+C2 +static Instruction *combineConstantOffsets(GetElementPtrInst &GEP, + InstCombinerImpl &IC) { + if (!GEP.hasAllConstantIndices()) + return nullptr; + + GEPNoWrapFlags NW = GEPNoWrapFlags::all(); + SmallVector<GetElementPtrInst *> Skipped; + auto *InnerGEP = dyn_cast<GetElementPtrInst>(GEP.getPointerOperand()); + while (true) { + if (!InnerGEP) + return nullptr; + + NW = NW.intersectForReassociate(InnerGEP->getNoWrapFlags()); + if (InnerGEP->hasAllConstantIndices()) + break; + + if (!InnerGEP->hasOneUse()) + return nullptr; + + Skipped.push_back(InnerGEP); + InnerGEP = dyn_cast<GetElementPtrInst>(InnerGEP->getPointerOperand()); + } + + // The two constant offset GEPs are directly adjacent: Let normal offset + // merging handle it. + if (Skipped.empty()) + return nullptr; + + // FIXME: This one-use check is not strictly necessary. Consider relaxing it + // if profitable. + if (!InnerGEP->hasOneUse()) + return nullptr; + + // Don't bother with vector splats. 
+ Type *Ty = GEP.getType(); + if (InnerGEP->getType() != Ty) + return nullptr; + + const DataLayout &DL = IC.getDataLayout(); + APInt Offset(DL.getIndexTypeSizeInBits(Ty), 0); + if (!GEP.accumulateConstantOffset(DL, Offset) || + !InnerGEP->accumulateConstantOffset(DL, Offset)) + return nullptr; + + IC.replaceOperand(*Skipped.back(), 0, InnerGEP->getPointerOperand()); + for (GetElementPtrInst *SkippedGEP : Skipped) + SkippedGEP->setNoWrapFlags(NW); + + return IC.replaceInstUsesWith( + GEP, + IC.Builder.CreatePtrAdd(Skipped.front(), IC.Builder.getInt(Offset), "", + NW.intersectForOffsetAdd(GEP.getNoWrapFlags()))); +} + Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, GEPOperator *Src) { // Combine Indices - If the source pointer to this getelementptr instruction @@ -2687,125 +2741,56 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, if (auto *I = canonicalizeGEPOfConstGEPI8(GEP, Src, *this)) return I; - // For constant GEPs, use a more general offset-based folding approach. - Type *PtrTy = Src->getType()->getScalarType(); - if (GEP.hasAllConstantIndices() && - (Src->hasOneUse() || Src->hasAllConstantIndices())) { - // Split Src into a variable part and a constant suffix. - gep_type_iterator GTI = gep_type_begin(*Src); - Type *BaseType = GTI.getIndexedType(); - bool IsFirstType = true; - unsigned NumVarIndices = 0; - for (auto Pair : enumerate(Src->indices())) { - if (!isa<ConstantInt>(Pair.value())) { - BaseType = GTI.getIndexedType(); - IsFirstType = false; - NumVarIndices = Pair.index() + 1; - } - ++GTI; - } - - // Determine the offset for the constant suffix of Src. - APInt Offset(DL.getIndexTypeSizeInBits(PtrTy), 0); - if (NumVarIndices != Src->getNumIndices()) { - // FIXME: getIndexedOffsetInType() does not handled scalable vectors. 
- if (BaseType->isScalableTy()) - return nullptr; - - SmallVector<Value *> ConstantIndices; - if (!IsFirstType) - ConstantIndices.push_back( - Constant::getNullValue(Type::getInt32Ty(GEP.getContext()))); - append_range(ConstantIndices, drop_begin(Src->indices(), NumVarIndices)); - Offset += DL.getIndexedOffsetInType(BaseType, ConstantIndices); - } - - // Add the offset for GEP (which is fully constant). - if (!GEP.accumulateConstantOffset(DL, Offset)) - return nullptr; - - // Convert the total offset back into indices. - SmallVector<APInt> ConstIndices = - DL.getGEPIndicesForOffset(BaseType, Offset); - if (!Offset.isZero() || (!IsFirstType && !ConstIndices[0].isZero())) - return nullptr; - - GEPNoWrapFlags NW = getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP)); - SmallVector<Value *> Indices( - drop_end(Src->indices(), Src->getNumIndices() - NumVarIndices)); - for (const APInt &Idx : drop_begin(ConstIndices, !IsFirstType)) { - Indices.push_back(ConstantInt::get(GEP.getContext(), Idx)); - // Even if the total offset is inbounds, we may end up representing it - // by first performing a larger negative offset, and then a smaller - // positive one. The large negative offset might go out of bounds. Only - // preserve inbounds if all signs are the same. - if (Idx.isNonNegative() != ConstIndices[0].isNonNegative()) - NW = NW.withoutNoUnsignedSignedWrap(); - if (!Idx.isNonNegative()) - NW = NW.withoutNoUnsignedWrap(); - } - - return replaceInstUsesWith( - GEP, Builder.CreateGEP(Src->getSourceElementType(), Src->getOperand(0), - Indices, "", NW)); - } + if (auto *I = combineConstantOffsets(GEP, *this)) + return I; if (Src->getResultElementType() != GEP.getSourceElementType()) return nullptr; - SmallVector<Value*, 8> Indices; - // Find out whether the last index in the source GEP is a sequential idx. 
bool EndsWithSequential = false; for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); I != E; ++I) EndsWithSequential = I.isSequential(); + if (!EndsWithSequential) + return nullptr; - // Can we combine the two pointer arithmetics offsets? - if (EndsWithSequential) { - // Replace: gep (gep %P, long B), long A, ... - // With: T = long A+B; gep %P, T, ... - Value *SO1 = Src->getOperand(Src->getNumOperands()-1); - Value *GO1 = GEP.getOperand(1); - - // If they aren't the same type, then the input hasn't been processed - // by the loop above yet (which canonicalizes sequential index types to - // intptr_t). Just avoid transforming this until the input has been - // normalized. - if (SO1->getType() != GO1->getType()) - return nullptr; + // Replace: gep (gep %P, long B), long A, ... + // With: T = long A+B; gep %P, T, ... + Value *SO1 = Src->getOperand(Src->getNumOperands() - 1); + Value *GO1 = GEP.getOperand(1); - Value *Sum = - simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); - // Only do the combine when we are sure the cost after the - // merge is never more than that before the merge. - if (Sum == nullptr) - return nullptr; + // If they aren't the same type, then the input hasn't been processed + // by the loop above yet (which canonicalizes sequential index types to + // intptr_t). Just avoid transforming this until the input has been + // normalized. + if (SO1->getType() != GO1->getType()) + return nullptr; - Indices.append(Src->op_begin()+1, Src->op_end()-1); - Indices.push_back(Sum); - Indices.append(GEP.op_begin()+2, GEP.op_end()); - } else if (isa<Constant>(*GEP.idx_begin()) && - cast<Constant>(*GEP.idx_begin())->isNullValue() && - Src->getNumOperands() != 1) { - // Otherwise we can do the fold if the first index of the GEP is a zero - Indices.append(Src->op_begin()+1, Src->op_end()); - Indices.append(GEP.idx_begin()+1, GEP.idx_end()); - } - - // Don't create GEPs with more than one variable index. 
- unsigned NumVarIndices = - count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); }); - if (NumVarIndices > 1) + Value *Sum = + simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); + // Only do the combine when we are sure the cost after the + // merge is never more than that before the merge. + if (Sum == nullptr) return nullptr; - if (!Indices.empty()) - return replaceInstUsesWith( - GEP, Builder.CreateGEP( - Src->getSourceElementType(), Src->getOperand(0), Indices, "", - getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP)))); + SmallVector<Value *, 8> Indices; + Indices.append(Src->op_begin() + 1, Src->op_end() - 1); + Indices.push_back(Sum); + Indices.append(GEP.op_begin() + 2, GEP.op_end()); - return nullptr; + // Don't create GEPs with more than one non-zero index. + unsigned NumNonZeroIndices = count_if(Indices, [](Value *Idx) { + auto *C = dyn_cast<Constant>(Idx); + return !C || !C->isNullValue(); + }); + if (NumNonZeroIndices > 1) + return nullptr; + + return replaceInstUsesWith( + GEP, Builder.CreateGEP( + Src->getSourceElementType(), Src->getOperand(0), Indices, "", + getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP)))); } Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses, @@ -3238,6 +3223,19 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { drop_end(Indices), "", GEP.getNoWrapFlags())); } + // Strip leading zero indices. + auto *FirstIdx = dyn_cast<Constant>(Indices.front()); + if (FirstIdx && FirstIdx->isNullValue() && + !FirstIdx->getType()->isVectorTy()) { + gep_type_iterator GTI = gep_type_begin(GEP); + ++GTI; + if (!GTI.isStruct()) + return replaceInstUsesWith(GEP, Builder.CreateGEP(GTI.getIndexedType(), + GEP.getPointerOperand(), + drop_begin(Indices), "", + GEP.getNoWrapFlags())); + } + // Scalarize vector operands; prefer splat-of-gep.as canonical form. 
// Note that this looses information about undef lanes; we run it after // demanded bits to partially mitigate that loss. @@ -3264,17 +3262,18 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return replaceInstUsesWith(GEP, Res); } - bool SeenVarIndex = false; + bool SeenNonZeroIndex = false; for (auto [IdxNum, Idx] : enumerate(Indices)) { - if (isa<Constant>(Idx)) + auto *C = dyn_cast<Constant>(Idx); + if (C && C->isNullValue()) continue; - if (!SeenVarIndex) { - SeenVarIndex = true; + if (!SeenNonZeroIndex) { + SeenNonZeroIndex = true; continue; } - // GEP has multiple variable indices: Split it. + // GEP has multiple non-zero indices: Split it. ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum); Value *FrontGEP = Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices, @@ -4961,63 +4960,68 @@ Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) { Value * InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) { // Try to push freeze through instructions that propagate but don't produce - // poison as far as possible. If an operand of freeze follows three - // conditions 1) one-use, 2) does not produce poison, and 3) has all but one - // guaranteed-non-poison operands then push the freeze through to the one - // operand that is not guaranteed non-poison. The actual transform is as - // follows. - // Op1 = ... ; Op1 can be posion - // Op0 = Inst(Op1, NonPoisonOps...) ; Op0 has only one use and only have - // ; single guaranteed-non-poison operands + // poison as far as possible. If an operand of freeze does not produce poison + // then push the freeze through to the operands that are not guaranteed + // non-poison. The actual transform is as follows. + // Op1 = ... ; Op1 can be poison + // Op0 = Inst(Op1, NonPoisonOps...) // ... = Freeze(Op0) // => // Op1 = ... // Op1.fr = Freeze(Op1) // ... = Inst(Op1.fr, NonPoisonOps...) 
- auto *OrigOp = OrigFI.getOperand(0); - auto *OrigOpInst = dyn_cast<Instruction>(OrigOp); - // While we could change the other users of OrigOp to use freeze(OrigOp), that - // potentially reduces their optimization potential, so let's only do this iff - // the OrigOp is only used by the freeze. - if (!OrigOpInst || !OrigOpInst->hasOneUse() || isa<PHINode>(OrigOp)) - return nullptr; + auto CanPushFreeze = [](Value *V) { + if (!isa<Instruction>(V) || isa<PHINode>(V)) + return false; - // We can't push the freeze through an instruction which can itself create - // poison. If the only source of new poison is flags, we can simply - // strip them (since we know the only use is the freeze and nothing can - // benefit from them.) - if (canCreateUndefOrPoison(cast<Operator>(OrigOp), - /*ConsiderFlagsAndMetadata*/ false)) - return nullptr; + // We can't push the freeze through an instruction which can itself create + // poison. If the only source of new poison is flags, we can simply + // strip them (since we know the only use is the freeze and nothing can + // benefit from them.) + return !canCreateUndefOrPoison(cast<Operator>(V), + /*ConsiderFlagsAndMetadata*/ false); + }; - // If operand is guaranteed not to be poison, there is no need to add freeze - // to the operand. So we first find the operand that is not guaranteed to be - // poison. - Value *MaybePoisonOperand = nullptr; - for (Value *V : OrigOpInst->operands()) { - if (isa<MetadataAsValue>(V) || isGuaranteedNotToBeUndefOrPoison(V) || - // Treat identical operands as a single operand. - (MaybePoisonOperand && MaybePoisonOperand == V)) + // Pushing freezes up long instruction chains can be expensive. Instead, + // we directly push the freeze all the way to the leaves. However, we leave + // deduplication of freezes on the same value for freezeOtherUses(). 
+ Use *OrigUse = &OrigFI.getOperandUse(0); + SmallPtrSet<Instruction *, 8> Visited; + SmallVector<Use *, 8> Worklist; + Worklist.push_back(OrigUse); + while (!Worklist.empty()) { + auto *U = Worklist.pop_back_val(); + Value *V = U->get(); + if (!CanPushFreeze(V)) { + // If we can't push through the original instruction, abort the transform. + if (U == OrigUse) + return nullptr; + + auto *UserI = cast<Instruction>(U->getUser()); + Builder.SetInsertPoint(UserI); + Value *Frozen = Builder.CreateFreeze(V, V->getName() + ".fr"); + U->set(Frozen); continue; - if (!MaybePoisonOperand) - MaybePoisonOperand = V; - else - return nullptr; - } + } - OrigOpInst->dropPoisonGeneratingAnnotations(); + auto *I = cast<Instruction>(V); + if (!Visited.insert(I).second) + continue; - // If all operands are guaranteed to be non-poison, we can drop freeze. - if (!MaybePoisonOperand) - return OrigOp; + // reverse() to emit freezes in a more natural order. + for (Use &Op : reverse(I->operands())) { + Value *OpV = Op.get(); + if (isa<MetadataAsValue>(OpV) || isGuaranteedNotToBeUndefOrPoison(OpV)) + continue; + Worklist.push_back(&Op); + } - Builder.SetInsertPoint(OrigOpInst); - Value *FrozenMaybePoisonOperand = Builder.CreateFreeze( - MaybePoisonOperand, MaybePoisonOperand->getName() + ".fr"); + I->dropPoisonGeneratingAnnotations(); + this->Worklist.add(I); + } - OrigOpInst->replaceUsesOfWith(MaybePoisonOperand, FrozenMaybePoisonOperand); - return OrigOp; + return OrigUse->get(); } Instruction *InstCombinerImpl::foldFreezeIntoRecurrence(FreezeInst &FI, diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 50258af5e26c..42c3d4a4f4c4 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1219,7 +1219,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { std::optional<TypeSize> Size = 
AI->getAllocationSize(AI->getDataLayout()); // Check that size is known and can be stored in IntptrTy. - if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size)) + // TODO: Add support for scalable vectors if possible. + if (!Size || Size->isScalable() || + !ConstantInt::isValueValidForType(IntptrTy, *Size)) return; bool DoPoison = (ID == Intrinsic::lifetime_end); diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 66cdbfcf998c..832592e7663b 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -212,6 +212,15 @@ static cl::opt<float> "OR because of the hot percentile cutoff, if " "both are supplied.")); +static cl::opt<bool> ClStaticLinking( + "hwasan-static-linking", + cl::desc("Don't use .note.hwasan.globals section to instrument globals " + "from loadable libraries. " + "Note: in static binaries, the global variables section can be " + "accessed directly via linker-provided " + "__start_hwasan_globals and __stop_hwasan_globals symbols"), + cl::Hidden, cl::init(false)); + STATISTIC(NumTotalFuncs, "Number of total funcs"); STATISTIC(NumInstrumentedFuncs, "Number of instrumented funcs"); STATISTIC(NumNoProfileSummaryFuncs, "Number of funcs without PS"); @@ -335,6 +344,7 @@ private: FunctionAnalysisManager &FAM) const; void initializeModule(); void createHwasanCtorComdat(); + void createHwasanNote(); void initializeCallbacks(Module &M); @@ -533,20 +543,7 @@ void HWAddressSanitizerPass::printPipeline( OS << '>'; } -void HWAddressSanitizer::createHwasanCtorComdat() { - std::tie(HwasanCtorFunction, std::ignore) = - getOrCreateSanitizerCtorAndInitFunctions( - M, kHwasanModuleCtorName, kHwasanInitName, - /*InitArgTypes=*/{}, - /*InitArgs=*/{}, - // This callback is invoked when the functions are created the first - // time. 
Hook them into the global ctors list in that case: - [&](Function *Ctor, FunctionCallee) { - Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName); - Ctor->setComdat(CtorComdat); - appendToGlobalCtors(M, Ctor, 0, Ctor); - }); - +void HWAddressSanitizer::createHwasanNote() { // Create a note that contains pointers to the list of global // descriptors. Adding a note to the output file will cause the linker to // create a PT_NOTE program header pointing to the note that we can use to @@ -630,6 +627,29 @@ void HWAddressSanitizer::createHwasanCtorComdat() { appendToCompilerUsed(M, Dummy); } +void HWAddressSanitizer::createHwasanCtorComdat() { + std::tie(HwasanCtorFunction, std::ignore) = + getOrCreateSanitizerCtorAndInitFunctions( + M, kHwasanModuleCtorName, kHwasanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { + Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName); + Ctor->setComdat(CtorComdat); + appendToGlobalCtors(M, Ctor, 0, Ctor); + }); + + // Do not create .note.hwasan.globals for static binaries, as it is only + // needed for instrumenting globals from dynamic libraries. In static + // binaries, the global variables section can be accessed directly via the + // __start_hwasan_globals and __stop_hwasan_globals symbols inserted by the + // linker. + if (!ClStaticLinking) + createHwasanNote(); +} + /// Module-level initialization. /// /// inserts a call to __hwasan_init to the module's constructor list. 
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index a9a0731f16d9..ecb2f2dbc552 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/ProfileData/DataAccessProf.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/MemProfCommon.h" @@ -75,6 +76,10 @@ static cl::opt<unsigned> MinMatchedColdBytePercent( "memprof-matching-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes matched to hint allocation cold")); +static cl::opt<bool> AnnotateStaticDataSectionPrefix( + "memprof-annotate-static-data-prefix", cl::init(false), cl::Hidden, + cl::desc("If true, annotate the static data section prefix")); + // Matching statistics STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile."); STATISTIC(NumOfMemProfMismatch, @@ -90,6 +95,14 @@ STATISTIC(NumOfMemProfMatchedAllocs, "Number of matched memory profile allocs."); STATISTIC(NumOfMemProfMatchedCallSites, "Number of matched memory profile callsites."); +STATISTIC(NumOfMemProfHotGlobalVars, + "Number of global vars annotated with 'hot' section prefix."); +STATISTIC(NumOfMemProfColdGlobalVars, + "Number of global vars annotated with 'unlikely' section prefix."); +STATISTIC(NumOfMemProfUnknownGlobalVars, + "Number of global vars with unknown hotness (no section prefix)."); +STATISTIC(NumOfMemProfExplicitSectionGlobalVars, + "Number of global vars with user-specified section (not annotated)."); static void addCallsiteMetadata(Instruction &I, ArrayRef<uint64_t> InlinedCallStack, @@ -674,11 +687,12 @@ MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile, } PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { - // Return immediately 
if the module doesn't contain any function. - if (M.empty()) + // Return immediately if the module doesn't contain any function or global + // variables. + if (M.empty() && M.globals().empty()) return PreservedAnalyses::all(); - LLVM_DEBUG(dbgs() << "Read in memory profile:"); + LLVM_DEBUG(dbgs() << "Read in memory profile:\n"); auto &Ctx = M.getContext(); auto ReaderOrErr = IndexedInstrProfReader::create(MemoryProfileFileName, *FS); if (Error E = ReaderOrErr.takeError()) { @@ -703,6 +717,14 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { return PreservedAnalyses::all(); } + const bool Changed = + annotateGlobalVariables(M, MemProfReader->getDataAccessProfileData()); + + // If the module doesn't contain any function, return after we process all + // global variables. + if (M.empty()) + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin()); @@ -752,3 +774,95 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { return PreservedAnalyses::none(); } + +// Returns true iff the global variable has custom section either by +// __attribute__((section("name"))) +// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate) +// or #pragma clang section directives +// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section). 
+static bool hasExplicitSectionName(const GlobalVariable &GVar) { + if (GVar.hasSection()) + return true; + + auto Attrs = GVar.getAttributes(); + if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") || + Attrs.hasAttribute("relro-section") || + Attrs.hasAttribute("rodata-section")) + return true; + return false; +} + +bool MemProfUsePass::annotateGlobalVariables( + Module &M, const memprof::DataAccessProfData *DataAccessProf) { + if (!AnnotateStaticDataSectionPrefix || M.globals().empty()) + return false; + + if (!DataAccessProf) { + M.getContext().diagnose(DiagnosticInfoPGOProfile( + MemoryProfileFileName.data(), + StringRef("Data access profiles not found in memprof. Ignore " + "-memprof-annotate-static-data-prefix."), + DS_Warning)); + return false; + } + + bool Changed = false; + // Iterate all global variables in the module and annotate them based on + // data access profiles. Note it's up to the linker to decide how to map input + // sections to output sections, and one conservative practice is to map + // unlikely-prefixed ones to unlikely output section, and map the rest + // (hot-prefixed or prefix-less) to the canonical output section. + for (GlobalVariable &GVar : M.globals()) { + assert(!GVar.getSectionPrefix().has_value() && + "GVar shouldn't have section prefix yet"); + if (GVar.isDeclarationForLinker()) + continue; + + if (hasExplicitSectionName(GVar)) { + ++NumOfMemProfExplicitSectionGlobalVars; + LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName() + << " has explicit section name. Skip annotating.\n"); + continue; + } + + StringRef Name = GVar.getName(); + // Skip string literals as their mangled names don't stay stable across + // binary releases. + // TODO: Track string content hash in the profiles and compute it inside the + // compiler to categeorize the hotness string literals. 
+ if (Name.starts_with(".str")) { + + LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n"); + continue; + } + + // DataAccessProfRecord's get* methods will canonicalize the name under the + // hood before looking it up, so optimizer doesn't need to do it. + std::optional<DataAccessProfRecord> Record = + DataAccessProf->getProfileRecord(Name); + // Annotate a global variable as hot if it has non-zero sampled count, and + // annotate it as cold if it's seen in the profiled binary + // file but doesn't have any access sample. + // For logging, optimization remark emitter requires a llvm::Function, but + // it's not well defined how to associate a global variable with a function. + // So we just print out the static data section prefix in LLVM_DEBUG. + if (Record && Record->AccessCount > 0) { + ++NumOfMemProfHotGlobalVars; + GVar.setSectionPrefix("hot"); + Changed = true; + LLVM_DEBUG(dbgs() << "Global variable " << Name + << " is annotated as hot\n"); + } else if (DataAccessProf->isKnownColdSymbol(Name)) { + ++NumOfMemProfColdGlobalVars; + GVar.setSectionPrefix("unlikely"); + Changed = true; + LLVM_DEBUG(dbgs() << "Global variable " << Name + << " is annotated as unlikely\n"); + } else { + ++NumOfMemProfUnknownGlobalVars; + LLVM_DEBUG(dbgs() << "Global variable " << Name << " is not annotated\n"); + } + } + + return Changed; +} diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 27292d1a66c3..9899a2aae2b1 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3263,7 +3263,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return true; } - /// Heuristically instrument unknown intrinsics. + /// Returns whether it was able to heuristically instrument unknown + /// intrinsics. 
/// /// The main purpose of this code is to do something reasonable with all /// random intrinsics we might encounter, most importantly - SIMD intrinsics. @@ -3273,7 +3274,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// We special-case intrinsics where this approach fails. See llvm.bswap /// handling as an example of that. - bool handleUnknownIntrinsicUnlogged(IntrinsicInst &I) { + bool maybeHandleUnknownIntrinsicUnlogged(IntrinsicInst &I) { unsigned NumArgOperands = I.arg_size(); if (NumArgOperands == 0) return false; @@ -3300,8 +3301,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return false; } - bool handleUnknownIntrinsic(IntrinsicInst &I) { - if (handleUnknownIntrinsicUnlogged(I)) { + bool maybeHandleUnknownIntrinsic(IntrinsicInst &I) { + if (maybeHandleUnknownIntrinsicUnlogged(I)) { if (ClDumpHeuristicInstructions) dumpInst(I); @@ -3860,7 +3861,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // // Three operands: // <4 x i32> @llvm.x86.avx512.vpdpbusd.128 - // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b) + // (<4 x i32> %s, <16 x i8> %a, <16 x i8> %b) // (this is equivalent to multiply-add on %a and %b, followed by // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. 
add distinction @@ -3902,15 +3903,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getPrimitiveSizeInBits()); if (I.arg_size() == 3) { - assert(ParamType == ReturnType); - assert(ParamType == I.getArgOperand(0)->getType()); + [[maybe_unused]] auto *AccumulatorType = + cast<FixedVectorType>(I.getOperand(0)->getType()); + assert(AccumulatorType == ReturnType); } FixedVectorType *ImplicitReturnType = ReturnType; // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { - ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy( - EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits())); + ImplicitReturnType = cast<FixedVectorType>( + getMMXVectorTy(EltSizeInBits * ReductionFactor, + ParamType->getPrimitiveSizeInBits())); ParamType = cast<FixedVectorType>( getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits())); @@ -3958,7 +3961,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Step 2: instrument horizontal add // We don't need bit-precise horizontalReduce because we only want to check - // if each pair of elements is fully zero. + // if each pair/quad of elements is fully zero. // Cast to <4 x i32>. Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType); @@ -3968,7 +3971,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Constant::getNullValue(Horizontal->getType())), ImplicitReturnType); - // Cast it back to the required fake return type (<1 x i64>). + // Cast it back to the required fake return type (if MMX: <1 x i64>; for + // AVX, it is already correct). 
if (EltSizeInBits) OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I)); @@ -5262,7 +5266,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleShadowOr(I); } - void visitIntrinsicInst(IntrinsicInst &I) { + bool maybeHandleCrossPlatformIntrinsic(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: @@ -5342,6 +5346,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleVectorReduceWithStarterIntrinsic(I); break; + case Intrinsic::scmp: + case Intrinsic::ucmp: { + handleShadowOr(I); + break; + } + + case Intrinsic::fshl: + case Intrinsic::fshr: + handleFunnelShift(I); + break; + + case Intrinsic::is_constant: + // The result of llvm.is.constant() is always defined. + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + break; + + default: + return false; + } + + return true; + } + + bool maybeHandleX86SIMDIntrinsic(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { case Intrinsic::x86_sse_stmxcsr: handleStmxcsr(I); break; @@ -5392,6 +5422,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } + // Convert Packed Single Precision Floating-Point Values + // to Packed Signed Doubleword Integer Values + // + // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512 + // (<16 x float>, <16 x i32>, i16, i32) + case Intrinsic::x86_avx512_mask_cvtps2dq_512: + handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false); + break; + // Convert Packed Double Precision Floating-Point Values // to Packed Single Precision Floating-Point Values case Intrinsic::x86_sse2_cvtpd2ps: @@ -5492,23 +5531,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_mmx_psrli_q: case Intrinsic::x86_mmx_psrai_w: case Intrinsic::x86_mmx_psrai_d: - case Intrinsic::aarch64_neon_rshrn: - case Intrinsic::aarch64_neon_sqrshl: - case Intrinsic::aarch64_neon_sqrshrn: - case 
Intrinsic::aarch64_neon_sqrshrun: - case Intrinsic::aarch64_neon_sqshl: - case Intrinsic::aarch64_neon_sqshlu: - case Intrinsic::aarch64_neon_sqshrn: - case Intrinsic::aarch64_neon_sqshrun: - case Intrinsic::aarch64_neon_srshl: - case Intrinsic::aarch64_neon_sshl: - case Intrinsic::aarch64_neon_uqrshl: - case Intrinsic::aarch64_neon_uqrshrn: - case Intrinsic::aarch64_neon_uqshl: - case Intrinsic::aarch64_neon_uqshrn: - case Intrinsic::aarch64_neon_urshl: - case Intrinsic::aarch64_neon_ushl: - // Not handled here: aarch64_neon_vsli (vector shift left and insert) handleVectorShiftIntrinsic(I, /* Variable */ false); break; case Intrinsic::x86_avx2_psllv_d: @@ -5621,19 +5643,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // // Multiply and Add Packed Signed and Unsigned Bytes // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, <16 x i8>, <16 x i8>) // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <32 x i8>, <32 x i8>) // <16 x i32> @llvm.x86.avx512.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <64 x i8>, <64 x i8>) // // Multiply and Add Unsigned and Signed Bytes With Saturation // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, <16 x i8>, <16 x i8>) // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <32 x i8>, <32 x i8>) // <16 x i32> @llvm.x86.avx512.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <64 x i8>, <64 x i8>) // // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 // (< 4 x i32>, < 4 x i32>, < 4 x i32>) @@ -5652,30 +5674,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // // These intrinsics are auto-upgraded into non-masked forms: // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x 
i32>, <16 x i8>, <16 x i8>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) case Intrinsic::x86_avx512_vpdpbusd_128: case Intrinsic::x86_avx512_vpdpbusd_256: case Intrinsic::x86_avx512_vpdpbusd_512: @@ -5930,7 +5952,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_max_pd_512: { // These AVX512 variants contain the rounding mode as a trailing flag. 
// Earlier variants do not have a trailing flag and are already handled - // by maybeHandleSimpleNomemIntrinsic(I, 0) via handleUnknownIntrinsic. + // by maybeHandleSimpleNomemIntrinsic(I, 0) via + // maybeHandleUnknownIntrinsic. [[maybe_unused]] bool Success = maybeHandleSimpleNomemIntrinsic(I, /*trailingFlags=*/1); assert(Success); @@ -5988,15 +6011,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /*trailingVerbatimArgs=*/1); break; - // Convert Packed Single Precision Floating-Point Values - // to Packed Signed Doubleword Integer Values - // - // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512 - // (<16 x float>, <16 x i32>, i16, i32) - case Intrinsic::x86_avx512_mask_cvtps2dq_512: - handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false); - break; - // AVX512 PMOV: Packed MOV, with truncation // Precisely handled by applying the same intrinsic to the shadow case Intrinsic::x86_avx512_mask_pmov_dw_512: @@ -6074,15 +6088,33 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleAVXGF2P8Affine(I); break; - case Intrinsic::fshl: - case Intrinsic::fshr: - handleFunnelShift(I); - break; + default: + return false; + } - case Intrinsic::is_constant: - // The result of llvm.is.constant() is always defined. 
- setShadow(&I, getCleanShadow(&I)); - setOrigin(&I, getCleanOrigin()); + return true; + } + + bool maybeHandleArmSIMDIntrinsic(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::aarch64_neon_rshrn: + case Intrinsic::aarch64_neon_sqrshl: + case Intrinsic::aarch64_neon_sqrshrn: + case Intrinsic::aarch64_neon_sqrshrun: + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_sqshrn: + case Intrinsic::aarch64_neon_sqshrun: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_uqrshl: + case Intrinsic::aarch64_neon_uqrshrn: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_uqshrn: + case Intrinsic::aarch64_neon_urshl: + case Intrinsic::aarch64_neon_ushl: + // Not handled here: aarch64_neon_vsli (vector shift left and insert) + handleVectorShiftIntrinsic(I, /* Variable */ false); break; // TODO: handling max/min similarly to AND/OR may be more precise @@ -6233,17 +6265,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } - case Intrinsic::scmp: - case Intrinsic::ucmp: { - handleShadowOr(I); - break; - } - default: - if (!handleUnknownIntrinsic(I)) - visitInstruction(I); - break; + return false; } + + return true; + } + + void visitIntrinsicInst(IntrinsicInst &I) { + if (maybeHandleCrossPlatformIntrinsic(I)) + return; + + if (maybeHandleX86SIMDIntrinsic(I)) + return; + + if (maybeHandleArmSIMDIntrinsic(I)) + return; + + if (maybeHandleUnknownIntrinsic(I)) + return; + + visitInstruction(I); } void visitLibAtomicLoad(CallBase &CB) { diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 1ddb8ae9518f..4acc3f2d8469 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -19,9 +19,11 @@ #include "llvm/Analysis/ConstraintSystem.h" #include 
"llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -170,10 +172,12 @@ struct State { DominatorTree &DT; LoopInfo &LI; ScalarEvolution &SE; + TargetLibraryInfo &TLI; SmallVector<FactOrCheck, 64> WorkList; - State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE) - : DT(DT), LI(LI), SE(SE) {} + State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE, + TargetLibraryInfo &TLI) + : DT(DT), LI(LI), SE(SE), TLI(TLI) {} /// Process block \p BB and add known facts to work-list. void addInfoFor(BasicBlock &BB); @@ -1109,10 +1113,54 @@ void State::addInfoForInductions(BasicBlock &BB) { } } +static bool getConstraintFromMemoryAccess(GetElementPtrInst &GEP, + uint64_t AccessSize, + CmpPredicate &Pred, Value *&A, + Value *&B, const DataLayout &DL, + const TargetLibraryInfo &TLI) { + auto Offset = collectOffsets(cast<GEPOperator>(GEP), DL); + if (!Offset.NW.hasNoUnsignedWrap()) + return false; + + if (Offset.VariableOffsets.size() != 1) + return false; + + uint64_t BitWidth = Offset.ConstantOffset.getBitWidth(); + auto &[Index, Scale] = Offset.VariableOffsets.front(); + // Bail out on non-canonical GEPs. + if (Index->getType()->getScalarSizeInBits() != BitWidth) + return false; + + ObjectSizeOpts Opts; + // Workaround for gep inbounds, ptr null, idx. + Opts.NullIsUnknownSize = true; + // Be conservative since we are not clear on whether an out of bounds access + // to the padding is UB or not. 
+ Opts.RoundToAlign = true; + std::optional<TypeSize> Size = + getBaseObjectSize(Offset.BasePtr, DL, &TLI, Opts); + if (!Size || Size->isScalable()) + return false; + + // Index * Scale + ConstOffset + AccessSize <= AllocSize + // With nuw flag, we know that the index addition doesn't have unsigned wrap. + // If (AllocSize - (ConstOffset + AccessSize)) wraps around, there is no valid + // value for Index. + APInt MaxIndex = (APInt(BitWidth, Size->getFixedValue() - AccessSize, + /*isSigned=*/false, /*implicitTrunc=*/true) - + Offset.ConstantOffset) + .udiv(Scale); + Pred = ICmpInst::ICMP_ULE; + A = Index; + B = ConstantInt::get(Index->getType(), MaxIndex); + return true; +} + void State::addInfoFor(BasicBlock &BB) { addInfoForInductions(BB); + auto &DL = BB.getDataLayout(); - // True as long as long as the current instruction is guaranteed to execute. + // True as long as the current instruction is guaranteed to execute. bool GuaranteedToExecute = true; // Queue conditions and assumes. for (Instruction &I : BB) { @@ -1127,6 +1175,38 @@ void State::addInfoFor(BasicBlock &BB) { continue; } + auto AddFactFromMemoryAccess = [&](Value *Ptr, Type *AccessType) { + auto *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (!GEP) + return; + TypeSize AccessSize = DL.getTypeStoreSize(AccessType); + if (!AccessSize.isFixed()) + return; + if (GuaranteedToExecute) { + CmpPredicate Pred; + Value *A, *B; + if (getConstraintFromMemoryAccess(*GEP, AccessSize.getFixedValue(), + Pred, A, B, DL, TLI)) { + // The memory access is guaranteed to execute when BB is entered, + // hence the constraint holds on entry to BB. 
+ WorkList.emplace_back(FactOrCheck::getConditionFact( + DT.getNode(I.getParent()), Pred, A, B)); + } + } else { + WorkList.emplace_back( + FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I)); + } + }; + + if (auto *LI = dyn_cast<LoadInst>(&I)) { + if (!LI->isVolatile()) + AddFactFromMemoryAccess(LI->getPointerOperand(), LI->getAccessType()); + } + if (auto *SI = dyn_cast<StoreInst>(&I)) { + if (!SI->isVolatile()) + AddFactFromMemoryAccess(SI->getPointerOperand(), SI->getAccessType()); + } + auto *II = dyn_cast<IntrinsicInst>(&I); Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic; switch (ID) { @@ -1420,7 +1500,7 @@ static std::optional<bool> checkCondition(CmpInst::Predicate Pred, Value *A, LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n"); auto R = Info.getConstraintForSolving(Pred, A, B); - if (R.empty() || !R.isValid(Info)){ + if (R.empty() || !R.isValid(Info)) { LLVM_DEBUG(dbgs() << " failed to decompose condition\n"); return std::nullopt; } @@ -1785,12 +1865,13 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info, static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE, - OptimizationRemarkEmitter &ORE) { + OptimizationRemarkEmitter &ORE, + TargetLibraryInfo &TLI) { bool Changed = false; DT.updateDFSNumbers(); SmallVector<Value *> FunctionArgs(llvm::make_pointer_range(F.args())); ConstraintInfo Info(F.getDataLayout(), FunctionArgs); - State S(DT, LI, SE); + State S(DT, LI, SE, TLI); std::unique_ptr<Module> ReproducerModule( DumpReproducers ? 
new Module(F.getName(), F.getContext()) : nullptr); @@ -1960,6 +2041,26 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, } continue; } + + auto &DL = F.getDataLayout(); + auto AddFactsAboutIndices = [&](Value *Ptr, Type *AccessType) { + CmpPredicate Pred; + Value *A, *B; + if (getConstraintFromMemoryAccess( + *cast<GetElementPtrInst>(Ptr), + DL.getTypeStoreSize(AccessType).getFixedValue(), Pred, A, B, DL, + TLI)) + AddFact(Pred, A, B); + }; + + if (auto *LI = dyn_cast<LoadInst>(CB.Inst)) { + AddFactsAboutIndices(LI->getPointerOperand(), LI->getAccessType()); + continue; + } + if (auto *SI = dyn_cast<StoreInst>(CB.Inst)) { + AddFactsAboutIndices(SI->getPointerOperand(), SI->getAccessType()); + continue; + } } Value *A = nullptr, *B = nullptr; @@ -2018,7 +2119,8 @@ PreservedAnalyses ConstraintEliminationPass::run(Function &F, auto &LI = AM.getResult<LoopAnalysis>(F); auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); - if (!eliminateConstraints(F, DT, LI, SE, ORE)) + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + if (!eliminateConstraints(F, DT, LI, SE, ORE, TLI)) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 434b55868c99..944b253e0f5e 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -521,7 +521,7 @@ private: Instruction *SIUse = dyn_cast<Instruction>(SI->user_back()); // The use of the select inst should be either a phi or another select. 
- if (!SIUse && !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse))) + if (!SIUse || !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse))) return false; BasicBlock *SIBB = SI->getParent(); @@ -581,15 +581,17 @@ struct AllSwitchPaths { VisitedBlocks VB; // Get paths from the determinator BBs to SwitchPhiDefBB std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB); - if (SwitchPhiDefBB == SwitchBlock) { + getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); + if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) { TPaths = std::move(PathsToPhiDef); return; } + assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty()); + auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); // Find and append paths from SwitchPhiDefBB to SwitchBlock. PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1); + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); if (PathsToSwitchBB.empty()) return; @@ -610,13 +612,16 @@ private: typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, PHINode *Phi, - VisitedBlocks &VB) { + VisitedBlocks &VB, + unsigned PathsLimit) { std::vector<ThreadingPath> Res; auto *PhiBB = Phi->getParent(); VB.insert(PhiBB); VisitedBlocks UniqueBlocks; for (auto *IncomingBB : Phi->blocks()) { + if (Res.size() >= PathsLimit) + break; if (!UniqueBlocks.insert(IncomingBB).second) continue; if (!SwitchOuterLoop->contains(IncomingBB)) @@ -652,8 +657,9 @@ private: // Direct predecessor, just add to the path. 
if (IncomingPhiDefBB == IncomingBB) { - std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + assert(PathsLimit > Res.size()); + std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap( + StateDef, IncomingPhi, VB, PathsLimit - Res.size()); for (ThreadingPath &Path : PredPaths) { Path.push_back(PhiBB); Res.push_back(std::move(Path)); @@ -666,13 +672,17 @@ private: continue; PathsType IntermediatePaths; - IntermediatePaths = - paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1); + assert(PathsLimit > Res.size()); + auto InterPathLimit = PathsLimit - Res.size(); + IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB, + /* PathDepth = */ 1, InterPathLimit); if (IntermediatePaths.empty()) continue; + assert(InterPathLimit >= IntermediatePaths.size()); + auto PredPathLimit = InterPathLimit / IntermediatePaths.size(); std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit); for (const ThreadingPath &Path : PredPaths) { for (const PathType &IPath : IntermediatePaths) { ThreadingPath NewPath(Path); @@ -687,7 +697,7 @@ private: } PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, - unsigned PathDepth) { + unsigned PathDepth, unsigned PathsLimit) { PathsType Res; // Stop exploring paths after visiting MaxPathLength blocks @@ -714,6 +724,8 @@ private: // is used to prevent a duplicate path from being generated SmallPtrSet<BasicBlock *, 4> Successors; for (BasicBlock *Succ : successors(BB)) { + if (Res.size() >= PathsLimit) + break; if (!Successors.insert(Succ).second) continue; @@ -735,14 +747,12 @@ private: // coverage and compile time. 
if (LI->getLoopFor(Succ) != CurrLoop) continue; - - PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1); + assert(PathsLimit > Res.size()); + PathsType SuccPaths = + paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size()); for (PathType &Path : SuccPaths) { Path.push_front(BB); Res.push_back(Path); - if (Res.size() >= MaxNumPaths) { - return Res; - } } } // This block could now be visited again from a different predecessor. Note diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 4baa3b3eb824..26e17cc849bf 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2982,7 +2982,8 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, bool GVNPass::performScalarPRE(Instruction *CurInst) { if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() || isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() || - CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects()) + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || + CurInst->getType()->isTokenLikeTy()) return false; // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index e9bf59c6850a..b60b15b6c3a2 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -35,8 +36,38 @@ static bool tryToImproveAlign( return true; } } - // TODO: Also handle memory intrinsics. - return false; + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + + // TODO: Handle more memory intrinsics. 
+ switch (II->getIntrinsicID()) { + case Intrinsic::masked_load: + case Intrinsic::masked_store: { + int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2; + Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load + ? II->getArgOperand(0) + : II->getArgOperand(1); + Type *Type = II->getIntrinsicID() == Intrinsic::masked_load + ? II->getType() + : II->getArgOperand(0)->getType(); + + Align OldAlign = + cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue(); + Align PrefAlign = DL.getPrefTypeAlign(Type); + Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign); + if (NewAlign <= OldAlign) + return false; + + Value *V = + ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value()); + II->setOperand(AlignOpIdx, V); + return true; + } + default: + return false; + } } bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index c2a737d8f9a4..c7d71eb5633e 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1437,9 +1437,18 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // AvailablePreds vector as we go so that all of the PHI entries for this // predecessor use the same bitcast. Value *&PredV = I->second; - if (PredV->getType() != LoadI->getType()) + if (PredV->getType() != LoadI->getType()) { PredV = CastInst::CreateBitOrPointerCast( PredV, LoadI->getType(), "", P->getTerminator()->getIterator()); + // The new cast is producing the value used to replace the load + // instruction, so uses the load's debug location. If P does not always + // branch to the load BB however then the debug location must be dropped, + // as it is hoisted past a conditional branch. + DebugLoc DL = P->getTerminator()->getNumSuccessors() == 1 + ? 
LoadI->getDebugLoc() + : DebugLoc::getDropped(); + cast<CastInst>(PredV)->setDebugLoc(DL); + } PN->addIncoming(PredV, I->first); } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 03b92d3338a9..0874b29ab7d2 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -39,6 +39,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CmpInstAnalysis.h" +#include "llvm/Analysis/HashRecognize.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryLocation.h" @@ -143,6 +144,14 @@ static cl::opt<bool, true> cl::location(DisableLIRP::Wcslen), cl::init(false), cl::ReallyHidden); +bool DisableLIRP::HashRecognize; +static cl::opt<bool, true> + DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize", + cl::desc("Proceed with loop idiom recognize pass, " + "but do not optimize CRC loops."), + cl::location(DisableLIRP::HashRecognize), + cl::init(false), cl::ReallyHidden); + static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling " @@ -242,6 +251,7 @@ private: const SCEV *BECount); bool avoidLIRForMultiBlockLoop(bool IsMemset = false, bool IsLoopMemset = false); + bool optimizeCRCLoop(const PolynomialInfo &Info); /// @} /// \name Noncountable Loop Idiom Handling @@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). 
OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); + std::optional<PolynomialInfo> HR; + LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, AR.MSSA, DL, ORE); if (!LIR.runOnLoop(&L)) @@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); - if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) + if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || + HasMemcpy || !DisableLIRP::HashRecognize) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() { MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); } + + // Optimize a CRC loop if HashRecognize found one, provided we're not + // optimizing for size. + if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics) + if (auto Res = HashRecognize(*CurLoop, *SE).getResult()) + optimizeCRCLoop(*Res); + return MadeChange; } @@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, return false; } +bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) { + // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using + // carry-less multiplication instructions, which is more efficient than our + // Sarwate table-lookup optimization. Hence, until we're able to emit + // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom, + // disable the optimization for Hexagon. + Module &M = *CurLoop->getHeader()->getModule(); + Triple TT(M.getTargetTriple()); + if (TT.getArch() == Triple::hexagon) + return false; + + // First, create a new GlobalVariable corresponding to the + // Sarwate-lookup-table. 
+ Type *CRCTy = Info.LHS->getType(); + unsigned CRCBW = CRCTy->getIntegerBitWidth(); + std::array<Constant *, 256> CRCConstants; + transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped), + CRCConstants.begin(), + [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); }); + Constant *ConstArray = + ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants); + GlobalVariable *GV = + new GlobalVariable(M, ConstArray->getType(), true, + GlobalValue::PrivateLinkage, ConstArray, ".crctable"); + + PHINode *IV = CurLoop->getCanonicalInductionVariable(); + SmallVector<PHINode *, 2> Cleanup; + + // Next, mark all PHIs for removal except IV. + { + for (PHINode &PN : CurLoop->getHeader()->phis()) { + if (&PN == IV) + continue; + PN.replaceAllUsesWith(PoisonValue::get(PN.getType())); + Cleanup.push_back(&PN); + } + } + + // Next, fix up the trip count. + { + unsigned NewBTC = (Info.TripCount / 8) - 1; + BasicBlock *LoopBlk = CurLoop->getLoopLatch(); + BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator()); + CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk + ? ICmpInst::Predicate::ICMP_NE + : ICmpInst::Predicate::ICMP_EQ; + Instruction *ExitCond = CurLoop->getLatchCmpInst(); + Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC); + IRBuilder<> Builder(ExitCond); + Value *NewExitCond = + Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond"); + ExitCond->replaceAllUsesWith(NewExitCond); + deleteDeadInstruction(ExitCond); + } + + // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all + // uses of ComputedValue. + // + // Little-endian: + // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)] + // Big-Endian: + // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)] + { + auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) { + Type *OpTy = Op->getType(); + unsigned OpBW = OpTy->getIntegerBitWidth(); + return OpBW > 8 + ? 
Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name) + : Op; + }; + auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op, + const Twine &Name) { + Type *OpTy = Op->getType(); + + // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to + // use the CRC's bitwidth as the reference for shifting right. + return LoByte(Builder, + CRCBW > 8 ? Builder.CreateLShr( + Op, ConstantInt::get(OpTy, CRCBW - 8), Name) + : Op, + Name + ".lo.byte"); + }; + + IRBuilder<> Builder(CurLoop->getHeader(), + CurLoop->getHeader()->getFirstNonPHIIt()); + + // Create the CRC PHI, and initialize its incoming value to the initial + // value of CRC. + PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc"); + CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader()); + + // CRC is now an evolving variable, initialized to the PHI. + Value *CRC = CRCPhi; + + // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte + // of LHSAux), if LHSAux is non-nullptr. + Value *Indexer = CRC; + if (Value *Data = Info.LHSAux) { + Type *DataTy = Data->getType(); + + // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we + // shift right by that amount, and take the lo-byte (in the little-endian + // case), or shift left by that amount, and take the hi-idx (in the + // big-endian case). + Value *IVBits = Builder.CreateZExtOrTrunc( + Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer"); + Value *DataIndexer = + Info.ByteOrderSwapped + ? Builder.CreateShl(Data, IVBits, "data.indexer") + : Builder.CreateLShr(Data, IVBits, "data.indexer"); + Indexer = Builder.CreateXor( + DataIndexer, + Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"), + "crc.data.indexer"); + } + + Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi") + : LoByte(Builder, Indexer, "indexer.lo"); + + // Always index into a GEP using the index type. 
+ Indexer = Builder.CreateZExt( + Indexer, SE->getDataLayout().getIndexType(GV->getType()), + "indexer.ext"); + + // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC]. + Value *CRCTableGEP = + Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd"); + Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld"); + + // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of + // CRC-8. + Value *CRCNext = CRCTableLd; + if (CRCBW > 8) { + Value *CRCShift = Info.ByteOrderSwapped + ? Builder.CreateShl(CRC, 8, "crc.be.shift") + : Builder.CreateLShr(CRC, 8, "crc.le.shift"); + CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next"); + } + + // Connect the back-edge for the loop, and RAUW the ComputedValue. + CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch()); + Info.ComputedValue->replaceUsesOutsideBlock(CRCNext, + CurLoop->getLoopLatch()); + } + + // Cleanup. + { + for (PHINode *PN : Cleanup) + RecursivelyDeleteDeadPHINode(PN); + SE->forgetLoop(CurLoop); + } + return true; +} + bool LoopIdiomRecognize::runOnNoncountableLoop() { LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << CurLoop->getHeader()->getParent()->getName() diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index f7d2258e1c28..2bda9d83236e 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -220,6 +220,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze; UP.SCEVExpansionBudget = SCEVCheapExpansionBudget; UP.RuntimeUnrollMultiExit = false; + UP.AddAdditionalAccumulators = false; // Override with any target specific settings TTI.getUnrollingPreferences(L, SE, UP, &ORE); @@ -1354,6 +1355,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, ULO.Heart = getLoopConvergenceHeart(L); ULO.SCEVExpansionBudget = 
UP.SCEVExpansionBudget; ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit; + ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators; LoopUnrollResult UnrollResult = UnrollLoop( L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); if (UnrollResult == LoopUnrollResult::Unmodified) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 8b9d06d7e443..8a5569743ab4 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -247,8 +247,8 @@ private: /// index I' according to UserChain produced by function "find". /// /// The building conceptually takes two steps: - /// 1) iteratively distribute s/zext towards the leaves of the expression tree - /// that computes I + /// 1) iteratively distribute sext/zext/trunc towards the leaves of the + /// expression tree that computes I /// 2) reassociate the expression tree to the form I' + C. /// /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute @@ -260,29 +260,30 @@ private: Value *rebuildWithoutConstOffset(); /// After the first step of rebuilding the GEP index without the constant - /// offset, distribute s/zext to the operands of all operators in UserChain. - /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => + /// offset, distribute sext/zext/trunc to the operands of all operators in + /// UserChain. e.g., zext(sext(a + (b + 5)) (assuming no overflow) => /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). /// /// The function also updates UserChain to point to new subexpressions after - /// distributing s/zext. e.g., the old UserChain of the above example is - /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)), + /// distributing sext/zext/trunc. e.g., the old UserChain of the above example + /// is + /// 5 -> b + 5 -> a + (b + 5) -> sext(...) 
-> zext(sext(...)), /// and the new UserChain is - /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) + /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> + /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) /// /// \p ChainIndex The index to UserChain. ChainIndex is initially /// UserChain.size() - 1, and is decremented during /// the recursion. - Value *distributeExtsAndCloneChain(unsigned ChainIndex); + Value *distributeCastsAndCloneChain(unsigned ChainIndex); /// Reassociates the GEP index to the form I' + C and returns I'. Value *removeConstOffset(unsigned ChainIndex); - /// A helper function to apply ExtInsts, a list of s/zext, to value V. - /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function + /// A helper function to apply CastInsts, a list of sext/zext/trunc, to value + /// V. e.g., if CastInsts = [sext i32 to i64, zext i16 to i32], this function /// returns "sext i32 (zext i16 V to i32) to i64". - Value *applyExts(Value *V); + Value *applyCasts(Value *V); /// A helper function that returns whether we can trace into the operands /// of binary operator BO for a constant offset. @@ -307,8 +308,8 @@ private: SmallVector<User *, 8> UserChain; /// A data structure used in rebuildWithoutConstOffset. Contains all - /// sext/zext instructions along UserChain. - SmallVector<CastInst *, 16> ExtInsts; + /// sext/zext/trunc instructions along UserChain. + SmallVector<CastInst *, 16> CastInsts; /// Insertion position of cloned instructions. BasicBlock::iterator IP; @@ -491,7 +492,7 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, } Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); - // Do not trace into "or" unless it is equivalent to "add". + // Do not trace into "or" unless it is equivalent to "add nuw nsw". // This is the case if the or's disjoint flag is set. 
if (BO->getOpcode() == Instruction::Or && !cast<PossiblyDisjointInst>(BO)->isDisjoint()) @@ -503,8 +504,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub) return false; - // In addition, tracing into BO requires that its surrounding s/zext (if - // any) is distributable to both operands. + // In addition, tracing into BO requires that its surrounding sext/zext/trunc + // (if any) is distributable to both operands. // // Suppose BO = A op B. // SignExtended | ZeroExtended | Distributable? @@ -628,11 +629,11 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, return ConstantOffset; } -Value *ConstantOffsetExtractor::applyExts(Value *V) { +Value *ConstantOffsetExtractor::applyCasts(Value *V) { Value *Current = V; - // ExtInsts is built in the use-def order. Therefore, we apply them to V + // CastInsts is built in the use-def order. Therefore, we apply them to V // in the reversed order. - for (CastInst *I : llvm::reverse(ExtInsts)) { + for (CastInst *I : llvm::reverse(CastInsts)) { if (Constant *C = dyn_cast<Constant>(Current)) { // Try to constant fold the cast. Current = ConstantFoldCastOperand(I->getOpcode(), C, I->getType(), DL); @@ -640,24 +641,24 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { continue; } - Instruction *Ext = I->clone(); - Ext->setOperand(0, Current); + Instruction *Cast = I->clone(); + Cast->setOperand(0, Current); // In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so // we assume that it is ok to redistribute trunc over add/sub/or. But for // example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc // nuw (add A, B))). To make such redistributions legal we drop all the // poison generating flags from cloned trunc instructions here. 
- if (isa<TruncInst>(Ext)) - Ext->dropPoisonGeneratingFlags(); - Ext->insertBefore(*IP->getParent(), IP); - Current = Ext; + if (isa<TruncInst>(Cast)) + Cast->dropPoisonGeneratingFlags(); + Cast->insertBefore(*IP->getParent(), IP); + Current = Cast; } return Current; } Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { - distributeExtsAndCloneChain(UserChain.size() - 1); - // Remove all nullptrs (used to be s/zext) from UserChain. + distributeCastsAndCloneChain(UserChain.size() - 1); + // Remove all nullptrs (used to be sext/zext/trunc) from UserChain. unsigned NewSize = 0; for (User *I : UserChain) { if (I != nullptr) { @@ -670,29 +671,29 @@ Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { } Value * -ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { +ConstantOffsetExtractor::distributeCastsAndCloneChain(unsigned ChainIndex) { User *U = UserChain[ChainIndex]; if (ChainIndex == 0) { assert(isa<ConstantInt>(U)); - // If U is a ConstantInt, applyExts will return a ConstantInt as well. - return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U)); + // If U is a ConstantInt, applyCasts will return a ConstantInt as well. + return UserChain[ChainIndex] = cast<ConstantInt>(applyCasts(U)); } if (CastInst *Cast = dyn_cast<CastInst>(U)) { assert( (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) && "Only following instructions can be traced: sext, zext & trunc"); - ExtInsts.push_back(Cast); + CastInsts.push_back(Cast); UserChain[ChainIndex] = nullptr; - return distributeExtsAndCloneChain(ChainIndex - 1); + return distributeCastsAndCloneChain(ChainIndex - 1); } // Function find only trace into BinaryOperator and CastInst. BinaryOperator *BO = cast<BinaryOperator>(U); // OpNo = which operand of BO is UserChain[ChainIndex - 1] unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 
0 : 1); - Value *TheOther = applyExts(BO->getOperand(1 - OpNo)); - Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1); + Value *TheOther = applyCasts(BO->getOperand(1 - OpNo)); + Value *NextInChain = distributeCastsAndCloneChain(ChainIndex - 1); BinaryOperator *NewBO = nullptr; if (OpNo == 0) { @@ -713,7 +714,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]); assert((BO->use_empty() || BO->hasOneUse()) && - "distributeExtsAndCloneChain clones each BinaryOperator in " + "distributeCastsAndCloneChain clones each BinaryOperator in " "UserChain, so no one should be used more than " "once"); @@ -847,7 +848,8 @@ static bool allowsPreservingNUW(const User *U) { // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)" if (const TruncInst *TI = dyn_cast<TruncInst>(U)) return TI->hasNoUnsignedWrap(); - return isa<CastInst>(U) || isa<ConstantInt>(U); + assert((isa<CastInst>(U) || isa<ConstantInt>(U)) && "Unexpected User."); + return true; } Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 9b40fc03da6b..e4ba70d1bce1 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier( static cl::opt<int> UnswitchSiblingsToplevelDiv( "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden, cl::desc("Toplevel siblings divisor for cost multiplier.")); +static cl::opt<int> UnswitchParentBlocksDiv( + "unswitch-parent-blocks-div", cl::init(8), cl::Hidden, + cl::desc("Outer loop size divisor for cost multiplier.")); static cl::opt<int> UnswitchNumInitialUnscaledCandidates( "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden, cl::desc("Number of unswitch candidates that are ignored 
when calculating " @@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, } /// Cost multiplier is a way to limit potentially exponential behavior -/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch -/// candidates available. Also accounting for the number of "sibling" loops with -/// the idea to account for previous unswitches that already happened on this +/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch +/// candidates available. Also consider the number of "sibling" loops with +/// the idea of accounting for previous unswitches that already happened on this /// cluster of loops. There was an attempt to keep this formula simple, /// just enough to limit the worst case behavior. Even if it is not that simple /// now it is still not an attempt to provide a detailed heuristic size @@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier( return 1; } + // Each invariant non-trivial condition, after being unswitched, is supposed + // to have its own specialized sibling loop (the invariant condition has been + // hoisted out of the child loop into a newly-cloned loop). When unswitching + // conditions in nested loops, the basic block size of the outer loop should + // not be altered. If such a size significantly increases across unswitching + // invocations, something may be wrong; so adjust the final cost taking this + // into account. auto *ParentL = L.getParentLoop(); + int ParentLoopSizeMultiplier = 1; + if (ParentL) + ParentLoopSizeMultiplier = + std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1); + int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() : std::distance(LI.begin(), LI.end())); // Count amount of clones that all the candidates might cause during @@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier( // at an upper bound. 
int CostMultiplier; if (ClonesPower > Log2_32(UnswitchThreshold) || - SiblingsMultiplier > UnswitchThreshold) + SiblingsMultiplier > UnswitchThreshold || + ParentLoopSizeMultiplier > UnswitchThreshold) CostMultiplier = UnswitchThreshold; else CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower), (int)UnswitchThreshold); LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier - << " (siblings " << SiblingsMultiplier << " * clones " + << " (siblings " << SiblingsMultiplier << " * parent size " + << ParentLoopSizeMultiplier << " * clones " << (1 << ClonesPower) << ")" << " for unswitch candidate: " << TI << "\n"); return CostMultiplier; diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index bb7dbc2980f5..e05625344ee2 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -997,7 +997,8 @@ void StructurizeCFG::simplifyHoistedPhis() { continue; OtherPhi->setIncomingValue(PoisonValBBIdx, V); - Phi->setIncomingValue(i, OtherV); + if (DT->dominates(OtherV, Phi)) + Phi->setIncomingValue(i, OtherV); } } } diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index c76b3afef50c..27b13eeaf4d7 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -1285,7 +1285,7 @@ private: // Cache misses on the merged chain double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount; double MergedSize = ChainPred->Size + ChainSucc->Size; - double MergedDensity = static_cast<double>(MergedCounts) / MergedSize; + double MergedDensity = MergedCounts / MergedSize; double NewScore = MergedCounts * missProbability(MergedDensity); return CurScore - NewScore; diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 7063cde5263b..5a09b7385f2b 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ 
b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -254,7 +254,6 @@ bool llvm::applyDebugifyMetadata( } if (ApplyToMF) ApplyToMF(DIB, F); - DIB.finalizeSubprogram(SP); } DIB.finalize(); diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 3bbe875bbe9e..1a9e16be6989 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -13,6 +13,8 @@ #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/TimeProfiler.h" + using namespace llvm; /// Uses the "source_filename" instead of a Module hash ID for the suffix of @@ -370,6 +372,7 @@ void FunctionImportGlobalProcessing::run() { processGlobalsForThinLTO(); } void llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index, bool ClearDSOLocalOnDeclarations, SetVector<GlobalValue *> *GlobalsToImport) { + llvm::TimeTraceScope timeScope("Rename module for ThinLTO"); FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport, ClearDSOLocalOnDeclarations); ThinLTOProcessing.run(); diff --git a/llvm/lib/Transforms/Utils/IRNormalizer.cpp b/llvm/lib/Transforms/Utils/IRNormalizer.cpp index ad91318ae474..fefa49f68c8d 100644 --- a/llvm/lib/Transforms/Utils/IRNormalizer.cpp +++ b/llvm/lib/Transforms/Utils/IRNormalizer.cpp @@ -427,7 +427,7 @@ void IRNormalizer::reorderInstructions(Function &F) const { // Process the remaining instructions. // // TODO: Do more a intelligent sorting of these instructions. For example, - // seperate between dead instructinos and instructions used in another + // separate between dead instructions and instructions used in another // block. Use properties of the CFG the order instructions that are used // in another block. 
if (Visited.contains(&I)) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ac344904f90f..2cfd70a1746c 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3397,8 +3397,8 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, if (FP && Ty.isFloatingPointTy() && Ty.getScalarSizeInBits() <= 64) { const APFloat &APF = FP->getValueAPF(); APInt const &API = APF.bitcastToAPInt(); - if (auto Temp = API.getZExtValue()) - return DIB.createConstantValueExpression(static_cast<uint64_t>(Temp)); + if (uint64_t Temp = API.getZExtValue()) + return DIB.createConstantValueExpression(Temp); return DIB.createConstantValueExpression(*API.getRawData()); } @@ -3838,8 +3838,8 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin( bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) { const auto *Op = I->getOperand(OpIdx); - // We can't have a PHI with a metadata type. - if (Op->getType()->isMetadataTy()) + // We can't have a PHI with a metadata or token type. + if (Op->getType()->isMetadataTy() || Op->getType()->isTokenLikeTy()) return false; // swifterror pointers can only be used by a load, store, or as a swifterror diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index ba0ac01cadd8..735bad1cb134 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -225,9 +225,9 @@ protected: // Auxiliary function to calculate the number of iterations for a comparison // instruction or a binary operator. - PeelCounter mergeTwoCounter(const Instruction &CmpOrBinaryOp, - const PeelCounterValue &LHS, - const PeelCounterValue &RHS) const; + PeelCounter mergeTwoCounters(const Instruction &CmpOrBinaryOp, + const PeelCounterValue &LHS, + const PeelCounterValue &RHS) const; // Returns true if the \p Phi is an induction in the target loop. 
This is a // lightweight check and possible to detect an IV in some cases. @@ -269,15 +269,13 @@ bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const { break; // Avoid infinite loop. - if (Visited.contains(Cur)) + if (!Visited.insert(Cur).second) return false; auto *I = dyn_cast<Instruction>(Cur); if (!I || !L.contains(I)) return false; - Visited.insert(Cur); - if (auto *Cast = dyn_cast<CastInst>(I)) { Cur = Cast->getOperand(0); } else if (auto *BinOp = dyn_cast<BinaryOperator>(I)) { @@ -300,14 +298,14 @@ bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const { /// When either \p LHS or \p RHS is an IV, the result of \p CmpOrBinaryOp is /// considered an IV only if it is an addition or a subtraction. Otherwise the -/// result can be a value that is neither an loop-invariant nor an IV. +/// result can be a value that is neither a loop-invariant nor an IV. /// /// If both \p LHS and \p RHS are loop-invariants, then the result of /// \CmpOrBinaryOp is also a loop-invariant. PhiAnalyzer::PeelCounter -PhiAnalyzer::mergeTwoCounter(const Instruction &CmpOrBinaryOp, - const PeelCounterValue &LHS, - const PeelCounterValue &RHS) const { +PhiAnalyzer::mergeTwoCounters(const Instruction &CmpOrBinaryOp, + const PeelCounterValue &LHS, + const PeelCounterValue &RHS) const { auto &[LVal, LTy] = LHS; auto &[RVal, RTy] = RHS; unsigned NewVal = std::max(LVal, RVal); @@ -380,7 +378,7 @@ PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) { if (RHS == Unknown) return Unknown; return (IterationsToInvarianceOrInduction[I] = - mergeTwoCounter(*I, *LHS, *RHS)); + mergeTwoCounters(*I, *LHS, *RHS)); } if (I->isCast()) // Cast instructions get the value of the operand. 
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 86b268de43cf..b18aceaa67d7 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden, #endif ); +static cl::opt<bool> UnrollAddParallelReductions( + "unroll-add-parallel-reductions", cl::init(false), cl::Hidden, + cl::desc("Allow unrolling to add parallel reduction phis.")); /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. @@ -660,6 +664,41 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, OrigPHINode.push_back(cast<PHINode>(I)); } + // Collect phi nodes for reductions for which we can introduce multiple + // parallel reduction phis and compute the final reduction result after the + // loop. This requires a single exit block after unrolling. This is ensured by + // restricting to single-block loops where the unrolled iterations are known + // to not exit. + DenseMap<PHINode *, RecurrenceDescriptor> Reductions; + bool CanAddAdditionalAccumulators = + (UnrollAddParallelReductions.getNumOccurrences() > 0 + ? UnrollAddParallelReductions + : ULO.AddAdditionalAccumulators) && + !CompletelyUnroll && L->getNumBlocks() == 1 && + (ULO.Runtime || + (ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 && + ExitInfos[Header].BreakoutTrip == 0)))); + + // Limit parallelizing reductions to unroll counts of 4 or less for now. + // TODO: The number of parallel reductions should depend on the number of + // execution units. 
We also don't have to add a parallel reduction phi per + // unrolled iteration, but could for example add a parallel phi for every 2 + // unrolled iterations. + if (CanAddAdditionalAccumulators && ULO.Count <= 4) { + for (PHINode &Phi : Header->phis()) { + auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE); + if (!RdxDesc) + continue; + + // Only handle duplicate phis for a single reduction for now. + // TODO: Handle any number of reductions + if (!Reductions.empty()) + continue; + + Reductions[&Phi] = *RdxDesc; + } + } + std::vector<BasicBlock *> Headers; std::vector<BasicBlock *> Latches; Headers.push_back(Header); @@ -710,6 +749,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // latch. This is a reasonable default placement if we don't have block // frequencies, and if we do, well the layout will be adjusted later. auto BlockInsertPt = std::next(LatchBlock->getIterator()); + SmallVector<Instruction *> PartialReductions; for (unsigned It = 1; It != ULO.Count; ++It) { SmallVector<BasicBlock *, 8> NewBlocks; SmallDenseMap<const Loop *, Loop *, 4> NewLoops; @@ -733,6 +773,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, for (PHINode *OrigPHI : OrigPHINode) { PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); + + // Use cloned phis as parallel phis for partial reductions, which will + // get combined to the final reduction result after the loop. + if (Reductions.contains(OrigPHI)) { + // Collect partial reduction results. + if (PartialReductions.empty()) + PartialReductions.push_back(cast<Instruction>(InVal)); + PartialReductions.push_back(cast<Instruction>(VMap[InVal])); + + // Update the start value for the cloned phis to use the identity + // value for the reduction. 
+ const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI]; + NewPHI->setIncomingValueForBlock( + L->getLoopPreheader(), + getRecurrenceIdentity(RdxDesc.getRecurrenceKind(), + OrigPHI->getType(), + RdxDesc.getFastMathFlags())); + + // Update NewPHI to use the cloned value for the iteration and move + // to header. + NewPHI->replaceUsesOfWith(InVal, VMap[InVal]); + NewPHI->moveBefore(OrigPHI->getIterator()); + continue; + } + if (Instruction *InValI = dyn_cast<Instruction>(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; @@ -832,6 +897,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); PN->eraseFromParent(); } else if (ULO.Count > 1) { + if (Reductions.contains(PN)) + continue; + Value *InVal = PN->removeIncomingValue(LatchBlock, false); // If this value was defined in the loop, take the value defined by the // last iteration of the loop. @@ -1010,6 +1078,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } + // If there are partial reductions, create code in the exit block to compute + // the final result and update users of the final result. 
+ if (!PartialReductions.empty()) { + BasicBlock *ExitBlock = L->getExitBlock(); + assert(ExitBlock && + "Can only introduce parallel reduction phis with single exit block"); + assert(Reductions.size() == 1 && + "currently only a single reduction is supported"); + Value *FinalRdxValue = PartialReductions.back(); + Value *RdxResult = nullptr; + for (PHINode &Phi : ExitBlock->phis()) { + if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue) + continue; + if (!RdxResult) { + RdxResult = PartialReductions.front(); + IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt()); + RecurKind RK = Reductions.begin()->second.getRecurrenceKind(); + for (Instruction *RdxPart : drop_begin(PartialReductions)) { + RdxResult = Builder.CreateBinOp( + (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), + RdxPart, RdxResult, "bin.rdx"); + } + NeedToFixLCSSA = true; + for (Instruction *RdxPart : PartialReductions) + RdxPart->dropPoisonGeneratingFlags(); + } + + Phi.replaceAllUsesWith(RdxResult); + continue; + } + } + if (DTUToUse) { // Apply updates to the DomTree. DT = &DTU.getDomTree(); @@ -1111,3 +1211,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { } return nullptr; } + +std::optional<RecurrenceDescriptor> +llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, + ScalarEvolution *SE) { + RecurrenceDescriptor RdxDesc; + if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc, + /*DemandedBits=*/nullptr, + /*AC=*/nullptr, /*DT=*/nullptr, SE)) + return std::nullopt; + RecurKind RK = RdxDesc.getRecurrenceKind(); + // Skip unsupported reductions. + // TODO: Handle additional reductions, including FP and min-max + // reductions. 
+ if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) || + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindIVRecurrenceKind(RK) || + RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) + return std::nullopt; + + if (RdxDesc.IntermediateStore) + return std::nullopt; + + // Don't unroll reductions with constant ops; those can be folded to a + // single induction update. + if (any_of(cast<Instruction>(Phi.getIncomingValueForBlock(L->getLoopLatch())) + ->operands(), + IsaPred<Constant>)) + return std::nullopt; + + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || + !is_contained( + cast<Instruction>(Phi.getIncomingValueForBlock(Latch))->operands(), + &Phi)) + return std::nullopt; + + return RdxDesc; +} diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 843364eb34f8..b172ef6ba080 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -2032,6 +2032,7 @@ Value *llvm::addRuntimeChecks( MemoryRuntimeCheck = IsConflict; } + Exp.eraseDeadInstructions(MemoryRuntimeCheck); return MemoryRuntimeCheck; } @@ -2077,6 +2078,7 @@ Value *llvm::addDiffRuntimeChecks( MemoryRuntimeCheck = IsConflict; } + Expander.eraseDeadInstructions(MemoryRuntimeCheck); return MemoryRuntimeCheck; } diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 1711163fb9f5..ec2e6c1ab796 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -81,6 +81,8 @@ void LoopVersioning::versionLoop( } else RuntimeCheck = MemRuntimeCheck ? 
MemRuntimeCheck : SCEVRuntimeCheck; + Exp.eraseDeadInstructions(SCEVRuntimeCheck); + assert(RuntimeCheck && "called even though we don't need " "any runtime checks"); diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp index 41647f7717a4..faacd422c009 100644 --- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp +++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp @@ -155,12 +155,15 @@ PreservedAnalyses ProfileVerifierPass::run(Function &F, FunctionAnalysisManager &FAM) { const auto EntryCount = F.getEntryCount(/*AllowSynthetic=*/true); if (!EntryCount) { - F.getContext().emitError("Profile verification failed: function entry " - "count missing (set to 0 if cold)"); + auto *MD = F.getMetadata(LLVMContext::MD_prof); + if (!MD || !isExplicitlyUnknownProfileMetadata(*MD)) { + F.getContext().emitError("Profile verification failed: function entry " + "count missing (set to 0 if cold)"); + return PreservedAnalyses::all(); + } + } else if (EntryCount->getCount() == 0) { return PreservedAnalyses::all(); } - if (EntryCount->getCount() == 0) - return PreservedAnalyses::all(); for (const auto &BB : F) { if (AnnotateSelect) { for (const auto &I : BB) diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 10c162bc6463..d93a4d87f30f 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -849,9 +849,12 @@ void PromoteMem2Reg::run() { for (unsigned i = 0, e = Allocas.size(); i != e; ++i) IncomingVals.init(i, UndefValue::get(Allocas[i]->getAllocatedType())); - // When handling debug info, treat all incoming values as if they have unknown - // locations until proven otherwise. + // When handling debug info, treat all incoming values as if they have + // compiler-generated (empty) locations, representing the uninitialized + // alloca, until proven otherwise. 
IncomingLocs.resize(Allocas.size()); + for (unsigned i = 0, e = Allocas.size(); i != e; ++i) + IncomingLocs.init(i, DebugLoc::getCompilerGenerated()); // The renamer uses the Visited set to avoid infinite loops. Visited.resize(F.getMaxBlockNumber(), false); diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index d53a3144bf57..a814867652cd 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -21,29 +21,20 @@ using namespace llvm; -static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { +struct LookupTableInfo { + Value *Index; + SmallVector<Constant *> Ptrs; +}; + +static bool shouldConvertToRelLookupTable(LookupTableInfo &Info, Module &M, + GlobalVariable &GV) { // If lookup table has more than one user, // do not generate a relative lookup table. // This is to simplify the analysis that needs to be done for this pass. // TODO: Add support for lookup tables with multiple uses. // For ex, this can happen when a function that uses a lookup table gets // inlined into multiple call sites. - if (!GV.hasInitializer() || - !GV.isConstant() || - !GV.hasOneUse()) - return false; - - GetElementPtrInst *GEP = - dyn_cast<GetElementPtrInst>(GV.use_begin()->getUser()); - if (!GEP || !GEP->hasOneUse() || - GV.getValueType() != GEP->getSourceElementType()) - return false; - - LoadInst *Load = dyn_cast<LoadInst>(GEP->use_begin()->getUser()); - if (!Load || !Load->hasOneUse() || - Load->getType() != GEP->getResultElementType()) - return false; - + // // If the original lookup table does not have local linkage and is // not dso_local, do not generate a relative lookup table. 
// This optimization creates a relative lookup table that consists of @@ -51,21 +42,40 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { // To be able to generate these offsets, relative lookup table and // its elements should have internal linkage and be dso_local, which means // that they should resolve to symbols within the same linkage unit. - if (!GV.hasLocalLinkage() || - !GV.isDSOLocal() || - !GV.isImplicitDSOLocal()) + if (!GV.hasInitializer() || !GV.isConstant() || !GV.hasOneUse() || + !GV.hasLocalLinkage() || !GV.isDSOLocal() || !GV.isImplicitDSOLocal()) return false; - ConstantArray *Array = dyn_cast<ConstantArray>(GV.getInitializer()); - if (!Array) + auto *GEP = dyn_cast<GetElementPtrInst>(GV.use_begin()->getUser()); + if (!GEP || !GEP->hasOneUse()) + return false; + + auto *Load = dyn_cast<LoadInst>(GEP->use_begin()->getUser()); + if (!Load || !Load->hasOneUse()) return false; // If values are not 64-bit pointers, do not generate a relative lookup table. const DataLayout &DL = M.getDataLayout(); - Type *ElemType = Array->getType()->getElementType(); + Type *ElemType = Load->getType(); if (!ElemType->isPointerTy() || DL.getPointerTypeSizeInBits(ElemType) != 64) return false; + // Make sure this is a gep of the form GV + scale*var. + unsigned IndexWidth = + DL.getIndexTypeSizeInBits(Load->getPointerOperand()->getType()); + SmallMapVector<Value *, APInt, 4> VarOffsets; + APInt ConstOffset(IndexWidth, 0); + if (!GEP->collectOffset(DL, IndexWidth, VarOffsets, ConstOffset) || + !ConstOffset.isZero() || VarOffsets.size() != 1) + return false; + + // This can't be a pointer lookup table if the stride is smaller than a + // pointer. + Info.Index = VarOffsets.front().first; + const APInt &Stride = VarOffsets.front().second; + if (Stride.ult(DL.getTypeStoreSize(ElemType))) + return false; + SmallVector<GlobalVariable *, 4> GVOps; Triple TT = M.getTargetTriple(); // FIXME: This should be removed in the future. 
@@ -80,14 +90,20 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { // https://github.com/rust-lang/rust/issues/141306. || (TT.isX86() && TT.isOSDarwin()); - for (const Use &Op : Array->operands()) { - Constant *ConstOp = cast<Constant>(&Op); + APInt Offset(IndexWidth, 0); + uint64_t GVSize = DL.getTypeAllocSize(GV.getValueType()); + for (; Offset.ult(GVSize); Offset += Stride) { + Constant *C = + ConstantFoldLoadFromConst(GV.getInitializer(), ElemType, Offset, DL); + if (!C) + return false; + GlobalValue *GVOp; - APInt Offset; + APInt GVOffset; // If an operand is not a constant offset from a lookup table, // do not generate a relative lookup table. - if (!IsConstantOffsetFromGlobal(ConstOp, GVOp, Offset, DL)) + if (!IsConstantOffsetFromGlobal(C, GVOp, GVOffset, DL)) return false; // If operand is mutable, do not generate a relative lookup table. @@ -102,6 +118,8 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { if (ShouldDropUnnamedAddr) GVOps.push_back(GlovalVarOp); + + Info.Ptrs.push_back(C); } if (ShouldDropUnnamedAddr) @@ -111,14 +129,12 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { return true; } -static GlobalVariable *createRelLookupTable(Function &Func, +static GlobalVariable *createRelLookupTable(LookupTableInfo &Info, + Function &Func, GlobalVariable &LookupTable) { Module &M = *Func.getParent(); - ConstantArray *LookupTableArr = - cast<ConstantArray>(LookupTable.getInitializer()); - unsigned NumElts = LookupTableArr->getType()->getNumElements(); ArrayType *IntArrayTy = - ArrayType::get(Type::getInt32Ty(M.getContext()), NumElts); + ArrayType::get(Type::getInt32Ty(M.getContext()), Info.Ptrs.size()); GlobalVariable *RelLookupTable = new GlobalVariable( M, IntArrayTy, LookupTable.isConstant(), LookupTable.getLinkage(), @@ -127,10 +143,9 @@ static GlobalVariable *createRelLookupTable(Function &Func, LookupTable.isExternallyInitialized()); uint64_t Idx = 0; - 
SmallVector<Constant *, 64> RelLookupTableContents(NumElts); + SmallVector<Constant *, 64> RelLookupTableContents(Info.Ptrs.size()); - for (Use &Operand : LookupTableArr->operands()) { - Constant *Element = cast<Constant>(Operand); + for (Constant *Element : Info.Ptrs) { Type *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); Constant *Base = llvm::ConstantExpr::getPtrToInt(RelLookupTable, IntPtrTy); Constant *Target = llvm::ConstantExpr::getPtrToInt(Element, IntPtrTy); @@ -148,7 +163,8 @@ static GlobalVariable *createRelLookupTable(Function &Func, return RelLookupTable; } -static void convertToRelLookupTable(GlobalVariable &LookupTable) { +static void convertToRelLookupTable(LookupTableInfo &Info, + GlobalVariable &LookupTable) { GetElementPtrInst *GEP = cast<GetElementPtrInst>(LookupTable.use_begin()->getUser()); LoadInst *Load = cast<LoadInst>(GEP->use_begin()->getUser()); @@ -159,21 +175,21 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { Function &Func = *BB->getParent(); // Generate an array that consists of relative offsets. - GlobalVariable *RelLookupTable = createRelLookupTable(Func, LookupTable); + GlobalVariable *RelLookupTable = + createRelLookupTable(Info, Func, LookupTable); // Place new instruction sequence before GEP. Builder.SetInsertPoint(GEP); - Value *Index = GEP->getOperand(2); - IntegerType *IntTy = cast<IntegerType>(Index->getType()); - Value *Offset = - Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift"); + IntegerType *IntTy = cast<IntegerType>(Info.Index->getType()); + Value *Offset = Builder.CreateShl(Info.Index, ConstantInt::get(IntTy, 2), + "reltable.shift"); // Insert the call to load.relative intrinsic before LOAD. // GEP might not be immediately followed by a LOAD, like it can be hoisted // outside the loop or another instruction might be inserted them in between. 
Builder.SetInsertPoint(Load); Function *LoadRelIntrinsic = llvm::Intrinsic::getOrInsertDeclaration( - &M, Intrinsic::load_relative, {Index->getType()}); + &M, Intrinsic::load_relative, {Info.Index->getType()}); // Create a call to load.relative intrinsic that computes the target address // by adding base address (lookup table address) and relative offset. @@ -205,10 +221,11 @@ static bool convertToRelativeLookupTables( bool Changed = false; for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) { - if (!shouldConvertToRelLookupTable(M, GV)) + LookupTableInfo Info; + if (!shouldConvertToRelLookupTable(Info, M, GV)) continue; - convertToRelLookupTable(GV); + convertToRelLookupTable(Info, GV); // Remove the original lookup table. GV.eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 060ca92e559a..28befd0aa1ce 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #if LLVM_ENABLE_ABI_BREAKING_CHECKS @@ -175,6 +176,26 @@ SCEVExpander::findInsertPointAfter(Instruction *I, return IP; } +void SCEVExpander::eraseDeadInstructions(Value *Root) { + SmallVector<Value *> WorkList; + SmallPtrSet<Value *, 8> DeletedValues; + append_range(WorkList, getAllInsertedInstructions()); + while (!WorkList.empty()) { + Value *V = WorkList.pop_back_val(); + if (DeletedValues.contains(V)) + continue; + auto *I = dyn_cast<Instruction>(V); + if (!I || I == Root || !isInsertedInstruction(I) || + !isInstructionTriviallyDead(I)) + continue; + append_range(WorkList, I->operands()); + InsertedValues.erase(I); + InsertedPostIncValues.erase(I); + DeletedValues.insert(I); + I->eraseFromParent(); + } +} + 
BasicBlock::iterator SCEVExpander::GetOptimalInsertionPointForCastOf(Value *V) const { // Cast the argument at the beginning of the entry block, after @@ -1239,10 +1260,13 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { if (!isa<SCEVAddRecExpr>(ExitSCEV)) continue; Type *PhiTy = PN.getType(); - if (STy->isIntegerTy() && PhiTy->isPointerTy()) + if (STy->isIntegerTy() && PhiTy->isPointerTy()) { ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy); - else if (S->getType() != PN.getType()) + if (isa<SCEVCouldNotCompute>(ExitSCEV)) + continue; + } else if (S->getType() != PN.getType()) { continue; + } // Check if we can re-use the existing PN, by adjusting it with an expanded // offset, if the offset is simpler. @@ -2184,8 +2208,15 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, // negative. If Step is known to be positive or negative, only create // either 1. or 2. auto ComputeEndCheck = [&]() -> Value * { - // Checking <u 0 is always false. - if (!Signed && Start->isZero() && SE.isKnownPositive(Step)) + // Checking <u 0 is always false, if (Step * trunc ExitCount) does not wrap. + // TODO: Predicates that can be proven true/false should be discarded when + // the predicates are created, not late during expansion. + if (!Signed && Start->isZero() && SE.isKnownPositive(Step) && + DstBits < SrcBits && + ExitCount == SE.getZeroExtendExpr(SE.getTruncateExpr(ExitCount, ARTy), + ExitCount->getType()) && + SE.willNotOverflow(Instruction::Mul, Signed, Step, + SE.getTruncateExpr(ExitCount, ARTy))) return ConstantInt::getFalse(Loc->getContext()); // Get the backedge taken count and truncate or extended to the AR type. 
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 7a538ae2c583..970f85378d3d 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -612,6 +612,18 @@ private: /// If CompValue is already set, the function is expected to fail if a match /// is found but the value compared to is different. bool matchInstruction(Instruction *I, bool isEQ) { + if (match(I, m_Not(m_Instruction(I)))) + isEQ = !isEQ; + + Value *Val; + if (match(I, m_NUWTrunc(m_Value(Val)))) { + // If we already have a value for the switch, it has to match! + if (!setValueOnce(Val)) + return false; + UsedICmps++; + Vals.push_back(ConstantInt::get(cast<IntegerType>(Val->getType()), isEQ)); + return true; + } // If this is an icmp against a constant, handle this as one of the cases. ICmpInst *ICI; ConstantInt *C; @@ -2260,10 +2272,6 @@ static bool canSinkInstructions( for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) { Value *Op = I0->getOperand(OI); - if (Op->getType()->isTokenTy()) - // Don't touch any operand of token type. - return false; - auto SameAsI0 = [&I0, OI](const Instruction *I) { assert(I->getNumOperands() == I0->getNumOperands()); return I->getOperand(OI) == I0->getOperand(OI); @@ -2764,8 +2772,7 @@ bool CompatibleSets::shouldBelongToSameSet(ArrayRef<InvokeInst *> Invokes) { Use &U1 = std::get<1>(Ops); if (U0 == U1) return false; - return U0->getType()->isTokenTy() || - !canReplaceOperandWithVariable(cast<Instruction>(U0.getUser()), + return !canReplaceOperandWithVariable(cast<Instruction>(U0.getUser()), U0.getOperandNo()); }; assert(Invokes.size() == 2 && "Always called with exactly two candidates."); @@ -4404,10 +4411,12 @@ static bool mergeConditionalStoreToAddress( // OK, we're going to sink the stores to PostBB. The store has to be // conditional though, so first create the predicate. 
- Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()) - ->getCondition(); - Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator()) - ->getCondition(); + BranchInst *PBranch = + cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()); + BranchInst *QBranch = + cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator()); + Value *PCond = PBranch->getCondition(); + Value *QCond = QBranch->getCondition(); Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), PStore->getParent()); @@ -4418,13 +4427,11 @@ static bool mergeConditionalStoreToAddress( IRBuilder<> QB(PostBB, PostBBFirst); QB.SetCurrentDebugLocation(PostBBFirst->getStableDebugLoc()); - Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond); - Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond); + InvertPCond ^= (PStore->getParent() != PTB); + InvertQCond ^= (QStore->getParent() != QTB); + Value *PPred = InvertPCond ? QB.CreateNot(PCond) : PCond; + Value *QPred = InvertQCond ? QB.CreateNot(QCond) : QCond; - if (InvertPCond) - PPred = QB.CreateNot(PPred); - if (InvertQCond) - QPred = QB.CreateNot(QPred); Value *CombinedPred = QB.CreateOr(PPred, QPred); BasicBlock::iterator InsertPt = QB.GetInsertPoint(); @@ -4808,23 +4815,12 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, SelectInst *NV = cast<SelectInst>( Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux")); PN.setIncomingValue(PBBIdx, NV); - // Although the select has the same condition as PBI, the original branch - // weights for PBI do not apply to the new select because the select's - // 'logical' edges are incoming edges of the phi that is eliminated, not - // the outgoing edges of PBI. + // The select has the same condition as PBI, in the same BB. The + // probabilities don't change. if (HasWeights) { - uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight; - uint64_t PredOther = PBIOp ? 
PredTrueWeight : PredFalseWeight; - uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight; - uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight; - // The weight to PredCommonDest should be PredCommon * SuccTotal. - // The weight to PredOtherDest should be PredOther * SuccCommon. - uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther), - PredOther * SuccCommon}; - - fitWeights(NewWeights); - - setBranchWeights(NV, NewWeights[0], NewWeights[1], + uint64_t TrueWeight = PBIOp ? PredFalseWeight : PredTrueWeight; + uint64_t FalseWeight = PBIOp ? PredTrueWeight : PredFalseWeight; + setBranchWeights(NV, TrueWeight, FalseWeight, /*IsExpected=*/false); } } @@ -6437,34 +6433,42 @@ static bool trySwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder, namespace { -/// This class represents a lookup table that can be used to replace a switch. -class SwitchLookupTable { +/// This class finds alternatives for switches to ultimately +/// replace the switch. +class SwitchReplacement { public: - /// Create a lookup table to use as a switch replacement with the contents - /// of Values, using DefaultValue to fill any holes in the table. - SwitchLookupTable( + /// Create a helper for optimizations to use as a switch replacement. + /// Find a better representation for the content of Values, + /// using DefaultValue to fill any holes in the table. + SwitchReplacement( Module &M, uint64_t TableSize, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName); - /// Build instructions with Builder to retrieve the value at - /// the position given by Index in the lookup table. - Value *buildLookup(Value *Index, IRBuilder<> &Builder, const DataLayout &DL); + /// Build instructions with Builder to retrieve values using Index + /// and replace the switch. 
+ Value *replaceSwitch(Value *Index, IRBuilder<> &Builder, const DataLayout &DL, + Function *Func); /// Return true if a table with TableSize elements of /// type ElementType would fit in a target-legal register. static bool wouldFitInRegister(const DataLayout &DL, uint64_t TableSize, Type *ElementType); + /// Return the default value of the switch. + Constant *getDefaultValue(); + + /// Return true if the replacement is a lookup table. + bool isLookupTable(); + private: - // Depending on the contents of the table, it can be represented in - // different ways. + // Depending on the switch, there are different alternatives. enum { - // For tables where each element contains the same value, we just have to + // For switches where each case contains the same value, we just have to // store that single value and return it for each lookup. SingleValueKind, - // For tables where there is a linear relationship between table index + // For switches where there is a linear relationship between table index // and values. We calculate the result with a simple multiplication // and addition instead of a table lookup. LinearMapKind, @@ -6476,9 +6480,15 @@ private: // The table is stored as an array of values. Values are retrieved by load // instructions from the table. - ArrayKind + LookupTableKind } Kind; + // The default value of the switch. + Constant *DefaultValue; + + // The type of the output values. + Type *ValueType; + // For SingleValueKind, this is the single value. Constant *SingleValue = nullptr; @@ -6491,23 +6501,24 @@ private: ConstantInt *LinearMultiplier = nullptr; bool LinearMapValWrapped = false; - // For ArrayKind, this is the array. - GlobalVariable *Array = nullptr; + // For LookupTableKind, this is the table. 
+ Constant *Initializer = nullptr; }; } // end anonymous namespace -SwitchLookupTable::SwitchLookupTable( +SwitchReplacement::SwitchReplacement( Module &M, uint64_t TableSize, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, - Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) { + Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) + : DefaultValue(DefaultValue) { assert(Values.size() && "Can't build lookup table without values!"); assert(TableSize >= Values.size() && "Can't fit values in table!"); // If all values in the table are equal, this is that value. SingleValue = Values.begin()->second; - Type *ValueType = Values.begin()->second->getType(); + ValueType = Values.begin()->second->getType(); // Build up the table contents. SmallVector<Constant *, 64> TableContents(TableSize); @@ -6597,7 +6608,6 @@ SwitchLookupTable::SwitchLookupTable( (void)M.smul_ov(APInt(M.getBitWidth(), TableSize - 1), MayWrap); LinearMapValWrapped = NonMonotonic || MayWrap; Kind = LinearMapKind; - ++NumLinearMaps; return; } } @@ -6617,30 +6627,23 @@ SwitchLookupTable::SwitchLookupTable( BitMap = ConstantInt::get(M.getContext(), TableInt); BitMapElementTy = IT; Kind = BitMapKind; - ++NumBitMaps; return; } // Store the table in an array. - ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize); - Constant *Initializer = ConstantArray::get(ArrayTy, TableContents); - - Array = new GlobalVariable(M, ArrayTy, /*isConstant=*/true, - GlobalVariable::PrivateLinkage, Initializer, - "switch.table." + FuncName); - Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - // Set the alignment to that of an array items. We will be only loading one - // value out of it. 
- Array->setAlignment(DL.getPrefTypeAlign(ValueType)); - Kind = ArrayKind; + auto *TableTy = ArrayType::get(ValueType, TableSize); + Initializer = ConstantArray::get(TableTy, TableContents); + + Kind = LookupTableKind; } -Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, - const DataLayout &DL) { +Value *SwitchReplacement::replaceSwitch(Value *Index, IRBuilder<> &Builder, + const DataLayout &DL, Function *Func) { switch (Kind) { case SingleValueKind: return SingleValue; case LinearMapKind: { + ++NumLinearMaps; // Derive the result value from the input value. Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(), false, "switch.idx.cast"); @@ -6656,6 +6659,7 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, return Result; } case BitMapKind: { + ++NumBitMaps; // Type of the bitmap (e.g. i59). IntegerType *MapTy = BitMap->getIntegerType(); @@ -6677,9 +6681,18 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, // Mask off. return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked"); } - case ArrayKind: { - Type *IndexTy = DL.getIndexType(Array->getType()); - auto *ArrayTy = cast<ArrayType>(Array->getValueType()); + case LookupTableKind: { + ++NumLookupTables; + auto *Table = + new GlobalVariable(*Func->getParent(), Initializer->getType(), + /*isConstant=*/true, GlobalVariable::PrivateLinkage, + Initializer, "switch.table." + Func->getName()); + Table->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + // Set the alignment to that of an array items. We will be only loading one + // value out of it. 
+ Table->setAlignment(DL.getPrefTypeAlign(ValueType)); + Type *IndexTy = DL.getIndexType(Table->getType()); + auto *ArrayTy = cast<ArrayType>(Table->getValueType()); if (Index->getType() != IndexTy) { unsigned OldBitWidth = Index->getType()->getIntegerBitWidth(); @@ -6691,14 +6704,14 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder, Value *GEPIndices[] = {ConstantInt::get(IndexTy, 0), Index}; Value *GEP = - Builder.CreateInBoundsGEP(ArrayTy, Array, GEPIndices, "switch.gep"); + Builder.CreateInBoundsGEP(ArrayTy, Table, GEPIndices, "switch.gep"); return Builder.CreateLoad(ArrayTy->getElementType(), GEP, "switch.load"); } } - llvm_unreachable("Unknown lookup table kind!"); + llvm_unreachable("Unknown helper kind!"); } -bool SwitchLookupTable::wouldFitInRegister(const DataLayout &DL, +bool SwitchReplacement::wouldFitInRegister(const DataLayout &DL, uint64_t TableSize, Type *ElementType) { auto *IT = dyn_cast<IntegerType>(ElementType); @@ -6734,6 +6747,10 @@ static bool isTypeLegalForLookupTable(Type *Ty, const TargetTransformInfo &TTI, DL.fitsInLegalInteger(IT->getBitWidth()); } +Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; } + +bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; } + static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) { // 40% is the default density for building a jump table in optsize/minsize // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this @@ -6760,25 +6777,23 @@ static bool isSwitchDense(ArrayRef<int64_t> Values) { // TODO: We could support larger than legal types by limiting based on the // number of loads required and/or table size. If the constants are small we // could use smaller table entries and extend after the load. 
-static bool -shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, - const TargetTransformInfo &TTI, const DataLayout &DL, - const SmallDenseMap<PHINode *, Type *> &ResultTypes) { +static bool shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, + const TargetTransformInfo &TTI, + const DataLayout &DL, + const SmallVector<Type *> &ResultTypes) { if (SI->getNumCases() > TableSize) return false; // TableSize overflowed. bool AllTablesFitInRegister = true; bool HasIllegalType = false; - for (const auto &I : ResultTypes) { - Type *Ty = I.second; - + for (const auto &Ty : ResultTypes) { // Saturate this flag to true. HasIllegalType = HasIllegalType || !isTypeLegalForLookupTable(Ty, TTI, DL); // Saturate this flag to false. AllTablesFitInRegister = AllTablesFitInRegister && - SwitchLookupTable::wouldFitInRegister(DL, TableSize, Ty); + SwitchReplacement::wouldFitInRegister(DL, TableSize, Ty); // If both flags saturate, we're done. NOTE: This *only* works with // saturating flags, and all flags have to saturate first due to the @@ -6800,7 +6815,7 @@ shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, static bool shouldUseSwitchConditionAsTableIndex( ConstantInt &MinCaseVal, const ConstantInt &MaxCaseVal, - bool HasDefaultResults, const SmallDenseMap<PHINode *, Type *> &ResultTypes, + bool HasDefaultResults, const SmallVector<Type *> &ResultTypes, const DataLayout &DL, const TargetTransformInfo &TTI) { if (MinCaseVal.isNullValue()) return true; @@ -6808,10 +6823,9 @@ static bool shouldUseSwitchConditionAsTableIndex( MaxCaseVal.getLimitedValue() == std::numeric_limits<uint64_t>::max() || !HasDefaultResults) return false; - return all_of(ResultTypes, [&](const auto &KV) { - return SwitchLookupTable::wouldFitInRegister( - DL, MaxCaseVal.getLimitedValue() + 1 /* TableSize */, - KV.second /* ResultType */); + return all_of(ResultTypes, [&](const auto &ResultType) { + return SwitchReplacement::wouldFitInRegister( + DL, MaxCaseVal.getLimitedValue() + 1 /* 
TableSize */, ResultType); }); } @@ -6900,18 +6914,13 @@ static void reuseTableCompare( /// If the switch is only used to initialize one or more phi nodes in a common /// successor block with different constant values, replace the switch with /// lookup tables. -static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, - DomTreeUpdater *DTU, const DataLayout &DL, - const TargetTransformInfo &TTI) { +static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder, + DomTreeUpdater *DTU, const DataLayout &DL, + const TargetTransformInfo &TTI) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); BasicBlock *BB = SI->getParent(); Function *Fn = BB->getParent(); - // Only build lookup table when we have a target that supports it or the - // attribute is not set. - if (!TTI.shouldBuildLookupTables() || - (Fn->getFnAttribute("no-jump-tables").getValueAsBool())) - return false; // FIXME: If the switch is too sparse for a lookup table, perhaps we could // split off a dense part and build a lookup table for that. @@ -6938,7 +6947,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, SmallDenseMap<PHINode *, ResultListTy> ResultLists; SmallDenseMap<PHINode *, Constant *> DefaultResults; - SmallDenseMap<PHINode *, Type *> ResultTypes; + SmallVector<Type *> ResultTypes; SmallVector<PHINode *, 4> PHIs; for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) { @@ -6955,7 +6964,8 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, Results, DL, TTI)) return false; - // Append the result from this case to the list for each phi. + // Append the result and result types from this case to the list for each + // phi. 
for (const auto &I : Results) { PHINode *PHI = I.first; Constant *Value = I.second; @@ -6963,23 +6973,16 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (Inserted) PHIs.push_back(PHI); It->second.push_back(std::make_pair(CaseVal, Value)); + ResultTypes.push_back(PHI->getType()); } } - // Keep track of the result types. - for (PHINode *PHI : PHIs) { - ResultTypes[PHI] = ResultLists[PHI][0].second->getType(); - } - - uint64_t NumResults = ResultLists[PHIs[0]].size(); - // If the table has holes, we need a constant result for the default case // or a bitmask that fits in a register. SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList; bool HasDefaultResults = getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResultsList, DL, TTI); - for (const auto &I : DefaultResultsList) { PHINode *PHI = I.first; Constant *Result = I.second; @@ -6989,15 +6992,21 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, bool UseSwitchConditionAsTableIndex = shouldUseSwitchConditionAsTableIndex( *MinCaseVal, *MaxCaseVal, HasDefaultResults, ResultTypes, DL, TTI); uint64_t TableSize; - if (UseSwitchConditionAsTableIndex) + ConstantInt *TableIndexOffset; + if (UseSwitchConditionAsTableIndex) { TableSize = MaxCaseVal->getLimitedValue() + 1; - else + TableIndexOffset = ConstantInt::get(MaxCaseVal->getIntegerType(), 0); + } else { TableSize = (MaxCaseVal->getValue() - MinCaseVal->getValue()).getLimitedValue() + 1; + TableIndexOffset = MinCaseVal; + } + // If the default destination is unreachable, or if the lookup table covers // all values of the conditional variable, branch directly to the lookup table // BB. Otherwise, check that the condition is within the case range. 
+ uint64_t NumResults = ResultLists[PHIs[0]].size(); bool DefaultIsReachable = !SI->defaultDestUnreachable(); bool TableHasHoles = (NumResults < TableSize); @@ -7025,68 +7034,100 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (!shouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes)) return false; - std::vector<DominatorTree::UpdateType> Updates; - - // Compute the maximum table size representable by the integer type we are - // switching upon. - unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits(); - uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize; - assert(MaxTableSize >= TableSize && - "It is impossible for a switch to have more entries than the max " - "representable value of its input integer type's size."); - - // Create the BB that does the lookups. - Module &Mod = *CommonDest->getParent()->getParent(); - BasicBlock *LookupBB = BasicBlock::Create( - Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest); - // Compute the table index value. - Builder.SetInsertPoint(SI); Value *TableIndex; - ConstantInt *TableIndexOffset; if (UseSwitchConditionAsTableIndex) { - TableIndexOffset = ConstantInt::get(MaxCaseVal->getIntegerType(), 0); TableIndex = SI->getCondition(); - } else { - TableIndexOffset = MinCaseVal; + if (HasDefaultResults) { + // Grow the table to cover all possible index values to avoid the range + // check. It will use the default result to fill in the table hole later, + // so make sure it exist. + ConstantRange CR = + computeConstantRange(TableIndex, /* ForSigned */ false); + // Grow the table shouldn't have any size impact by checking + // wouldFitInRegister. + // TODO: Consider growing the table also when it doesn't fit in a register + // if no optsize is specified. 
+ const uint64_t UpperBound = CR.getUpper().getLimitedValue(); + if (!CR.isUpperWrapped() && + all_of(ResultTypes, [&](const auto &ResultType) { + return SwitchReplacement::wouldFitInRegister(DL, UpperBound, + ResultType); + })) { + // There may be some case index larger than the UpperBound (unreachable + // case), so make sure the table size does not get smaller. + TableSize = std::max(UpperBound, TableSize); + // The default branch is unreachable after we enlarge the lookup table. + // Adjust DefaultIsReachable to reuse code path. + DefaultIsReachable = false; + } + } + } + + // Keep track of the switch replacement for each phi + SmallDenseMap<PHINode *, SwitchReplacement> PhiToReplacementMap; + for (PHINode *PHI : PHIs) { + const auto &ResultList = ResultLists[PHI]; + + Type *ResultType = ResultList.begin()->second->getType(); + // Use any value to fill the lookup table holes. + Constant *DefaultVal = + AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI]; + StringRef FuncName = Fn->getName(); + SwitchReplacement Replacement(*Fn->getParent(), TableSize, TableIndexOffset, + ResultList, DefaultVal, DL, FuncName); + PhiToReplacementMap.insert({PHI, Replacement}); + } + + bool AnyLookupTables = any_of( + PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); }); + + // A few conditions prevent the generation of lookup tables: + // 1. The target does not support lookup tables. + // 2. The "no-jump-tables" function attribute is set. + // However, these objections do not apply to other switch replacements, like + // the bitmap, so we only stop here if any of these conditions are met and we + // want to create a LUT. Otherwise, continue with the switch replacement. 
+ if (AnyLookupTables && + (!TTI.shouldBuildLookupTables() || + Fn->getFnAttribute("no-jump-tables").getValueAsBool())) + return false; + + Builder.SetInsertPoint(SI); + // TableIndex is the switch condition - TableIndexOffset if we don't + // use the condition directly + if (!UseSwitchConditionAsTableIndex) { // If the default is unreachable, all case values are s>= MinCaseVal. Then // we can try to attach nsw. bool MayWrap = true; if (!DefaultIsReachable) { - APInt Res = MaxCaseVal->getValue().ssub_ov(MinCaseVal->getValue(), MayWrap); + APInt Res = + MaxCaseVal->getValue().ssub_ov(MinCaseVal->getValue(), MayWrap); (void)Res; } - TableIndex = Builder.CreateSub(SI->getCondition(), TableIndexOffset, "switch.tableidx", /*HasNUW =*/false, /*HasNSW =*/!MayWrap); } - BranchInst *RangeCheckBranch = nullptr; + std::vector<DominatorTree::UpdateType> Updates; - // Grow the table to cover all possible index values to avoid the range check. - // It will use the default result to fill in the table hole later, so make - // sure it exist. - if (UseSwitchConditionAsTableIndex && HasDefaultResults) { - ConstantRange CR = computeConstantRange(TableIndex, /* ForSigned */ false); - // Grow the table shouldn't have any size impact by checking - // wouldFitInRegister. - // TODO: Consider growing the table also when it doesn't fit in a register - // if no optsize is specified. - const uint64_t UpperBound = CR.getUpper().getLimitedValue(); - if (!CR.isUpperWrapped() && all_of(ResultTypes, [&](const auto &KV) { - return SwitchLookupTable::wouldFitInRegister( - DL, UpperBound, KV.second /* ResultType */); - })) { - // There may be some case index larger than the UpperBound (unreachable - // case), so make sure the table size does not get smaller. - TableSize = std::max(UpperBound, TableSize); - // The default branch is unreachable after we enlarge the lookup table. - // Adjust DefaultIsReachable to reuse code path. 
- DefaultIsReachable = false; - } - } + // Compute the maximum table size representable by the integer type we are + // switching upon. + unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits(); + uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize; + assert(MaxTableSize >= TableSize && + "It is impossible for a switch to have more entries than the max " + "representable value of its input integer type's size."); + + // Create the BB that does the lookups. + Module &Mod = *CommonDest->getParent()->getParent(); + BasicBlock *LookupBB = BasicBlock::Create( + Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest); + + BranchInst *RangeCheckBranch = nullptr; + Builder.SetInsertPoint(SI); const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize); if (!DefaultIsReachable || GeneratingCoveredLookupTable) { Builder.CreateBr(LookupBB); @@ -7157,25 +7198,16 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, for (PHINode *PHI : PHIs) { const ResultListTy &ResultList = ResultLists[PHI]; - - Type *ResultType = ResultList.begin()->second->getType(); - - // Use any value to fill the lookup table holes. - Constant *DV = - AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI]; - StringRef FuncName = Fn->getName(); - SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV, - DL, FuncName); - - Value *Result = Table.buildLookup(TableIndex, Builder, DL); - + auto Replacement = PhiToReplacementMap.at(PHI); + auto *Result = Replacement.replaceSwitch(TableIndex, Builder, DL, Fn); // Do a small peephole optimization: re-use the switch table compare if // possible. if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) { BasicBlock *PhiBlock = PHI->getParent(); // Search for compare instructions which use the phi. 
for (auto *User : PHI->users()) { - reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList); + reuseTableCompare(User, PhiBlock, RangeCheckBranch, + Replacement.getDefaultValue(), ResultList); } } @@ -7202,7 +7234,6 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (DTU) DTU->applyUpdates(Updates); - ++NumLookupTables; if (NeedMask) ++NumLookupTablesHoles; return true; @@ -7708,7 +7739,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { // CVP. Therefore, only apply this transformation during late stages of the // optimisation pipeline. if (Options.ConvertSwitchToLookupTable && - switchToLookupTable(SI, Builder, DTU, DL, TTI)) + simplifySwitchLookup(SI, Builder, DTU, DL, TTI)) return requestResimplify(); if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI)) diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 2d6a748f4507..8acebbaa5458 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -97,6 +97,10 @@ static cl::opt<unsigned, false, HotColdHintParser> static cl::opt<unsigned, false, HotColdHintParser> HotNewHintValue( "hot-new-hint-value", cl::Hidden, cl::init(254), cl::desc("Value to pass to hot/cold operator new for hot allocation")); +static cl::opt<unsigned, false, HotColdHintParser> AmbiguousNewHintValue( + "ambiguous-new-hint-value", cl::Hidden, cl::init(222), + cl::desc( + "Value to pass to hot/cold operator new for ambiguous allocation")); //===----------------------------------------------------------------------===// // Helper Functions @@ -1719,6 +1723,37 @@ Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) { return nullptr; } +// Allow existing calls to operator new() that takes a __hot_cold_t parameter to +// be updated with a compiler-determined hot cold hint value. 
This is used in +// cases where the call is marked nobuiltin (because operator new called +// explicitly) and therefore cannot be replaced with a different callee. +Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI, + IRBuilderBase &B) { + if (!OptimizeHotColdNew || !OptimizeExistingHotColdNew) + return nullptr; + Function *Callee = CI->getCalledFunction(); + if (!Callee) + return nullptr; + LibFunc Func; + if (!TLI->getLibFunc(*Callee, Func)) + return nullptr; + switch (Func) { + case LibFunc_Znwm12__hot_cold_t: + case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t: + case LibFunc_ZnwmSt11align_val_t12__hot_cold_t: + case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t: + case LibFunc_Znam12__hot_cold_t: + case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t: + case LibFunc_ZnamSt11align_val_t12__hot_cold_t: + case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t: + case LibFunc_size_returning_new_hot_cold: + case LibFunc_size_returning_new_aligned_hot_cold: + return optimizeNew(CI, B, Func); + default: + return nullptr; + } +} + // When enabled, replace operator new() calls marked with a hot or cold memprof // attribute with an operator new() call that takes a __hot_cold_t parameter. 
// Currently this is supported by the open source version of tcmalloc, see: @@ -1736,6 +1771,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, HotCold = NotColdNewHintValue; else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() == "hot") HotCold = HotNewHintValue; + else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() == + "ambiguous") + HotCold = AmbiguousNewHintValue; else return nullptr; @@ -1753,9 +1791,8 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_Znwm12__hot_cold_t, HotCold); break; case LibFunc_Znwm: - if (HotCold != NotColdNewHintValue) - return emitHotColdNew(CI->getArgOperand(0), B, TLI, - LibFunc_Znwm12__hot_cold_t, HotCold); + return emitHotColdNew(CI->getArgOperand(0), B, TLI, + LibFunc_Znwm12__hot_cold_t, HotCold); break; case LibFunc_Znam12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1763,9 +1800,8 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_Znam12__hot_cold_t, HotCold); break; case LibFunc_Znam: - if (HotCold != NotColdNewHintValue) - return emitHotColdNew(CI->getArgOperand(0), B, TLI, - LibFunc_Znam12__hot_cold_t, HotCold); + return emitHotColdNew(CI->getArgOperand(0), B, TLI, + LibFunc_Znam12__hot_cold_t, HotCold); break; case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1774,10 +1810,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold); break; case LibFunc_ZnwmRKSt9nothrow_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewNoThrow( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold); + return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B, + TLI, LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, + HotCold); break; case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1786,10 +1821,9 @@ Value 
*LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold); break; case LibFunc_ZnamRKSt9nothrow_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewNoThrow( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold); + return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B, + TLI, LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, + HotCold); break; case LibFunc_ZnwmSt11align_val_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1798,10 +1832,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold); break; case LibFunc_ZnwmSt11align_val_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewAligned( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold); + return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B, + TLI, LibFunc_ZnwmSt11align_val_t12__hot_cold_t, + HotCold); break; case LibFunc_ZnamSt11align_val_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1810,10 +1843,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold); break; case LibFunc_ZnamSt11align_val_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewAligned( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold); + return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B, + TLI, LibFunc_ZnamSt11align_val_t12__hot_cold_t, + HotCold); break; case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1823,11 +1855,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, HotCold); break; case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewAlignedNoThrow( - 
CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, - TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t, - HotCold); + return emitHotColdNewAlignedNoThrow( + CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, + TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold); break; case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t: if (OptimizeExistingHotColdNew) @@ -1837,17 +1867,14 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, HotCold); break; case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t: - if (HotCold != NotColdNewHintValue) - return emitHotColdNewAlignedNoThrow( - CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, - TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t, - HotCold); + return emitHotColdNewAlignedNoThrow( + CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, + TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold); break; case LibFunc_size_returning_new: - if (HotCold != NotColdNewHintValue) - return emitHotColdSizeReturningNew(CI->getArgOperand(0), B, TLI, - LibFunc_size_returning_new_hot_cold, - HotCold); + return emitHotColdSizeReturningNew(CI->getArgOperand(0), B, TLI, + LibFunc_size_returning_new_hot_cold, + HotCold); break; case LibFunc_size_returning_new_hot_cold: if (OptimizeExistingHotColdNew) @@ -1856,10 +1883,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B, HotCold); break; case LibFunc_size_returning_new_aligned: - if (HotCold != NotColdNewHintValue) - return emitHotColdSizeReturningNewAligned( - CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, - LibFunc_size_returning_new_aligned_hot_cold, HotCold); + return emitHotColdSizeReturningNewAligned( + CI->getArgOperand(0), CI->getArgOperand(1), B, TLI, + LibFunc_size_returning_new_aligned_hot_cold, HotCold); break; case LibFunc_size_returning_new_aligned_hot_cold: if (OptimizeExistingHotColdNew) @@ -4094,8 
+4120,11 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { // TODO: Split out the code below that operates on FP calls so that // we can all non-FP calls with the StrictFP attribute to be // optimized. - if (CI->isNoBuiltin()) - return nullptr; + if (CI->isNoBuiltin()) { + // If this is an existing call to a hot cold operator new, we can update the + // hint parameter value, which doesn't change the callee. + return optimizeExistingHotColdNew(CI, Builder); + } LibFunc Func; Function *Callee = CI->getCalledFunction(); diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index d52d52a9b7d3..6319fd524ff0 100644 --- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -349,13 +349,7 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, KeyValue = Key->getValue(KeyStorage); if (KeyValue == "source") { - std::string Error; - Source = std::string(Value->getValue(ValueStorage)); - if (!Regex(Source).isValid(Error)) { - YS.printError(Field.getKey(), "invalid regex: " + Error); - return false; - } } else if (KeyValue == "target") { Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue == "transform") { @@ -379,12 +373,22 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, // TODO see if there is a more elegant solution to selecting the rewrite // descriptor type - if (!Target.empty()) + if (!Target.empty()) { DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>( Source, Target, Naked)); - else - DL->push_back( - std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform)); + return true; + } + + { + std::string Error; + if (!Regex(Source).isValid(Error)) { + YS.printError(Descriptor, "invalid Source regex: " + Error); + return false; + } + } + + DL->push_back( + std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform)); return true; } @@ -418,13 +422,7 
@@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, KeyValue = Key->getValue(KeyStorage); if (KeyValue == "source") { - std::string Error; - Source = std::string(Value->getValue(ValueStorage)); - if (!Regex(Source).isValid(Error)) { - YS.printError(Field.getKey(), "invalid regex: " + Error); - return false; - } } else if (KeyValue == "target") { Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue == "transform") { @@ -441,13 +439,23 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, return false; } - if (!Target.empty()) + if (!Target.empty()) { DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>( Source, Target, /*Naked*/ false)); - else - DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>( - Source, Transform)); + return true; + } + + { + std::string Error; + if (!Regex(Source).isValid(Error)) { + YS.printError(Descriptor, "invalid Source regex: " + Error); + return false; + } + } + + DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>( + Source, Transform)); return true; } @@ -481,13 +489,7 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, KeyValue = Key->getValue(KeyStorage); if (KeyValue == "source") { - std::string Error; - Source = std::string(Value->getValue(ValueStorage)); - if (!Regex(Source).isValid(Error)) { - YS.printError(Field.getKey(), "invalid regex: " + Error); - return false; - } } else if (KeyValue == "target") { Target = std::string(Value->getValue(ValueStorage)); } else if (KeyValue == "transform") { @@ -504,13 +506,23 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, return false; } - if (!Target.empty()) + if (!Target.empty()) { DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>( Source, Target, /*Naked*/ false)); - else - DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>( - Source, Transform)); + return true; + } + + { + 
std::string Error; + if (!Regex(Source).isValid(Error)) { + YS.printError(Descriptor, "invalid Source regex: " + Error); + return false; + } + } + + DL->push_back( + std::make_unique<PatternRewriteNamedAliasDescriptor>(Source, Transform)); return true; } diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 491f0b76f4ae..53129e2e5fbb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -170,10 +170,10 @@ private: bool recognizeFindFirstByte(); Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU, - unsigned VF, Type *CharTy, BasicBlock *ExitSucc, - BasicBlock *ExitFail, Value *SearchStart, - Value *SearchEnd, Value *NeedleStart, - Value *NeedleEnd); + unsigned VF, Type *CharTy, Value *IndPhi, + BasicBlock *ExitSucc, BasicBlock *ExitFail, + Value *SearchStart, Value *SearchEnd, + Value *NeedleStart, Value *NeedleEnd); void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc, BasicBlock *ExitFail, @@ -242,6 +242,37 @@ bool LoopIdiomVectorize::run(Loop *L) { return false; } +static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes, + BasicBlock *SuccBB, BasicBlock *IncBB) { + for (PHINode &PN : SuccBB->phis()) { + // Look through the incoming values to find ScalarRes, meaning this is a + // PHI collecting the results of the transformation. + bool ResPhi = false; + for (Value *Op : PN.incoming_values()) + if (Op == ScalarRes) { + ResPhi = true; + break; + } + + // Any PHI that depended upon the result of the transformation needs a new + // incoming value from IncBB. + if (ResPhi) + PN.addIncoming(VectorRes, IncBB); + else { + // There should be no other outside uses of other values in the + // original loop. Any incoming values should either: + // 1. Be for blocks outside the loop, which aren't interesting. Or .. + // 2. 
These are from blocks in the loop with values defined outside + // the loop. We should a similar incoming value from CmpBB. + for (BasicBlock *BB : PN.blocks()) + if (L->contains(BB)) { + PN.addIncoming(PN.getIncomingValueForBlock(BB), IncBB); + break; + } + } + } +} + bool LoopIdiomVectorize::recognizeByteCompare() { // Currently the transformation only works on scalable vector types, although // there is no fundamental reason why it cannot be made to work for fixed @@ -574,13 +605,8 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch( Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()}, {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load"); - StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE); - auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr); - Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS); - Value *VectorMatchCmp = Builder.CreateIntrinsic( - Intrinsic::vp_icmp, {VectorLhsLoad->getType()}, - {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr, - "mismatch.cmp"); + Value *VectorMatchCmp = + Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad, "mismatch.cmp"); Value *CTZ = Builder.CreateIntrinsic( Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()}, {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask, @@ -940,42 +966,10 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); } - auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { - for (PHINode &PN : SuccBB->phis()) { - // At this point we've already replaced all uses of the result from the - // loop with ByteCmp. Look through the incoming values to find ByteCmp, - // meaning this is a Phi collecting the results of the byte compare. 
- bool ResPhi = false; - for (Value *Op : PN.incoming_values()) - if (Op == ByteCmpRes) { - ResPhi = true; - break; - } - - // Any PHI that depended upon the result of the byte compare needs a new - // incoming value from CmpBB. This is because the original loop will get - // deleted. - if (ResPhi) - PN.addIncoming(ByteCmpRes, CmpBB); - else { - // There should be no other outside uses of other values in the - // original loop. Any incoming values should either: - // 1. Be for blocks outside the loop, which aren't interesting. Or .. - // 2. These are from blocks in the loop with values defined outside - // the loop. We should a similar incoming value from CmpBB. - for (BasicBlock *BB : PN.blocks()) - if (CurLoop->contains(BB)) { - PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); - break; - } - } - } - }; - // Ensure all Phis in the successors of CmpBB have an incoming value from it. - fixSuccessorPhis(EndBB); + fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, EndBB, CmpBB); if (EndBB != FoundBB) - fixSuccessorPhis(FoundBB); + fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, FoundBB, CmpBB); // The new CmpBB block isn't part of the loop, but will need to be added to // the outer loop if there is one. @@ -1173,8 +1167,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() { Value *LoopIdiomVectorize::expandFindFirstByte( IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy, - BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart, - Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) { + Value *IndPhi, BasicBlock *ExitSucc, BasicBlock *ExitFail, + Value *SearchStart, Value *SearchEnd, Value *NeedleStart, + Value *NeedleEnd) { // Set up some types and constants that we intend to reuse. 
auto *PtrTy = Builder.getPtrTy(); auto *I64Ty = Builder.getInt64Ty(); @@ -1374,6 +1369,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte( MatchLCSSA->addIncoming(Search, BB2); MatchPredLCSSA->addIncoming(MatchPred, BB2); + // Ensure all Phis in the successors of BB3/BB5 have an incoming value from + // them. + fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB3); + if (ExitSucc != ExitFail) + fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitFail, BB5); + if (VerifyLoops) { OuterLoop->verifyLoop(); InnerLoop->verifyLoop(); @@ -1395,21 +1396,12 @@ void LoopIdiomVectorize::transformFindFirstByte( DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); - Value *MatchVal = - expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail, - SearchStart, SearchEnd, NeedleStart, NeedleEnd); + expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail, + SearchStart, SearchEnd, NeedleStart, NeedleEnd); assert(PHBranch->isUnconditional() && "Expected preheader to terminate with an unconditional branch."); - // Add new incoming values with the result of the transformation to PHINodes - // of ExitSucc that use IndPhi. 
- for (auto *U : llvm::make_early_inc_range(IndPhi->users())) { - auto *PN = dyn_cast<PHINode>(U); - if (PN && PN->getParent() == ExitSucc) - PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent()); - } - if (VerifyLoops && CurLoop->getParentLoop()) { CurLoop->getParentLoop()->verifyLoop(); if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 789047a2a28e..2704e66f3a70 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -15,8 +15,10 @@ // #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() { }); } - if (!LAI->canVectorizeMemory()) + if (!LAI->canVectorizeMemory()) { + if (hasUncountableExitWithSideEffects()) { + reportVectorizationFailure( + "Cannot vectorize unsafe dependencies in uncountable exit loop with " + "side effects", + "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE, + TheLoop); + return false; + } + return canVectorizeIndirectUnsafeDependences(); + } if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("We don't allow storing to uniform addresses", @@ -1530,7 +1542,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isGuaranteedNotToBePoison(CurrV, AC, TheLoop->getLoopPredecessor() ->getTerminator() - ->getIterator())) + ->getIterator(), + DT)) return false; continue; } @@ -1754,16 +1767,24 @@ bool 
LoopVectorizationLegality::isVectorizableEarlyExitLoop() { } }; + bool HasSideEffects = false; for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (I.mayWriteToMemory()) { - // We don't support writes to memory. + if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) { + HasSideEffects = true; + continue; + } + + // We don't support complex writes to memory. reportVectorizationFailure( - "Writes to memory unsupported in early exit loops", - "Cannot vectorize early exit loop with writes to memory", + "Complex writes to memory unsupported in early exit loops", + "Cannot vectorize early exit loop with complex writes to memory", "WritesInEarlyExitLoop", ORE, TheLoop); return false; - } else if (!IsSafeOperation(&I)) { + } + + if (!IsSafeOperation(&I)) { reportVectorizationFailure("Early exit loop contains operations that " "cannot be speculatively executed", "UnsafeOperationsEarlyExitLoop", ORE, @@ -1776,15 +1797,37 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock && "Expected latch predecessor to be the early exiting block"); + SmallVector<LoadInst *, 4> NonDerefLoads; // TODO: Handle loops that may fault. - Predicates.clear(); - if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, - &Predicates)) { - reportVectorizationFailure( - "Loop may fault", - "Cannot vectorize potentially faulting early exit loop", - "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + if (!HasSideEffects) { + // Read-only loop. + Predicates.clear(); + if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads, + &Predicates)) { + reportVectorizationFailure( + "Loop may fault", "Cannot vectorize non-read-only early exit loop", + "NonReadOnlyEarlyExitLoop", ORE, TheLoop); + return false; + } + } else if (!canUncountableExitConditionLoadBeMoved( + SingleUncountableExitingBlock)) return false; + + // Check non-dereferenceable loads if any. 
+ for (LoadInst *LI : NonDerefLoads) { + // Only support unit-stride access for now. + int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand()); + if (Stride != 1) { + reportVectorizationFailure( + "Loop contains potentially faulting strided load", + "Cannot vectorize early exit loop with " + "strided fault-only-first load", + "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop); + return false; + } + PotentiallyFaultingLoads.insert(LI); + LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI + << "\n"); } [[maybe_unused]] const SCEV *SymbolicMaxBTC = @@ -1797,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { "backedge taken count: " << *SymbolicMaxBTC << '\n'); UncountableExitingBB = SingleUncountableExitingBlock; + UncountableExitWithSideEffects = HasSideEffects; + return true; +} + +bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved( + BasicBlock *ExitingBlock) { + // Try to find a load in the critical path for the uncountable exit condition. + // This is currently matching about the simplest form we can, expecting + // only one in-loop load, the result of which is directly compared against + // a loop-invariant value. + // FIXME: We're insisting on a single use for now, because otherwise we will + // need to make PHI nodes for other users. That can be done once the initial + // transform code lands. + auto *Br = cast<BranchInst>(ExitingBlock->getTerminator()); + + using namespace llvm::PatternMatch; + Instruction *L = nullptr; + Value *Ptr = nullptr; + Value *R = nullptr; + if (!match(Br->getCondition(), + m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))), + m_Value(R))))) { + reportVectorizationFailure( + "Early exit loop with store but no supported condition load", + "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); + return false; + } + + // FIXME: Don't rely on operand ordering for the comparison. 
+ if (!TheLoop->isLoopInvariant(R)) { + reportVectorizationFailure( + "Early exit loop with store but no supported condition load", + "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); + return false; + } + + // Make sure that the load address is not loop invariant; we want an + // address calculation that we can rotate to the next vector iteration. + const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr); + if (!isa<SCEVAddRecExpr>(PtrScev)) { + reportVectorizationFailure( + "Uncountable exit condition depends on load with an address that is " + "not an add recurrence", + "EarlyExitLoadInvariantAddress", ORE, TheLoop); + return false; + } + + // FIXME: Support gathers after first-faulting load support lands. + SmallVector<const SCEVPredicate *, 4> Predicates; + LoadInst *Load = cast<LoadInst>(L); + if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC, + &Predicates)) { + reportVectorizationFailure( + "Loop may fault", + "Cannot vectorize potentially faulting early exit loop", + "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + return false; + } + + ICFLoopSafetyInfo SafetyInfo; + SafetyInfo.computeLoopSafetyInfo(TheLoop); + // We need to know that load will be executed before we can hoist a + // copy out to run just before the first iteration. + // FIXME: Currently, other restrictions prevent us from reaching this point + // with a loop where the uncountable exit condition is determined + // by a conditional load. + assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) && + "Unhandled control flow in uncountable exit loop with side effects"); + + // Prohibit any potential aliasing with any instruction in the loop which + // might store to memory. + // FIXME: Relax this constraint where possible. 
+ for (auto *BB : TheLoop->blocks()) { + for (auto &I : *BB) { + if (&I == Load) + continue; + + if (I.mayWriteToMemory()) { + if (auto *SI = dyn_cast<StoreInst>(&I)) { + AliasResult AR = AA->alias(Ptr, SI->getPointerOperand()); + if (AR == AliasResult::NoAlias) + continue; + } + + reportVectorizationFailure( + "Cannot determine whether critical uncountable exit load address " + "does not alias with a memory write", + "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop); + return false; + } + } + } + return true; } @@ -1869,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { } else { if (!isVectorizableEarlyExitLoop()) { assert(!hasUncountableEarlyExit() && + !hasUncountableExitWithSideEffects() && "Must be false without vectorizable early-exit loop"); if (DoExtraAnalysis) Result = false; @@ -1887,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return false; } + // Bail out for state-changing loops with uncountable exits for now. + if (UncountableExitWithSideEffects) { + reportVectorizationFailure( + "Writes to memory unsupported in early exit loops", + "Cannot vectorize early exit loop with writes to memory", + "WritesInEarlyExitLoop", ORE, TheLoop); + return false; + } + if (Result) { LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop" << (LAI->getRuntimePointerChecking()->Need diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 838476dcae66..d34d2ae7a0b3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -334,6 +334,10 @@ public: FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL)); } + VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr) { + return tryInsertInstruction(new VPExpandSCEVRecipe(Expr)); + } + //===--------------------------------------------------------------------===// // RAII helpers. 
//===--------------------------------------------------------------------===// @@ -559,6 +563,20 @@ public: /// Emit remarks for recipes with invalid costs in the available VPlans. void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE); + /// Create a check to \p Plan to see if the vector loop should be executed + /// based on its trip count. + void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount) const; + + /// Update loop metadata and profile info for both the scalar remainder loop + /// and \p VectorLoop, if it exists. Keeps all loop hints from the original + /// loop on the vector loop and replaces vectorizer-specific metadata. + void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, + VPBasicBlock *HeaderVPBB, + bool VectorizingEpilogue, + unsigned EstimatedVFxUF, + bool DisableRuntimeUnroll); + protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is @@ -613,13 +631,15 @@ private: /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B. bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B, bool HasTail) const; + const VectorizationFactor &B, bool HasTail, + bool IsEpilogue = false) const; /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B in the context of vectorizing a loop with known \p MaxTripCount. bool isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, - const unsigned MaxTripCount, bool HasTail) const; + const unsigned MaxTripCount, bool HasTail, + bool IsEpilogue = false) const; /// Determines if we have the infrastructure to vectorize the loop and its /// epilogue, assuming the main loop is vectorized by \p VF. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a0f306c12754..3cff43a51029 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -165,15 +165,6 @@ using namespace SCEVPatternMatch; const char VerboseDebug[] = DEBUG_TYPE "-verbose"; #endif -/// @{ -/// Metadata attribute names -const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; -const char LLVMLoopVectorizeFollowupVectorized[] = - "llvm.loop.vectorize.followup_vectorized"; -const char LLVMLoopVectorizeFollowupEpilogue[] = - "llvm.loop.vectorize.followup_epilogue"; -/// @} - STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); @@ -500,26 +491,22 @@ public: InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, - ElementCount VecWidth, - ElementCount MinProfitableTripCount, - unsigned UnrollFactor, LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - GeneratedRTChecks &RTChecks, VPlan &Plan) + ElementCount VecWidth, unsigned UnrollFactor, + LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, + VPlan &Plan) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC), - VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), - UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM), - BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), + Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan), VectorPHVPBB(cast<VPBasicBlock>( Plan.getVectorLoopRegion()->getSinglePredecessor())) {} virtual ~InnerLoopVectorizer() = default; - /// Create a 
new empty loop that will contain vectorized instructions later - /// on, while the old loop will be used as the scalar remainder. Control flow - /// is generated around the vectorized (and scalar epilogue) loops consisting - /// of various checks and bypasses. Return the pre-header block of the new - /// loop. In the case of epilogue vectorization, this function is overriden to - /// handle the more complex control flow around the loops. + /// Creates a basic block for the scalar preheader. Both + /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite + /// the method to create additional blocks and checks needed for epilogue + /// vectorization. virtual BasicBlock *createVectorizedLoopSkeleton(); /// Fix the vectorized code, taking care of header phi's, and more. @@ -536,38 +523,18 @@ public: /// count of the original loop for both main loop and epilogue vectorization. void setTripCount(Value *TC) { TripCount = TC; } - /// Return the additional bypass block which targets the scalar loop by - /// skipping the epilogue loop after completing the main loop. - BasicBlock *getAdditionalBypassBlock() const { - assert(AdditionalBypassBlock && - "Trying to access AdditionalBypassBlock but it has not been set"); - return AdditionalBypassBlock; - } - protected: friend class LoopVectorizationPlanner; - // Create a check to see if the vector loop should be executed - Value *createIterationCountCheck(ElementCount VF, unsigned UF) const; - - /// Emit a bypass check to see if the vector trip count is zero, including if - /// it overflows. - void emitIterationCountCheck(BasicBlock *Bypass); - - /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, - /// vector loop preheader, middle block and scalar preheader. - void createVectorLoopSkeleton(StringRef Prefix); + /// Create and return a new IR basic block for the scalar preheader whose name + /// is prefixed with \p Prefix. 
+ BasicBlock *createScalarPreheader(StringRef Prefix); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. virtual void printDebugTracesAtStart() {} virtual void printDebugTracesAtEnd() {} - /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the - /// vector preheader and its predecessor, also connecting the new block to the - /// scalar preheader. - void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); - /// The original loop. Loop *OrigLoop; @@ -592,8 +559,6 @@ protected: /// vector elements. ElementCount VF; - ElementCount MinProfitableTripCount; - /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -603,18 +568,9 @@ protected: // --- Vectorization state --- - /// The vector-loop preheader. - BasicBlock *LoopVectorPreHeader = nullptr; - - /// The scalar-loop preheader. - BasicBlock *LoopScalarPreHeader = nullptr; - /// Trip count of the original loop. Value *TripCount = nullptr; - /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) - Value *VectorTripCount = nullptr; - /// The profitablity analysis. LoopVectorizationCostModel *Cost; @@ -626,11 +582,6 @@ protected: /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; - /// The additional bypass block which conditionally skips over the epilogue - /// loop after executing the main loop. Needed to resume inductions and - /// reductions during epilogue vectorization. 
- BasicBlock *AdditionalBypassBlock = nullptr; - VPlan &Plan; /// The vector preheader block of \p Plan, used as target for check blocks @@ -679,20 +630,8 @@ public: GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth, - MinProfitableTripCount, UnrollFactor, CM, BFI, PSI, - Checks, Plan), - EPI(EPI) {} - - // Override this function to handle the more complex control flow around the - // three loops. - BasicBlock *createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); - } - - /// The interface for creating a vectorized skeleton using one of two - /// different strategies, each corresponding to one execution of the vplan - /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + UnrollFactor, CM, BFI, PSI, Checks, Plan), + EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {} /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -701,6 +640,9 @@ public: /// iteration count of the loop is so small that the main vector loop is /// completely skipped. EpilogueLoopVectorizationInfo &EPI; + +protected: + ElementCount MinProfitableTripCount; }; /// A specialized derived class of inner loop vectorizer that performs @@ -720,14 +662,24 @@ public: BFI, PSI, Check, Plan, EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF) {} /// Implements the interface for creating a vectorized skeleton using the - /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final; + /// *main loop* strategy (i.e., the first pass of VPlan execution). 
+ BasicBlock *createVectorizedLoopSkeleton() final; protected: + /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the + /// vector preheader and its predecessor, also connecting the new block to the + /// scalar preheader. + void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); + + // Create a check to see if the main vector loop should be executed + Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF, + unsigned UF) const; + /// Emits an iteration count bypass check once for the main loop (when \p /// ForEpilogue is false) and once for the epilogue loop (when \p /// ForEpilogue is true). - BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); + BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, + bool ForEpilogue); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; @@ -736,6 +688,11 @@ protected: // vectorization of *epilogue* loops in the process of vectorizing loops and // their epilogues. class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { + /// The additional bypass block which conditionally skips over the epilogue + /// loop after executing the main loop. Needed to resume inductions and + /// reductions during epilogue vectorization. + BasicBlock *AdditionalBypassBlock = nullptr; + public: EpilogueVectorizerEpilogueLoop( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, @@ -749,14 +706,22 @@ public: TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the - /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final; + /// *epilogue loop* strategy (i.e., the second pass of VPlan execution). + BasicBlock *createVectorizedLoopSkeleton() final; + + /// Return the additional bypass block which targets the scalar loop by + /// skipping the epilogue loop after completing the main loop. 
+ BasicBlock *getAdditionalBypassBlock() const { + assert(AdditionalBypassBlock && + "Trying to access AdditionalBypassBlock but it has not been set"); + return AdditionalBypassBlock; + } protected: /// Emits an iteration count bypass check after the main vector loop has /// finished to see if there are any iterations left to execute by either /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck( + BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert); void printDebugTracesAtStart() override; @@ -962,8 +927,8 @@ public: /// user options, for the given register kind. bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind); - /// \return True if register pressure should be calculated for the given VF. - bool shouldCalculateRegPressureForVF(ElementCount VF); + /// \return True if register pressure should be considered for the given VF. + bool shouldConsiderRegPressureForVF(ElementCount VF); /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. 
We ignore values that remain scalar such as @@ -1159,7 +1124,10 @@ public: CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const { assert(!VF.isScalar() && "Expected vector VF"); - return CallWideningDecisions.at({CI, VF}); + auto I = CallWideningDecisions.find({CI, VF}); + if (I == CallWideningDecisions.end()) + return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0}; + return I->second; } /// Return True if instruction \p I is an optimizable truncate whose operand @@ -1682,7 +1650,9 @@ private: Instruction *I = dyn_cast<Instruction>(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I) || - getWideningDecision(I, VF) == CM_Scalarize) + getWideningDecision(I, VF) == CM_Scalarize || + (isa<CallInst>(I) && + getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize)) return false; // Assume we can vectorize V (and hence we need extraction) if the @@ -1878,6 +1848,8 @@ public: "claimed checks are required"); } + SCEVExp.eraseDeadInstructions(SCEVCheckCond); + if (!MemCheckBlock && !SCEVCheckBlock) return; @@ -2030,7 +2002,7 @@ public: /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR /// outside VPlan. - std::pair<Value *, BasicBlock *> getSCEVChecks() { + std::pair<Value *, BasicBlock *> getSCEVChecks() const { using namespace llvm::PatternMatch; if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt())) return {nullptr, nullptr}; @@ -2040,7 +2012,7 @@ public: /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR /// outside VPlan. 
- std::pair<Value *, BasicBlock *> getMemRuntimeChecks() { + std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const { using namespace llvm::PatternMatch; if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt())) return {nullptr, nullptr}; @@ -2049,9 +2021,7 @@ public: /// Return true if any runtime checks have been added bool hasChecks() const { - using namespace llvm::PatternMatch; - return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) || - MemRuntimeCheckCond; + return getSCEVChecks().first || getMemRuntimeChecks().first; } }; } // namespace @@ -2276,7 +2246,8 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { return TTI.enableMaskedInterleavedAccessVectorization(); } -void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { +void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan( + BasicBlock *CheckIRBB) { // Note: The block with the minimum trip-count check is already connected // during earlier VPlan construction. VPBlockBase *ScalarPH = Plan.getScalarPreheader(); @@ -2300,8 +2271,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { } } -Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, - unsigned UF) const { +Value *EpilogueVectorizerMainLoop::createIterationCountCheck( + BasicBlock *VectorPH, ElementCount VF, unsigned UF) const { // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the // vector trip count is zero. This check also covers the case where adding one @@ -2312,7 +2283,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. 
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + BasicBlock *const TCCheckBlock = VectorPH; IRBuilder<InstSimplifyFolder> Builder( TCCheckBlock->getContext(), InstSimplifyFolder(TCCheckBlock->getDataLayout())); @@ -2371,25 +2342,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, return CheckMinIters; } -void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; - Value *CheckMinIters = createIterationCountCheck(VF, UF); - // Create new preheader for vector loop. - LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - static_cast<DominatorTree *>(nullptr), LI, - nullptr, "vector.ph"); - - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); - if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) - setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); - ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); - - assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() == - TCCheckBlock && - "Plan's entry must be TCCCheckBlock"); -} - /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must /// have a single predecessor, which is rewired to the new VPIRBasicBlock. 
All @@ -2410,20 +2362,19 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, return IRVPBB; } -void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { - LoopVectorPreHeader = OrigLoop->getLoopPreheader(); - assert(LoopVectorPreHeader && "Invalid loop structure"); +BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) { + BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); + assert(VectorPH && "Invalid loop structure"); assert((OrigLoop->getUniqueLatchExitBlock() || Cost->requiresScalarEpilogue(VF.isVector())) && "loops not exiting via the latch without required epilogue?"); - LoopScalarPreHeader = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, Twine(Prefix) + "scalar.ph"); // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock - // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar - // preheader may be unreachable at this point. Instead it is replaced in - // createVectorizedLoopSkeleton. + // wrapping the newly created scalar preheader here at the moment, because the + // Plan's scalar preheader may be unreachable at this point. Instead it is + // replaced in executePlan. + return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr, + Twine(Prefix) + "scalar.ph"); } /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV @@ -2464,54 +2415,9 @@ static void addFullyUnrolledInstructionsToIgnore( } BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's - / | preheader are expanded here. Eventually all required SCEV - / | expansion should happen here. - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). 
- | / | - | / v - || [ ] <-- vector pre header. - |/ | - | v - | [ ] \ - | [ ]_| <-- vector loop (created during VPlan execution). - | | - | v - \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to - | | successors created during VPlan execution) - \/ | - /\ v - | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). - | | - (opt) v <-- edge from middle to exit iff epilogue is not required. - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header - | | wrapped in VPIRBasicBlock). - \ | - \ v - >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) - ... - */ - - // Create an empty vector loop, and prepare basic blocks for the runtime - // checks. - createVectorLoopSkeleton(""); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. This check also covers the case where the - // backedge-taken count is uint##_max: adding one to it will overflow leading - // to an incorrect trip count of zero. In this (rare) case we will also jump - // to the scalar loop. - emitIterationCountCheck(LoopScalarPreHeader); - - replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); - return LoopVectorPreHeader; + // Create a new IR basic block for the scalar preheader. + BasicBlock *ScalarPH = createScalarPreheader(""); + return ScalarPH->getSinglePredecessor(); } namespace { @@ -2652,24 +2558,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Remove redundant induction instructions. cse(HeaderBB); - - // Set/update profile weights for the vector and remainder loops as original - // loop iterations are now distributed among them. Note that original loop - // becomes the scalar remainder loop after vectorization. - // - // For cases like foldTailByMasking() and requiresScalarEpiloque() we may - // end up getting slightly roughened result but that should be OK since - // profile is not inherently precise anyway. 
Note also possible bypass of - // vector code caused by legality checks is ignored, assigning all the weight - // to the vector loop, optimistically. - // - // For scalable vectorization we can't know at compile time how many - // iterations of the loop are handled in one vector iteration, so instead - // use the value of vscale used for tuning. - Loop *VectorLoop = LI->getLoopFor(HeaderBB); - unsigned EstimatedVFxUF = - estimateElementCount(VF * UF, Cost->getVScaleForTuning()); - setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); } void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { @@ -3020,19 +2908,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, toVectorTy(Type::getInt1Ty(I->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); - // Certain instructions can be cheaper to vectorize if they have a constant - // second vector operand. One example of this are shifts on x86. - Value *Op2 = I->getOperand(1); - auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && - Legal->isInvariant(Op2)) - Op2Info.Kind = TargetTransformInfo::OK_UniformValue; - SmallVector<const Value *, 4> Operands(I->operand_values()); SafeDivisorCost += TTI.getArithmeticInstrCost( - I->getOpcode(), VecTy, CostKind, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - Op2Info, Operands, I); + I->getOpcode(), VecTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + Operands, I); return {ScalarizationCost, SafeDivisorCost}; } @@ -3810,7 +3691,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } -bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF( +bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF( ElementCount VF) { if (!useMaxBandwidth(VF.isScalable() ? 
TargetTransformInfo::RGK_ScalableVector @@ -3939,7 +3820,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, const unsigned MaxTripCount, - bool HasTail) const { + bool HasTail, + bool IsEpilogue) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; @@ -3963,7 +3845,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. - bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && + bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) && A.Width.isScalable() && !B.Width.isScalable(); auto CmpFn = [PreferScalable](const InstructionCost &LHS, @@ -4001,10 +3883,11 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, - bool HasTail) const { + bool HasTail, + bool IsEpilogue) const { const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); - return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, - HasTail); + return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail, + IsEpilogue); } void LoopVectorizationPlanner::emitInvalidCostRemarks( @@ -4171,6 +4054,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenIntOrFpInductionSC: case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: + case VPDef::VPInterleaveEVLSC: case VPDef::VPInterleaveSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: @@ -4199,8 +4083,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, // If no def nor is a store, e.g., branches, continue - no value to check. 
if (R.getNumDefinedValues() == 0 && - !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( - &R)) + !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R)) continue; // For multi-def recipes, currently only interleaved loads, suffice to // check first def only. @@ -4255,8 +4138,9 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { P->vectorFactors().end()); SmallVector<VPRegisterUsage, 8> RUs; - if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) || - CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector)) + if (any_of(VFs, [this](ElementCount VF) { + return CM.shouldConsiderRegPressureForVF(VF); + })) RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (unsigned I = 0; I < VFs.size(); I++) { @@ -4268,7 +4152,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { /// If the register pressure needs to be considered for VF, /// don't consider the VF as valid if it exceeds the number /// of registers for the target. - if (CM.shouldCalculateRegPressureForVF(VF) && + if (CM.shouldConsiderRegPressureForVF(VF) && RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) continue; @@ -4286,7 +4170,33 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { if (!VPI) continue; switch (VPI->getOpcode()) { - case VPInstruction::ActiveLaneMask: + // Selects are only modelled in the legacy cost model for safe + // divisors. 
+ case Instruction::Select: { + VPValue *VPV = VPI->getVPSingleValue(); + if (VPV->getNumUsers() == 1) { + if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) { + switch (WR->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + continue; + default: + break; + } + } + } + C += VPI->cost(VF, CostCtx); + break; + } + case VPInstruction::ActiveLaneMask: { + unsigned Multiplier = + cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue()) + ->getZExtValue(); + C += VPI->cost(VF * Multiplier, CostCtx); + break; + } case VPInstruction::ExplicitVectorLength: C += VPI->cost(VF, CostCtx); break; @@ -4511,7 +4421,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( } if (Result.Width.isScalar() || - isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking())) + isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(), + /*IsEpilogue*/ true)) Result = NextVF; } @@ -5326,8 +5237,11 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); - const Value *Ptr = getLoadStorePointerOperand(I); - Type *PtrTy = toVectorTy(Ptr->getType(), VF); + Value *Ptr = getLoadStorePointerOperand(I); + Type *PtrTy = Ptr->getType(); + + if (!Legal->isUniform(Ptr, VF)) + PtrTy = toVectorTy(PtrTy, VF); return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, @@ -5483,7 +5397,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI::CastContextHint::None, CostKind, RedOp); InstructionCost RedCost = TTI.getMulAccReductionCost( - IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType, + CostKind); if (RedCost.isValid() && RedCost < ExtCost * 2 + 
MulCost + Ext2Cost + BaseCost) @@ -5528,7 +5443,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getMulAccReductionCost( - IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType, + CostKind); InstructionCost ExtraExtCost = 0; if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; @@ -5547,7 +5463,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getMulAccReductionCost( - true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); + true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy, + CostKind); if (RedCost.isValid() && RedCost < MulCost + BaseCost) return I == RetI ? RedCost : 0; @@ -6262,10 +6179,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, assert(Op0->getType()->getScalarSizeInBits() == 1 && Op1->getType()->getScalarSizeInBits() == 1); - SmallVector<const Value *, 2> Operands{Op0, Op1}; return TTI.getArithmeticInstrCost( - match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, - CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); + match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, + VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I); } Type *CondTy = SI->getCondition()->getType(); @@ -6495,7 +6411,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { })) continue; VecValuesToIgnore.insert(Op); - DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); + append_range(DeadInterleavePointerOps, Op->operands()); } for (const auto &[_, Ops] : DeadInvariantStoreOps) @@ -6555,7 +6471,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { ValuesToIgnore.insert(Op); VecValuesToIgnore.insert(Op); - DeadOps.append(Op->op_begin(), Op->op_end()); + append_range(DeadOps, Op->operands()); } // Ignore type-promoting instructions we identified during reduction @@ -6765,9 +6681,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { InstructionCost VPCostContext::getLegacyCost(Instruction *UI, ElementCount VF) const { - if (ForceTargetInstructionCost.getNumOccurrences()) - return InstructionCost(ForceTargetInstructionCost.getNumOccurrences()); - return CM.getInstructionCost(UI, VF); + InstructionCost Cost = CM.getInstructionCost(UI, VF); + if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences()) + return InstructionCost(ForceTargetInstructionCost); + return Cost; } bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I, @@ -7071,8 +6988,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { P->vectorFactors().end()); SmallVector<VPRegisterUsage, 8> RUs; - if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) || - CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector)) + if (any_of(VFs, [this](ElementCount VF) { + return CM.shouldConsiderRegPressureForVF(VF); + })) RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (unsigned I = 0; I < VFs.size(); I++) { @@ -7098,7 +7016,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { InstructionCost Cost = cost(*P, VF); 
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); - if (CM.shouldCalculateRegPressureForVF(VF) && + if (CM.shouldConsiderRegPressureForVF(VF) && RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) { LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " << VF << " because it uses too many registers\n"); @@ -7146,40 +7064,6 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { return BestFactor; } -static void addRuntimeUnrollDisableMetaData(Loop *L) { - SmallVector<Metadata *, 4> MDs; - // Reserve first location for self reference to the LoopID metadata node. - MDs.push_back(nullptr); - bool IsUnrollMetadata = false; - MDNode *LoopID = L->getLoopID(); - if (LoopID) { - // First find existing loop unrolling disable metadata. - for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { - auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); - if (MD) { - const auto *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = - S && S->getString().starts_with("llvm.loop.unroll.disable"); - } - MDs.push_back(LoopID->getOperand(I)); - } - } - - if (!IsUnrollMetadata) { - // Add runtime unroll disable metadata. - LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Metadata *, 1> DisableOperands; - DisableOperands.push_back( - MDString::get(Context, "llvm.loop.unroll.runtime.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. 
- NewLoopID->replaceOperandWith(0, NewLoopID); - L->setLoopID(NewLoopID); - } -} - static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) { using namespace VPlanPatternMatch; assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult && @@ -7193,7 +7077,7 @@ static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) { // epilog loop, fix the reduction's scalar PHI node by adding the incoming value // from the main vector loop. static void fixReductionScalarResumeWhenVectorizingEpilog( - VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) { + VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) { // Get the VPInstruction computing the reduction result in the middle block. // The first operand may not be from the middle block if it is not connected // to the scalar preheader. In that case, there's nothing to fix. @@ -7248,8 +7132,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( // When fixing reductions in the epilogue loop we should already have // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry // over the incoming values correctly. - auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true)); - EpiResumePhi->setIncomingValueForBlock( + EpiResumePhi.setIncomingValueForBlock( BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); } @@ -7276,11 +7159,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( BestVPlan, BestVF, VScale); } - if (!VectorizingEpilogue) { - // Checks are the same for all VPlans, added to BestVPlan only for - // compactness. - attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights); - } + // Checks are the same for all VPlans, added to BestVPlan only for + // compactness. + attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights); // Retrieving VectorPH now when it's easier while VPlan still has Regions. 
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); @@ -7291,6 +7172,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); + VPlanTransforms::cse(BestVPlan); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -7327,8 +7209,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. - BasicBlock *EntryBB = - cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock(); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(), State.CFG.PrevBB->getSingleSuccessor()); @@ -7342,7 +7222,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // looked through single-entry phis. ScalarEvolution &SE = *PSE.getSE(); for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) { - if (Exit->getNumPredecessors() == 0) + if (!Exit->hasPredecessors()) continue; for (VPRecipeBase &PhiR : Exit->phis()) SE.forgetLcssaPhiWithNewPredecessor( @@ -7362,88 +7242,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // //===------------------------------------------------===// - // Move check blocks to their final position. - // TODO: Move as part of VPIRBB execute and update impacted tests. - if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second) - MemCheckBlock->moveAfter(EntryBB); - if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second) - SCEVCheckBlock->moveAfter(EntryBB); - BestVPlan.execute(&State); - // 2.5 When vectorizing the epilogue, fix reduction resume values from the - // additional bypass block. 
- if (VectorizingEpilogue) { - assert(!BestVPlan.hasEarlyExit() && - "Epilogue vectorisation not yet supported with early exits"); - BasicBlock *PH = OrigLoop->getLoopPreheader(); - BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); - for (auto *Pred : predecessors(PH)) { - for (PHINode &Phi : PH->phis()) { - if (Phi.getBasicBlockIndex(Pred) != -1) - continue; - Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred); - } - } - VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader(); - if (ScalarPH->getNumPredecessors() > 0) { - // If ScalarPH has predecessors, we may need to update its reduction - // resume values. - for (VPRecipeBase &R : ScalarPH->phis()) { - fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State, - BypassBlock); - } - } - } - // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT); - if (HeaderVPBB) { - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - std::optional<MDNode *> VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); - - Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) { - L->setLoopID(*VectorizedLoopID); - } else { - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE); - Hints.setAlreadyVectorized(); - - // Check if it's EVL-vectorized and mark the corresponding metadata. - bool IsEVLVectorized = - llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) { - // Looking for the ExplictVectorLength VPInstruction. 
- if (const auto *VI = dyn_cast<VPInstruction>(&Recipe)) - return VI->getOpcode() == VPInstruction::ExplicitVectorLength; - return false; - }); - if (IsEVLVectorized) { - LLVMContext &Context = L->getHeader()->getContext(); - MDNode *LoopID = L->getLoopID(); - auto *IsEVLVectorizedMD = MDNode::get( - Context, - {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"), - MDString::get(Context, "evl")}); - MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {}, - {IsEVLVectorizedMD}); - L->setLoopID(NewLoopID); - } - } - TargetTransformInfo::UnrollingPreferences UP; - TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) - addRuntimeUnrollDisableMetaData(L); - } + // Add metadata to disable runtime unrolling a scalar loop when there + // are no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar(); + updateLoopMetadataAndProfileInfo( + HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB)) + : nullptr, + HeaderVPBB, VectorizingEpilogue, + estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()), + DisableRuntimeUnroll); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7460,15 +7274,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { - createVectorLoopSkeleton(""); +BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() { + BasicBlock *ScalarPH = createScalarPreheader(""); + BasicBlock *VectorPH = ScalarPH->getSinglePredecessor(); // Generate the code to check the minimum iteration count of the vector // epilogue (see below). EPI.EpilogueIterationCountCheck = - emitIterationCountCheck(LoopScalarPreHeader, true); + emitIterationCountCheck(VectorPH, ScalarPH, true); EPI.EpilogueIterationCountCheck->setName("iter.check"); + VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator()) + ->getSuccessor(1); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case // that goes directly through the vector epilogue. The longer-path length for @@ -7476,9 +7293,10 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { // trip count. Note: the branch will get updated later on when we vectorize // the epilogue. EPI.MainLoopIterationCountCheck = - emitIterationCountCheck(LoopScalarPreHeader, false); + emitIterationCountCheck(VectorPH, ScalarPH, false); - return LoopVectorPreHeader; + return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator()) + ->getSuccessor(1); } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7498,35 +7316,33 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { }); } -BasicBlock * -EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, - bool ForEpilogue) { +BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck( + BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) { assert(Bypass && "Expected valid bypass basic block."); Value *Count = getTripCount(); MinProfitableTripCount = ElementCount::getFixed(0); - Value *CheckMinIters = - createIterationCountCheck(ForEpilogue ? 
EPI.EpilogueVF : EPI.MainLoopVF, - ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); + Value *CheckMinIters = createIterationCountCheck( + VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF, + ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF); - BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + BasicBlock *const TCCheckBlock = VectorPH; if (!ForEpilogue) TCCheckBlock->setName("vector.main.loop.iter.check"); // Create new preheader for vector loop. - LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - static_cast<DominatorTree *>(nullptr), LI, - nullptr, "vector.ph"); + VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + static_cast<DominatorTree *>(nullptr), LI, nullptr, + "vector.ph"); if (ForEpilogue) { // Save the trip count so we don't have to regenerate it in the // vec.epilog.iter.check. This is safe to do because the trip count // generated here dominates the vector epilog iter check. EPI.TripCount = Count; } else { - VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH); } - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); @@ -7546,19 +7362,18 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
-BasicBlock * -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { - createVectorLoopSkeleton("vec.epilog."); - +BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() { + BasicBlock *ScalarPH = createScalarPreheader("vec.epilog."); + BasicBlock *VectorPH = ScalarPH->getSinglePredecessor(); // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. - LoopVectorPreHeader->setName("vec.epilog.ph"); + VectorPH->setName("vec.epilog.ph"); BasicBlock *VecEpilogueIterationCountCheck = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, - nullptr, "vec.epilog.iter.check", true); - VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); + SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr, + "vec.epilog.iter.check", true); + VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH); - emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, + emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH, VecEpilogueIterationCountCheck); AdditionalBypassBlock = VecEpilogueIterationCountCheck; @@ -7567,23 +7382,22 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && "expected this to be saved from the previous pass."); EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopVectorPreHeader); + VecEpilogueIterationCountCheck, VectorPH); EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); // Adjust the terminators of runtime check blocks and phis using them. 
BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second; BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second; if (SCEVCheckBlock) SCEVCheckBlock->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); if (MemCheckBlock) MemCheckBlock->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, LoopScalarPreHeader); + VecEpilogueIterationCountCheck, ScalarPH); - DT->changeImmediateDominator(LoopScalarPreHeader, - EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck); // The vec.epilog.iter.check block may contain Phi nodes from inductions or // reductions which merge control-flow from the latch block and the middle @@ -7592,7 +7406,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis())); for (PHINode *Phi : PhisInBlock) { - Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt()); + Phi->moveBefore(VectorPH->getFirstNonPHIIt()); Phi->replaceIncomingBlockWith( VecEpilogueIterationCountCheck->getSinglePredecessor(), VecEpilogueIterationCountCheck); @@ -7612,12 +7426,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { Phi->removeIncomingValue(MemCheckBlock); } - return LoopVectorPreHeader; + return VectorPH; } BasicBlock * EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - BasicBlock *Bypass, BasicBlock *Insert) { + BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) { assert(EPI.TripCount && "Expected trip count to have been saved in the first pass."); @@ -7637,23 +7451,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( EPI.EpilogueVF, EPI.EpilogueUF), "min.epilog.iters.check"); - BranchInst &BI = - *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); - if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 
- auto VScale = Cost->getVScaleForTuning(); - unsigned MainLoopStep = - estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); - unsigned EpilogueLoopStep = - estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); - // We assume the remaining `Count` is equally distributed in - // [0, MainLoopStep) - // So the probability for `Count < EpilogueLoopStep` should be - // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep - unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); - const uint32_t Weights[] = {EstimatedSkipCount, - MainLoopStep - EstimatedSkipCount}; - setBranchWeights(BI, Weights, /*IsExpected=*/false); - } + BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters); + auto VScale = Cost->getVScaleForTuning(); + unsigned MainLoopStep = + estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); + unsigned EpilogueLoopStep = + estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); + // We assume the remaining `Count` is equally distributed in + // [0, MainLoopStep) + // So the probability for `Count < EpilogueLoopStep` should be + // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep + // TODO: Improve the estimate by taking the estimated trip count into + // consideration. + unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); + const uint32_t Weights[] = {EstimatedSkipCount, + MainLoopStep - EstimatedSkipCount}; + setBranchWeights(BI, Weights, /*IsExpected=*/false); ReplaceInstWithInst(Insert->getTerminator(), &BI); // A new entry block has been created for the epilogue VPlan. 
Hook it in, as @@ -8634,8 +8447,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(), - Range); + VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit()); VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck, CM.foldTailByMasking()); @@ -8761,10 +8573,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range); - if (!Recipe) { - SmallVector<VPValue *, 4> Operands(R.operands()); - Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range); - } + if (!Recipe) + Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range); RecipeBuilder.setRecipe(Instr, Recipe); if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) { @@ -8790,7 +8600,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // to remove the need to keep a map of masks beyond the predication // transform. RecipeBuilder.updateBlockMaskCache(Old2New); - for (const auto &[Old, _] : Old2New) + for (VPValue *Old : Old2New.keys()) Old->getDefiningRecipe()->eraseFromParent(); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && @@ -8851,41 +8661,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); - // Replace VPValues for known constant strides guaranteed by predicate scalar - // evolution. 
- auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { - auto *R = cast<VPRecipeBase>(&U); - return R->getParent()->getParent() || - R->getParent() == - Plan->getVectorLoopRegion()->getSinglePredecessor(); - }; - for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { - auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); - auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); - // Only handle constant strides for now. - if (!ScevStride) - continue; - - auto *CI = Plan->getOrAddLiveIn( - ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); - if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) - StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); - - // The versioned value may not be used in the loop directly but through a - // sext/zext. Add new live-ins in those cases. - for (Value *U : StrideV->users()) { - if (!isa<SExtInst, ZExtInst>(U)) - continue; - VPValue *StrideVPV = Plan->getLiveIn(U); - if (!StrideVPV) - continue; - unsigned BW = U->getType()->getScalarSizeInBits(); - APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW) - : ScevStride->getAPInt().zext(BW); - VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); - StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); - } - } + // Replace VPValues for known constant strides. 
+ VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE, + Legal->getLAI()->getSymbolicStrides()); auto BlockNeedsPredication = [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); @@ -8926,7 +8704,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { OrigLoop, *LI, Legal->getWidestInductionType(), getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); VPlanTransforms::handleEarlyExits(*Plan, - /*HasUncountableExit*/ false, Range); + /*HasUncountableExit*/ false); VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true, /*TailFolded*/ false); @@ -9316,7 +9094,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( void LoopVectorizationPlanner::attachRuntimeChecks( VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const { const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks(); - if (SCEVCheckBlock) { + if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) { assert((!CM.OptForSize || CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) && "Cannot SCEV check stride or overflow when optimizing for size"); @@ -9324,7 +9102,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( HasBranchWeights); } const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks(); - if (MemCheckBlock) { + if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) { // VPlan-native path does not do any analysis for runtime checks // currently. assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) && @@ -9350,6 +9128,29 @@ void LoopVectorizationPlanner::attachRuntimeChecks( } } +void LoopVectorizationPlanner::addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount) const { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. 
+ bool IsIndvarOverflowCheckNeededForVF = + VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() && + !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) && + CM.getTailFoldingStyle() != + TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + const uint32_t *BranchWeigths = + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()) + ? &MinItersBypassWeights[0] + : nullptr; + VPlanTransforms::addMinimumIterationCheck( + Plan, VF, UF, MinProfitableTripCount, + CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(), + IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeigths, + OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), + *PSE.getSE()); +} + void VPDerivedIVRecipe::execute(VPTransformState &State) { assert(!State.Lane && "VPDerivedIVRecipe being replicated."); @@ -9465,17 +9266,18 @@ static bool processLoopInVPlanNativePath( { GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind); - InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM, + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM, BFI, PSI, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); + LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1, + VF.MinProfitableTripCount); + + LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false); } reportVectorization(ORE, L, VF, 1); - // Mark the loop as already vectorized to avoid vectorizing again. 
- Hints.setAlreadyVectorized(); assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -9929,6 +9731,43 @@ static Value *createInductionAdditionalBypassValues( return EndValueFromAdditionalBypass; } +static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, + VPlan &BestEpiPlan, + LoopVectorizationLegality &LVL, + const SCEV2ValueTy &ExpandedSCEVs, + Value *MainVectorTripCount) { + // Fix reduction resume values from the additional bypass block. + BasicBlock *PH = L->getLoopPreheader(); + for (auto *Pred : predecessors(PH)) { + for (PHINode &Phi : PH->phis()) { + if (Phi.getBasicBlockIndex(Pred) != -1) + continue; + Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred); + } + } + auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader()); + if (ScalarPH->hasPredecessors()) { + // If ScalarPH has predecessors, we may need to update its reduction + // resume values. + for (const auto &[R, IRPhi] : + zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) { + fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), IRPhi, + BypassBlock); + } + } + + // Fix induction resume values from the additional bypass block. + IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt()); + for (const auto &[IVPhi, II] : LVL.getInductionVars()) { + auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); + Value *V = createInductionAdditionalBypassValues( + IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount, + LVL.getPrimaryInduction()); + // TODO: Directly add as extra operand to the VPResumePHI recipe. + Inc->setIncomingValueForBlock(BypassBlock, V); + } +} + bool LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); @@ -9971,7 +9810,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. 
LoopVectorizationRequirements Requirements; LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, - &Requirements, &Hints, DB, AC, BFI, PSI); + &Requirements, &Hints, DB, AC, BFI, PSI, AA); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -9985,6 +9824,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } + if (!LVL.getPotentiallyFaultingLoads().empty()) { + reportVectorizationFailure("Auto-vectorization of loops with potentially " + "faulting load is not supported", + "PotentiallyFaultingLoadsNotSupported", ORE, L); + return false; + } + // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify @@ -10251,128 +10097,80 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - bool DisableRuntimeUnroll = false; - MDNode *OrigLoopID = L->getLoopID(); - { + // Report the vectorization decision. + if (VF.Width.isScalar()) { using namespace ore; - if (!VectorizeLoop) { - assert(IC > 1 && "interleave count should not be 1 or 0"); - // If we decided that it is not legal to vectorize the loop, then - // interleave it. - VPlan &BestPlan = LVP.getPlanFor(VF.Width); - InnerLoopVectorizer Unroller( - L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1), - ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan); - - // TODO: Move to general VPlan pipeline once epilogue loops are also - // supported. 
- VPlanTransforms::runPass( - VPlanTransforms::materializeConstantVectorTripCount, BestPlan, - VF.Width, IC, PSE); - - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); + assert(IC > 1); + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"; + }); + } else { + // Report the vectorization decision. + reportVectorization(ORE, L, VF, IC); + } + if (ORE->allowExtraAnalysis(LV_NAME)) + checkMixedPrecision(L, ORE); - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), - L->getHeader()) - << "interleaved loop (interleaved count: " - << NV("InterleaveCount", IC) << ")"; - }); - } else { - // If we decided that it is *legal* to vectorize the loop, then do it. - - VPlan &BestPlan = LVP.getPlanFor(VF.Width); - // Consider vectorizing the epilogue too if it's profitable. - VectorizationFactor EpilogueVF = - LVP.selectEpilogueVectorizationFactor(VF.Width, IC); - if (EpilogueVF.Width.isVector()) { - std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); - - // The first pass vectorizes the main loop and creates a scalar epilogue - // to be vectorized by executing the plan (potentially with a different - // factor) again shortly afterwards. - VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); - BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block"); - preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); - EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, - BestEpiPlan); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, - BFI, PSI, Checks, *BestMainPlan); - auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, - *BestMainPlan, MainILV, DT, false); - ++LoopsVectorized; - - // Second pass vectorizes the epilogue and adjusts the control flow - // edges from the first pass. 
- EpilogueVectorizerEpilogueLoop EpilogILV( - L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan); - EpilogILV.setTripCount(MainILV.getTripCount()); - preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); - - LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, - DT, true); - - // Fix induction resume values from the additional bypass block. - BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock(); - IRBuilder<> BypassBuilder(BypassBlock, - BypassBlock->getFirstInsertionPt()); - BasicBlock *PH = L->getLoopPreheader(); - for (const auto &[IVPhi, II] : LVL.getInductionVars()) { - auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); - Value *V = createInductionAdditionalBypassValues( - IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount, - LVL.getPrimaryInduction()); - // TODO: Directly add as extra operand to the VPResumePHI recipe. - Inc->setIncomingValueForBlock(BypassBlock, V); - } - ++LoopsEpilogueVectorized; + // If we decided that it is *legal* to interleave or vectorize the loop, then + // do it. - if (!Checks.hasChecks()) - DisableRuntimeUnroll = true; - } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, - VF.MinProfitableTripCount, IC, &CM, BFI, PSI, - Checks, BestPlan); - // TODO: Move to general VPlan pipeline once epilogue loops are also - // supported. - VPlanTransforms::runPass( - VPlanTransforms::materializeConstantVectorTripCount, BestPlan, - VF.Width, IC, PSE); - - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there - // are no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!Checks.hasChecks()) - DisableRuntimeUnroll = true; - } - // Report the vectorization decision. 
- reportVectorization(ORE, L, VF, IC); - } + VPlan &BestPlan = LVP.getPlanFor(VF.Width); + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + LVP.selectEpilogueVectorizationFactor(VF.Width, IC); + if (EpilogueVF.Width.isVector()) { + std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); + BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block"); + preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); + EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, + BestEpiPlan); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, + PSI, Checks, *BestMainPlan); + auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, + *BestMainPlan, MainILV, DT, false); + ++LoopsVectorized; + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, + BFI, PSI, Checks, BestEpiPlan); + EpilogILV.setTripCount(MainILV.getTripCount()); + preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); + + LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT, + true); + + fixScalarResumeValuesFromBypass(EpilogILV.getAdditionalBypassBlock(), L, + BestEpiPlan, LVL, ExpandedSCEVs, + EPI.VectorTripCount); + ++LoopsEpilogueVectorized; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI, + Checks, BestPlan); + // TODO: Move to general VPlan pipeline once epilogue loops are also + // supported. 
+ VPlanTransforms::runPass( + VPlanTransforms::materializeConstantVectorTripCount, BestPlan, VF.Width, + IC, PSE); + LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC, + VF.MinProfitableTripCount); - if (ORE->allowExtraAnalysis(LV_NAME)) - checkMixedPrecision(L, ORE); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); + ++LoopsVectorized; } assert(DT->verify(DominatorTree::VerificationLevel::Fast) && "DT not preserved correctly"); + assert(!verifyFunction(*F, &dbgs())); - std::optional<MDNode *> RemainderLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupEpilogue}); - if (RemainderLoopID) { - L->setLoopID(*RemainderLoopID); - } else { - if (DisableRuntimeUnroll) - addRuntimeUnrollDisableMetaData(L); - - // Mark the loop as already vectorized to avoid vectorizing again. - Hints.setAlreadyVectorized(); - } - - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -10449,6 +10247,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, DB = &AM.getResult<DemandedBitsAnalysis>(F); ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); LAIs = &AM.getResult<LoopAccessAnalysis>(F); + AA = &AM.getResult<AAManager>(F); auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 37dc41413966..6a56dbfaa015 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -967,9 +967,7 @@ class BinOpSameOpcodeHelper { return false; } bool equal(unsigned Opcode) { - if (Opcode == I->getOpcode()) - return trySet(MainOpBIT, MainOpBIT); - return false; + return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT); } unsigned getOpcode() const { MaskType Candidate = Mask & SeenBefore; @@ -5576,7 +5574,23 @@ private: if (auto *SD = 
dyn_cast<ScheduleData>(Data)) { SD->setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ProcessBundleMember(SD, {}); + SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles; + SmallVector<ScheduleBundle *> Bundles; + Instruction *In = SD->getInst(); + if (R.isVectorized(In)) { + ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In); + for (TreeEntry *TE : Entries) { + if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(In) && + In->getNumOperands() != TE->getNumOperands()) + continue; + auto &BundlePtr = + PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>()); + BundlePtr->setTreeEntry(TE); + BundlePtr->add(SD); + Bundles.push_back(BundlePtr.get()); + } + } + ProcessBundleMember(SD, Bundles); } else { ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data); Bundle.setScheduled(/*Scheduled=*/true); @@ -6325,17 +6339,11 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) { } /// Checks if the provided list of pointers \p Pointers represents the strided -/// pointers for type ElemTy. If they are not, std::nullopt is returned. -/// Otherwise, if \p Inst is not specified, just initialized optional value is -/// returned to show that the pointers represent strided pointers. If \p Inst -/// specified, the runtime stride is materialized before the given \p Inst. -/// \returns std::nullopt if the pointers are not pointers with the runtime -/// stride, nullptr or actual stride value, otherwise. -static std::optional<Value *> -calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl<unsigned> &SortedIndices, - Instruction *Inst = nullptr) { +/// pointers for type ElemTy. If they are not, nullptr is returned. +/// Otherwise, SCEV* of the stride value is returned. 
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl<unsigned> &SortedIndices) { SmallVector<const SCEV *> SCEVs; const SCEV *PtrSCEVLowest = nullptr; const SCEV *PtrSCEVHighest = nullptr; @@ -6344,7 +6352,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, for (Value *Ptr : PointerOps) { const SCEV *PtrSCEV = SE.getSCEV(Ptr); if (!PtrSCEV) - return std::nullopt; + return nullptr; SCEVs.push_back(PtrSCEV); if (!PtrSCEVLowest && !PtrSCEVHighest) { PtrSCEVLowest = PtrSCEVHighest = PtrSCEV; @@ -6352,14 +6360,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, } const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); if (isa<SCEVCouldNotCompute>(Diff)) - return std::nullopt; + return nullptr; if (Diff->isNonConstantNegative()) { PtrSCEVLowest = PtrSCEV; continue; } const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV); if (isa<SCEVCouldNotCompute>(Diff1)) - return std::nullopt; + return nullptr; if (Diff1->isNonConstantNegative()) { PtrSCEVHighest = PtrSCEV; continue; @@ -6368,7 +6376,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, // Dist = PtrSCEVHighest - PtrSCEVLowest; const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest); if (isa<SCEVCouldNotCompute>(Dist)) - return std::nullopt; + return nullptr; int Size = DL.getTypeStoreSize(ElemTy); auto TryGetStride = [&](const SCEV *Dist, const SCEV *Multiplier) -> const SCEV * { @@ -6389,10 +6397,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1)); Stride = TryGetStride(Dist, Sz); if (!Stride) - return std::nullopt; + return nullptr; } if (!Stride || isa<SCEVConstant>(Stride)) - return std::nullopt; + return nullptr; // Iterate through all pointers and check if all distances are // unique multiple of Stride. 
using DistOrdPair = std::pair<int64_t, int>; @@ -6406,28 +6414,28 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest); const SCEV *Coeff = TryGetStride(Diff, Stride); if (!Coeff) - return std::nullopt; + return nullptr; const auto *SC = dyn_cast<SCEVConstant>(Coeff); if (!SC || isa<SCEVCouldNotCompute>(SC)) - return std::nullopt; + return nullptr; if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest, SE.getMulExpr(Stride, SC))) ->isZero()) - return std::nullopt; + return nullptr; Dist = SC->getAPInt().getZExtValue(); } // If the strides are not the same or repeated, we can't vectorize. if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) - return std::nullopt; + return nullptr; auto Res = Offsets.emplace(Dist, Cnt); if (!Res.second) - return std::nullopt; + return nullptr; // Consecutive order if the inserted element is the last one. IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end(); ++Cnt; } if (Offsets.size() != SCEVs.size()) - return std::nullopt; + return nullptr; SortedIndices.clear(); if (!IsConsecutive) { // Fill SortedIndices array only if it is non-consecutive. @@ -6438,10 +6446,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, ++Cnt; } } - if (!Inst) - return nullptr; - SCEVExpander Expander(SE, DL, "strided-load-vec"); - return Expander.expandCodeFor(Stride, Stride->getType(), Inst); + return Stride; } static std::pair<InstructionCost, InstructionCost> @@ -8030,11 +8035,11 @@ void BoUpSLP::reorderTopToBottom() { // it is an attempt to reorder node with reused scalars but with // external uses. 
if (OpTE->getVectorFactor() != OpTE->Scalars.size()) { - OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += + OrdersUses.try_emplace(OrdersType(), 0).first->second += ExternalUserReorderIndices.size(); } else { for (const OrdersType &ExtOrder : ExternalUserReorderIndices) - ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; + ++OrdersUses.try_emplace(ExtOrder, 0).first->second; } // No other useful reorder data in this entry. if (Order.empty()) @@ -8054,9 +8059,9 @@ void BoUpSLP::reorderTopToBottom() { return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); - ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; + ++OrdersUses.try_emplace(CurrentOrder, 0).first->second; } else { - ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; + ++OrdersUses.try_emplace(Order, 0).first->second; } } if (OrdersUses.empty()) @@ -8480,12 +8485,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); }); fixupOrderingIndices(CurrentOrder); - OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second += - NumOps; + OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps; } else { - OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps; + OrdersUses.try_emplace(Order, 0).first->second += NumOps; } - auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); + auto Res = OrdersUses.try_emplace(OrdersType(), 0); const auto AllowsReordering = [&](const TreeEntry *TE) { if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || @@ -10639,8 +10643,19 @@ class InstructionsCompatibilityAnalysis { } } } - if (MainOp) + if (MainOp) { + // Do not match, if any copyable is a terminator from the same block as + // the main operation. 
+ if (any_of(VL, [&](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && I->getParent() == MainOp->getParent() && + I->isTerminator(); + })) { + MainOp = nullptr; + return; + } MainOpcode = MainOp->getOpcode(); + } } /// Returns the idempotent value for the \p MainOp with the detected \p @@ -11013,7 +11028,10 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( } SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars); if (all_of(VL, [&](Value *V) { - return isa<PoisonValue>(V) || Values.contains(V); + return isa<PoisonValue>(V) || Values.contains(V) || + (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) && + LI->getLoopFor(S.getMainOp()->getParent()) && + isVectorized(V)); })) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); return ScalarsVectorizationLegality(S, /*IsLegal=*/false); @@ -17835,6 +17853,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL)))); } + Value *getVectorizedValue(const TreeEntry &E) { + Value *Vec = E.VectorizedValue; + if (!Vec->getType()->isIntOrIntVectorTy()) + return Vec; + return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) { + return !isa<PoisonValue>(V) && + !isKnownNonNegative( + V, SimplifyQuery(*R.DL)); + })); + } + public: ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R) : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {} @@ -18001,35 +18030,14 @@ public: /// Adds 2 input vectors (in form of tree entries) and the mask for their /// shuffling. 
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { - Value *V1 = E1.VectorizedValue; - if (V1->getType()->isIntOrIntVectorTy()) - V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); - Value *V2 = E2.VectorizedValue; - if (V2->getType()->isIntOrIntVectorTy()) - V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V1 = getVectorizedValue(E1); + Value *V2 = getVectorizedValue(E2); add(V1, V2, Mask); } /// Adds single input vector (in form of tree entry) and the mask for its /// shuffling. void add(const TreeEntry &E1, ArrayRef<int> Mask) { - Value *V1 = E1.VectorizedValue; - if (V1->getType()->isIntOrIntVectorTy()) - V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V1 = getVectorizedValue(E1); add(V1, Mask); } /// Adds 2 input vectors and the mask for their shuffling. @@ -18178,14 +18186,7 @@ public: auto CreateSubVectors = [&](Value *Vec, SmallVectorImpl<int> &CommonMask) { for (auto [E, Idx] : SubVectors) { - Value *V = E->VectorizedValue; - if (V->getType()->isIntOrIntVectorTy()) - V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) { - if (isa<PoisonValue>(V)) - return false; - return !isKnownNonNegative( - V, SimplifyQuery(*R.DL)); - })); + Value *V = getVectorizedValue(*E); unsigned InsertionIndex = Idx * getNumElements(ScalarTy); // Use scalar version of the SCalarType to correctly handle shuffles // for revectorization. 
The revectorization mode operates by the @@ -19526,11 +19527,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return cast<LoadInst>(V)->getPointerOperand(); }); OrdersType Order; - std::optional<Value *> Stride = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order, - &*Builder.GetInsertPoint()); + const SCEV *StrideSCEV = + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); + assert(StrideSCEV && "At this point stride should be known"); + SCEVExpander Expander(*SE, *DL, "strided-load-vec"); + Value *Stride = Expander.expandCodeFor( + StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint()); Value *NewStride = - Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true); + Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true); StrideVal = Builder.CreateMul( NewStride, ConstantInt::get( @@ -20519,7 +20523,9 @@ Value *BoUpSLP::vectorizeTree( !(GatheredLoadsEntriesFirst.has_value() && IE->Idx >= *GatheredLoadsEntriesFirst && VectorizableTree.front()->isGather() && - is_contained(VectorizableTree.front()->Scalars, I))) + is_contained(VectorizableTree.front()->Scalars, I)) && + !(!VectorizableTree.front()->isGather() && + VectorizableTree.front()->isCopyableElement(I))) continue; SmallVector<SelectInst *> LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { @@ -20782,6 +20788,14 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, continue; } auto *SD = cast<ScheduleData>(SE); + if (SD->hasValidDependencies() && + (!S.areInstructionsWithCopyableElements() || + !S.isCopyableElement(SD->getInst())) && + !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE && + EI.UserTE->hasState() && + (!EI.UserTE->hasCopyableElements() || + !EI.UserTE->isCopyableElement(SD->getInst()))) + SD->clearDirectDependencies(); for (const Use &U : SD->getInst()->operands()) { unsigned &NumOps = UserOpToNumOps @@ -20791,7 +20805,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 
BoUpSLP *SLP, if (auto *Op = dyn_cast<Instruction>(U.get()); Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op, *SLP, NumOps)) { - if (ScheduleData *OpSD = getScheduleData(Op)) { + if (ScheduleData *OpSD = getScheduleData(Op); + OpSD && OpSD->hasValidDependencies()) { OpSD->clearDirectDependencies(); if (RegionHasStackSave || !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) @@ -20977,7 +20992,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, ScheduleCopyableDataMapByUsers.erase(I); ScheduleCopyableDataMap.erase(KV); // Need to recalculate dependencies for the actual schedule data. - if (ScheduleData *OpSD = getScheduleData(I)) { + if (ScheduleData *OpSD = getScheduleData(I); + OpSD && OpSD->hasValidDependencies()) { OpSD->clearDirectDependencies(); if (RegionHasStackSave || !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst())) @@ -21881,6 +21897,10 @@ bool BoUpSLP::collectValuesToDemote( return TryProcessInstruction(BitWidth); case Instruction::ZExt: case Instruction::SExt: + if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() && + E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast && + E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy()) + return false; IsProfitableToDemote = true; return TryProcessInstruction(BitWidth); @@ -23797,9 +23817,7 @@ public: size_t Key, Idx; std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey, /*AllowAlternate=*/false); - ++PossibleReducedVals[Key][Idx] - .insert(std::make_pair(V, 0)) - .first->second; + ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second; } for (Instruction *I : reverse(PossibleReductionOps)) Worklist.emplace_back(I, I->getParent() == BB ? 
0 : Level + 1); @@ -23820,21 +23838,20 @@ public: stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) { return P1.size() > P2.size(); }); - int NewIdx = -1; + bool First = true; for (ArrayRef<Value *> Data : PossibleRedValsVect) { - if (NewIdx < 0 || - (!isGoodForReduction(Data) && - (!isa<LoadInst>(Data.front()) || - !isa<LoadInst>(ReducedVals[NewIdx].front()) || - getUnderlyingObject( - cast<LoadInst>(Data.front())->getPointerOperand()) != - getUnderlyingObject( - cast<LoadInst>(ReducedVals[NewIdx].front()) - ->getPointerOperand())))) { - NewIdx = ReducedVals.size(); + if (First) { + First = false; ReducedVals.emplace_back(); + } else if (!isGoodForReduction(Data)) { + auto *LI = dyn_cast<LoadInst>(Data.front()); + auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front()); + if (!LI || !LastLI || + getUnderlyingObject(LI->getPointerOperand()) != + getUnderlyingObject(LastLI->getPointerOperand())) + ReducedVals.emplace_back(); } - ReducedVals[NewIdx].append(Data.rbegin(), Data.rend()); + ReducedVals.back().append(Data.rbegin(), Data.rend()); } } // Sort the reduced values by number of same/alternate opcode and/or pointer @@ -23847,7 +23864,8 @@ public: /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, - const TargetLibraryInfo &TLI, AssumptionCache *AC) { + const TargetLibraryInfo &TLI, AssumptionCache *AC, + DominatorTree &DT) { constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce @@ -24164,9 +24182,7 @@ public: // previous vectorization attempts. if (any_of(VL, [&V](Value *RedVal) { auto *RedValI = dyn_cast<Instruction>(RedVal); - if (!RedValI) - return false; - return V.isDeleted(RedValI); + return RedValI && V.isDeleted(RedValI); })) break; V.buildTree(VL, IgnoreList); @@ -24248,7 +24264,7 @@ public: // Estimate cost. 
InstructionCost ReductionCost = - getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V); + getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI); InstructionCost Cost = V.getTreeCost(VL, ReductionCost); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); @@ -24553,7 +24569,9 @@ private: InstructionCost getReductionCost(TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals, bool IsCmpSelMinMax, FastMathFlags FMF, - const BoUpSLP &R) { + const BoUpSLP &R, DominatorTree &DT, + const DataLayout &DL, + const TargetLibraryInfo &TLI) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ScalarTy = ReducedVals.front()->getType(); unsigned ReduxWidth = ReducedVals.size(); @@ -24578,6 +24596,22 @@ private: for (User *U : RdxVal->users()) { auto *RdxOp = cast<Instruction>(U); if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) { + if (RdxKind == RecurKind::FAdd) { + InstructionCost FMACost = canConvertToFMA( + RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI); + if (FMACost.isValid()) { + LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n"); + if (auto *I = dyn_cast<Instruction>(RdxVal)) { + // Also, exclude scalar fmul cost. + InstructionCost FMulCost = + TTI->getInstructionCost(I, CostKind); + LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n"); + FMACost -= FMulCost; + } + ScalarCost += FMACost; + continue; + } + } ScalarCost += TTI->getInstructionCost(RdxOp, CostKind); continue; } @@ -24642,8 +24676,45 @@ private: auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( std::make_pair(RedTy, true)); VectorType *RVecTy = getWidenedType(RType, ReduxWidth); - VectorCost += - TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind); + InstructionCost FMACost = InstructionCost::getInvalid(); + if (RdxKind == RecurKind::FAdd) { + // Check if the reduction operands can be converted to FMA. 
+ SmallVector<Value *> Ops; + FastMathFlags FMF; + FMF.set(); + for (Value *RdxVal : ReducedVals) { + if (!RdxVal->hasOneUse()) { + Ops.clear(); + break; + } + if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal)) + FMF &= FPCI->getFastMathFlags(); + Ops.push_back(RdxVal->user_back()); + } + if (!Ops.empty()) { + FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL, + *TTI, TLI); + if (FMACost.isValid()) { + // Calculate actual FMAD cost. + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy, + {RVecTy, RVecTy, RVecTy}, FMF); + FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind); + + LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n"); + // Also, exclude vector fmul cost. + InstructionCost FMulCost = TTI->getArithmeticInstrCost( + Instruction::FMul, RVecTy, CostKind); + LLVM_DEBUG(dbgs() + << "Minus vector FMul cost: " << FMulCost << "\n"); + FMACost -= FMulCost; + } + } + } + if (FMACost.isValid()) + VectorCost += FMACost; + else + VectorCost += + TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind); if (RType != RedTy) { unsigned Opcode = Instruction::Trunc; if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits()) @@ -25311,7 +25382,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( HorizontalReduction HorRdx; if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) return nullptr; - return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); + return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT); }; auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { if (TryOperandsAsNewSeeds && FutureSeed == Root) { @@ -25456,7 +25527,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (RedCost >= ScalarCost) return false; - return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr; + return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr; }; if (Candidates.size() == 1) return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R); @@ -25540,7 +25611,7 @@ bool 
SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, template <typename T> static bool tryToVectorizeSequence( SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator, - function_ref<bool(T *, T *)> AreCompatible, + function_ref<bool(ArrayRef<T *>, T *)> AreCompatible, function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R) { bool Changed = false; @@ -25562,7 +25633,7 @@ static bool tryToVectorizeSequence( auto *SameTypeIt = IncIt; while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) || R.isDeleted(cast<Instruction>(*SameTypeIt)) || - AreCompatible(*SameTypeIt, *IncIt))) { + AreCompatible(VL, *SameTypeIt))) { auto *I = dyn_cast<Instruction>(*SameTypeIt); ++SameTypeIt; if (I && !R.isDeleted(I)) @@ -25760,10 +25831,10 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts, return compareCmp<false>(V, V2, *TLI, *DT); }; - auto AreCompatibleCompares = [&](Value *V1, Value *V2) { - if (V1 == V2) + auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) { + if (VL.empty() || VL.back() == V1) return true; - return compareCmp<true>(V1, V2, *TLI, *DT); + return compareCmp<true>(V1, VL.back(), *TLI, *DT); }; SmallVector<Value *> Vals; @@ -25969,9 +26040,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } return false; }; - auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) { - if (V1 == V2) + auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL, + Value *V1) { + if (VL.empty() || V1 == VL.back()) return true; + Value *V2 = VL.back(); if (V1->getType() != V2->getType()) return false; ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; @@ -26061,7 +26134,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { InstSetVector PostProcessInserts; SmallSetVector<CmpInst *, 8> PostProcessCmps; - // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true + // Vectorizes 
Inserts in `PostProcessInserts` and if `VectorizeCmps` is true // also vectorizes `PostProcessCmps`. auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) { bool Changed = vectorizeInserts(PostProcessInserts, BB, R); @@ -26342,7 +26415,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { V2->getValueOperand()->getValueID(); }; - auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { + bool SameParent = true; + auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) { + if (VL.empty()) { + SameParent = true; + return true; + } + StoreInst *V2 = VL.back(); if (V1 == V2) return true; if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) @@ -26353,15 +26432,34 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (isa<UndefValue>(V1->getValueOperand()) || isa<UndefValue>(V2->getValueOperand())) return true; - if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand())) - if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { - if (I1->getParent() != I2->getParent()) - return false; - return getSameOpcode({I1, I2}, *TLI).valid(); - } if (isa<Constant>(V1->getValueOperand()) && isa<Constant>(V2->getValueOperand())) return true; + // Check if the operands of the stores can be vectorized. They can be + // vectorized, if they have compatible operands or have operands, which can + // be vectorized as copyables. + auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()); + auto *I2 = dyn_cast<Instruction>(V2->getValueOperand()); + if (I1 || I2) { + // Accept only tail-following non-compatible values for now. + // TODO: investigate if it is possible to vectorize incompatible values, + // if the copyables are first in the list. 
+ if (I1 && !I2) + return false; + SameParent &= I1 && I2 && I1->getParent() == I2->getParent(); + SmallVector<Value *> NewVL(VL.size() + 1); + for (auto [SI, V] : zip(VL, NewVL)) + V = SI->getValueOperand(); + NewVL.back() = V1->getValueOperand(); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true, + /*SkipSameCodeCheck=*/!SameParent); + if (S) + return true; + if (!SameParent) + return false; + } return V1->getValueOperand()->getValueID() == V2->getValueOperand()->getValueID(); }; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f972efa07eb7..16b1b539345d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -45,6 +45,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <cassert> #include <string> @@ -55,6 +56,15 @@ namespace llvm { extern cl::opt<bool> EnableVPlanNativePath; } +/// @{ +/// Metadata attribute names +const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; +const char LLVMLoopVectorizeFollowupVectorized[] = + "llvm.loop.vectorize.followup_vectorized"; +const char LLVMLoopVectorizeFollowupEpilogue[] = + "llvm.loop.vectorize.followup_epilogue"; +/// @} + extern cl::opt<unsigned> ForceTargetInstructionCost; static cl::opt<bool> PrintVPlansInDotFormat( @@ -143,7 +153,7 @@ template <typename T> static T *getPlanEntry(T *Start) { for (unsigned i = 0; i < WorkList.size(); i++) { T *Current = WorkList[i]; - if (Current->getNumPredecessors() == 0) + if (!Current->hasPredecessors()) return Current; auto &Predecessors = Current->getPredecessors(); WorkList.insert_range(Predecessors); @@ -216,7 +226,7 @@ bool 
VPBlockUtils::isHeader(const VPBlockBase *VPB, // If VPBB is in a region R, VPBB is a loop header if R is a loop region with // VPBB as its entry, i.e., free of predecessors. if (auto *R = VPBB->getParent()) - return !R->isReplicator() && VPBB->getNumPredecessors() == 0; + return !R->isReplicator() && !VPBB->hasPredecessors(); // A header dominates its second predecessor (the latch), with the other // predecessor being the preheader @@ -493,6 +503,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) { void VPIRBasicBlock::execute(VPTransformState *State) { assert(getHierarchicalSuccessors().size() <= 2 && "VPIRBasicBlock can have at most two successors at the moment!"); + // Move completely disconnected blocks to their final position. + if (IRBB->hasNPredecessors(0) && succ_begin(IRBB) == succ_end(IRBB)) + IRBB->moveAfter(State->CFG.PrevBB); State->Builder.SetInsertPoint(IRBB->getTerminator()); State->CFG.PrevBB = IRBB; State->CFG.VPBB2IRBB[this] = IRBB; @@ -809,7 +822,7 @@ InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) { const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const { const VPBlockBase *Pred = nullptr; - if (getNumPredecessors() > 0) { + if (hasPredecessors()) { Pred = getPredecessors()[Idx]; } else { auto *Region = getParent(); @@ -1183,14 +1196,14 @@ VPlan *VPlan::duplicate() { BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock(); VPIRBasicBlock *NewScalarHeader = nullptr; - if (getScalarHeader()->getNumPredecessors() == 0) { - NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB); - } else { + if (getScalarHeader()->hasPredecessors()) { NewScalarHeader = cast<VPIRBasicBlock>(*find_if( vp_depth_first_shallow(NewEntry), [ScalarHeaderIRBB](VPBlockBase *VPB) { auto *VPIRBB = dyn_cast<VPIRBasicBlock>(VPB); return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB; })); + } else { + NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB); } // Create VPlan, clone live-ins and 
remap operands in the cloned blocks. auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader); @@ -1473,7 +1486,7 @@ void VPSlotTracker::assignName(const VPValue *V) { std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str(); // First assign the base name for V. - const auto &[A, _] = VPValue2Name.insert({V, BaseName}); + const auto &[A, _] = VPValue2Name.try_emplace(V, BaseName); // Integer or FP constants with different types will result in he same string // due to stripping types. if (V->isLiveIn() && isa<ConstantInt, ConstantFP>(UV)) @@ -1481,7 +1494,7 @@ void VPSlotTracker::assignName(const VPValue *V) { // If it is already used by C > 0 other VPValues, increase the version counter // C and use it for V. - const auto &[C, UseInserted] = BaseName2Version.insert({BaseName, 0}); + const auto &[C, UseInserted] = BaseName2Version.try_emplace(BaseName, 0); if (!UseInserted) { C->second++; A->second = (BaseName + Twine(".") + Twine(C->second)).str(); @@ -1612,6 +1625,123 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const { llvm_unreachable("No plan found!"); } +static void addRuntimeUnrollDisableMetaData(Loop *L) { + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + bool IsUnrollMetadata = false; + MDNode *LoopID = L->getLoopID(); + if (LoopID) { + // First find existing loop unrolling disable metadata. + for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { + auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); + if (MD) { + const auto *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) + continue; + if (S->getString().starts_with("llvm.loop.unroll.runtime.disable")) + continue; + IsUnrollMetadata = + S->getString().starts_with("llvm.loop.unroll.disable"); + } + MDs.push_back(LoopID->getOperand(I)); + } + } + + if (!IsUnrollMetadata) { + // Add runtime unroll disable metadata. 
+ LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back( + MDString::get(Context, "llvm.loop.unroll.runtime.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + } +} + +void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo( + Loop *VectorLoop, VPBasicBlock *HeaderVPBB, bool VectorizingEpilogue, + unsigned EstimatedVFxUF, bool DisableRuntimeUnroll) { + MDNode *LID = OrigLoop->getLoopID(); + // Update the metadata of the scalar loop. Skip the update when vectorizing + // the epilogue loop, to ensure it is only updated once. + if (!VectorizingEpilogue) { + std::optional<MDNode *> RemainderLoopID = makeFollowupLoopID( + LID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); + if (RemainderLoopID) { + OrigLoop->setLoopID(*RemainderLoopID); + } else { + if (DisableRuntimeUnroll) + addRuntimeUnrollDisableMetaData(OrigLoop); + + LoopVectorizeHints Hints(OrigLoop, true, *ORE); + Hints.setAlreadyVectorized(); + } + } + + if (!VectorLoop) + return; + + if (std::optional<MDNode *> VectorizedLoopID = + makeFollowupLoopID(LID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized})) { + VectorLoop->setLoopID(*VectorizedLoopID); + } else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (LID) + VectorLoop->setLoopID(LID); + + if (!VectorizingEpilogue) { + LoopVectorizeHints Hints(VectorLoop, true, *ORE); + Hints.setAlreadyVectorized(); + } + + // Check if it's EVL-vectorized and mark the corresponding metadata. + bool IsEVLVectorized = + llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) { + // Looking for the ExplictVectorLength VPInstruction. 
+ if (const auto *VI = dyn_cast<VPInstruction>(&Recipe)) + return VI->getOpcode() == VPInstruction::ExplicitVectorLength; + return false; + }); + if (IsEVLVectorized) { + LLVMContext &Context = VectorLoop->getHeader()->getContext(); + MDNode *LoopID = VectorLoop->getLoopID(); + auto *IsEVLVectorizedMD = MDNode::get( + Context, + {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"), + MDString::get(Context, "evl")}); + MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {}, + {IsEVLVectorizedMD}); + VectorLoop->setLoopID(NewLoopID); + } + } + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(VectorLoop, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) + addRuntimeUnrollDisableMetaData(VectorLoop); + + // Set/update profile weights for the vector and remainder loops as original + // loop iterations are now distributed among them. Note that original loop + // becomes the scalar remainder loop after vectorization. + // + // For cases like foldTailByMasking() and requiresScalarEpiloque() we may + // end up getting slightly roughened result but that should be OK since + // profile is not inherently precise anyway. Note also possible bypass of + // vector code caused by legality checks is ignored, assigning all the weight + // to the vector loop, optimistically. + // + // For scalable vectorization we can't know at compile time how many + // iterations of the loop are handled in one vector iteration, so instead + // use the value of vscale used for tuning. 
+ setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LoopVectorizationPlanner::printPlans(raw_ostream &O) { if (VPlans.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d6bc462a0dfa..53291a931530 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -219,6 +219,9 @@ public: size_t getNumSuccessors() const { return Successors.size(); } size_t getNumPredecessors() const { return Predecessors.size(); } + /// Returns true if this block has any predecessors. + bool hasPredecessors() const { return !Predecessors.empty(); } + /// An Enclosing Block of a block B is any block containing B, including B /// itself. \return the closest enclosing block starting from "this", which /// has successors. \return the root enclosing block if all enclosing blocks @@ -400,7 +403,7 @@ class LLVM_ABI_FOR_TEST VPRecipeBase public: VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPDef(SC), VPUser(Operands), DL(DL) {} virtual ~VPRecipeBase() = default; @@ -518,11 +521,11 @@ protected: class VPSingleDefRecipe : public VPRecipeBase, public VPValue { public: VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(SC, Operands, DL), VPValue(this) {} VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands, - Value *UV, DebugLoc DL = {}) + Value *UV, DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {} static inline bool classof(const VPRecipeBase *R) { @@ -557,6 +560,7 @@ public: case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: + case VPRecipeBase::VPInterleaveEVLSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: case 
VPRecipeBase::VPWidenLoadEVLSC: @@ -712,12 +716,15 @@ public: VPIRFlags(GEPNoWrapFlags GEPFlags) : OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {} -public: void transferFlags(VPIRFlags &Other) { OpType = Other.OpType; AllFlags = Other.AllFlags; } + /// Only keep flags also present in \p Other. \p Other must have the same + /// OpType as the current object. + void intersectFlags(const VPIRFlags &Other); + /// Drop all poison-generating flags. void dropPoisonGeneratingFlags() { // NOTE: This needs to be kept in-sync with @@ -864,7 +871,7 @@ public: /// using IR flags. struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags() {} VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, @@ -872,7 +879,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), VPIRFlags(I) {} VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = {}) + const VPIRFlags &Flags, + DebugLoc DL = DebugLoc::getUnknown()) : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags(Flags) {} static inline bool classof(const VPRecipeBase *R) { @@ -900,6 +908,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags { return R && classof(R); } + static inline bool classof(const VPSingleDefRecipe *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && classof(R); + } + void execute(VPTransformState &State) override = 0; /// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx. @@ -975,6 +988,10 @@ public: Not, SLPLoad, SLPStore, + // Creates a mask where each lane is active (true) whilst the current + // counter (first operand + index) is less than the second operand. i.e. 
+ // mask[i] = icmpt ult (op0 + i), op1 + // The size of the mask returned is VF * Multiplier (UF, third op). ActiveLaneMask, ExplicitVectorLength, CalculateTripCountMinusVF, @@ -1014,7 +1031,8 @@ public: // Returns a scalar boolean value, which is true if any lane of its // (boolean) vector operands is true. It produces the reduced value across // all unrolled iterations. Unrolling will add all copies of its original - // operand as additional operands. + // operand as additional operands. AnyOf is poison-safe as all operands + // will be frozen. AnyOf, // Calculates the first active lane index of the vector predicate operands. // It produces the lane index across all unrolled iterations. Unrolling will @@ -1080,13 +1098,13 @@ private: #endif public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {}, - const Twine &Name = "") + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = {}, + const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) @@ -1479,7 +1497,8 @@ public: } VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - const VPIRFlags &Flags = {}, DebugLoc DL = {}) + const VPIRFlags &Flags = {}, + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL), VPIRMetadata(), Opcode(Opcode), ResultTy(ResultTy) { assert(flagsValidForOpcode(Opcode) && @@ -1537,7 +1556,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, ArrayRef<VPValue *> CallArguments, Type *Ty, - DebugLoc DL = {}) + DebugLoc DL = 
DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(CI.mayReadFromMemory()), @@ -1546,7 +1565,7 @@ public: VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef<VPValue *> CallArguments, Type *Ty, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { LLVMContext &Ctx = Ty->getContext(); @@ -1615,7 +1634,8 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags, public: VPWidenCallRecipe(Value *UV, Function *Variant, - ArrayRef<VPValue *> CallArguments, DebugLoc DL = {}) + ArrayRef<VPValue *> CallArguments, + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, *cast<Instruction>(UV)), VPIRMetadata(*cast<Instruction>(UV)), Variant(Variant) { @@ -1644,10 +1664,8 @@ public: return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); } - operand_range args() { return make_range(op_begin(), std::prev(op_end())); } - const_operand_range args() const { - return make_range(op_begin(), std::prev(op_end())); - } + operand_range args() { return drop_end(operands()); } + const_operand_range args() const { return drop_end(operands()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -1667,7 +1685,7 @@ class VPHistogramRecipe : public VPRecipeBase { public: VPHistogramRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {} ~VPHistogramRecipe() override = default; @@ -1998,6 +2016,9 @@ public: return getOperand(1); } + /// Update the incoming value from the loop backedge. 
+ void setBackedgeValue(VPValue *V) { setOperand(1, V); } + /// Returns the backedge value as a recipe. The backedge value is guaranteed /// to be a recipe. virtual VPRecipeBase &getBackedgeRecipe() { @@ -2229,8 +2250,8 @@ protected: public: /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and /// debug location \p DL. - VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {}, - const Twine &Name = "") + VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL), Name(Name.str()) { if (Start) @@ -2381,9 +2402,8 @@ public: } VPBlendRecipe *clone() override { - SmallVector<VPValue *> Ops(operands()); - return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), Ops, - getDebugLoc()); + return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), + operands(), getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPBlendSC) @@ -2409,6 +2429,12 @@ public: return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized()); } + /// Set mask number \p Idx to \p V. + void setMask(unsigned Idx, VPValue *V) { + assert((Idx > 0 || !isNormalized()) && "First index has no mask!"); + Idx == 0 ? setOperand(1, V) : setOperand(Idx * 2 + !isNormalized(), V); + } + void execute(VPTransformState &State) override { llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends"); } @@ -2434,12 +2460,13 @@ public: } }; -/// VPInterleaveRecipe is a recipe for transforming an interleave group of load -/// or stores into one wide load/store and shuffles. The first operand of a -/// VPInterleave recipe is the address, followed by the stored values, followed -/// by an optional mask. -class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase, - public VPIRMetadata { +/// A common base class for interleaved memory operations. 
+/// An Interleaved memory operation is a memory access method that combines +/// multiple strided loads/stores into a single wide load/store with shuffles. +/// The first operand is the start address. The optional operands are, in order, +/// the stored values and the mask. +class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase, + public VPIRMetadata { const InterleaveGroup<Instruction> *IG; /// Indicates if the interleave group is in a conditional block and requires a @@ -2450,12 +2477,14 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase, /// unusued gaps can be loaded speculatively. bool NeedsMaskForGaps = false; -public: - VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, VPValue *Mask, - bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) - : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, DL), VPIRMetadata(MD), - IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) { +protected: + VPInterleaveBase(const unsigned char SC, + const InterleaveGroup<Instruction> *IG, + ArrayRef<VPValue *> Operands, + ArrayRef<VPValue *> StoredValues, VPValue *Mask, + bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) + : VPRecipeBase(SC, Operands, DL), VPIRMetadata(MD), IG(IG), + NeedsMaskForGaps(NeedsMaskForGaps) { // TODO: extend the masked interleaved-group support to reversed access. 
assert((!Mask || !IG->isReverse()) && "Reversed masked interleave-group not supported."); @@ -2473,14 +2502,19 @@ public: addOperand(Mask); } } - ~VPInterleaveRecipe() override = default; - VPInterleaveRecipe *clone() override { - return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(), - NeedsMaskForGaps, *this, getDebugLoc()); +public: + VPInterleaveBase *clone() override = 0; + + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPInterleaveSC || + R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC; } - VP_CLASSOF_IMPL(VPDef::VPInterleaveSC) + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && classof(R); + } /// Return the address accessed by this recipe. VPValue *getAddr() const { @@ -2490,48 +2524,130 @@ public: /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - // Mask is optional and therefore the last, currently 2nd operand. + // Mask is optional and the last operand. return HasMask ? getOperand(getNumOperands() - 1) : nullptr; } + /// Return true if the access needs a mask because of the gaps. + bool needsMaskForGaps() const { return NeedsMaskForGaps; } + + const InterleaveGroup<Instruction> *getInterleaveGroup() const { return IG; } + + Instruction *getInsertPos() const { return IG->getInsertPos(); } + + void execute(VPTransformState &State) override { + llvm_unreachable("VPInterleaveBase should not be instantiated."); + } + + /// Return the cost of this recipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + /// Returns true if the recipe only uses the first lane of operand \p Op. + virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0; + + /// Returns the number of stored operands of this interleave group. Returns 0 + /// for load interleave groups. 
+ virtual unsigned getNumStoreOperands() const = 0; + /// Return the VPValues stored by this interleave group. If it is a load /// interleave group, return an empty ArrayRef. ArrayRef<VPValue *> getStoredValues() const { - // The first operand is the address, followed by the stored values, followed - // by an optional mask. - return ArrayRef<VPValue *>(op_begin(), getNumOperands()) - .slice(1, getNumStoreOperands()); + return ArrayRef<VPValue *>(op_end() - + (getNumStoreOperands() + (HasMask ? 1 : 0)), + getNumStoreOperands()); + } +}; + +/// VPInterleaveRecipe is a recipe for transforming an interleave group of load +/// or stores into one wide load/store and shuffles. The first operand of a +/// VPInterleave recipe is the address, followed by the stored values, followed +/// by an optional mask. +class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase { +public: + VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, + ArrayRef<VPValue *> StoredValues, VPValue *Mask, + bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL) + : VPInterleaveBase(VPDef::VPInterleaveSC, IG, Addr, StoredValues, Mask, + NeedsMaskForGaps, MD, DL) {} + + ~VPInterleaveRecipe() override = default; + + VPInterleaveRecipe *clone() override { + return new VPInterleaveRecipe(getInterleaveGroup(), getAddr(), + getStoredValues(), getMask(), + needsMaskForGaps(), *this, getDebugLoc()); } + VP_CLASSOF_IMPL(VPDef::VPInterleaveSC) + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; - /// Return the cost of this VPInterleaveRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif - const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; } + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); + } - /// Returns the number of stored operands of this interleave group. Returns 0 - /// for load interleave groups. - unsigned getNumStoreOperands() const { - return getNumOperands() - (HasMask ? 2 : 1); + unsigned getNumStoreOperands() const override { + return getNumOperands() - (getMask() ? 2 : 1); } +}; + +/// A recipe for interleaved memory operations with vector-predication +/// intrinsics. The first operand is the address, the second operand is the +/// explicit vector length. Stored values and mask are optional operands. +class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { +public: + VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask) + : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(), + ArrayRef<VPValue *>({R.getAddr(), &EVL}), + R.getStoredValues(), Mask, R.needsMaskForGaps(), R, + R.getDebugLoc()) { + assert(!getInterleaveGroup()->isReverse() && + "Reversed interleave-group with tail folding is not supported."); + assert(!needsMaskForGaps() && "Interleaved access with gap mask is not " + "supported for scalable vector."); + } + + ~VPInterleaveEVLRecipe() override = default; + + VPInterleaveEVLRecipe *clone() override { + llvm_unreachable("cloning not implemented yet"); + } + + VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC) + + /// The VPValue of the explicit vector length. + VPValue *getEVL() const { return getOperand(1); } - /// The recipe only uses the first lane of the address. + /// Generate the wide load or store, and shuffles. 
+ void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// The recipe only uses the first lane of the address, and EVL operand. bool onlyFirstLaneUsed(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); + return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) || + Op == getEVL(); } - Instruction *getInsertPos() const { return IG->getInsertPos(); } + unsigned getNumStoreOperands() const override { + return getNumOperands() - (getMask() ? 3 : 2); + } }; /// A recipe to represent inloop reduction operations, performing a reduction on @@ -2561,14 +2677,14 @@ protected: public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = {}) + bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = {}) + bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} @@ -2686,7 +2802,7 @@ public: class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { public: VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp, - DebugLoc DL = {}) + DebugLoc DL = DebugLoc::getUnknown()) : VPReductionRecipe( VPDef::VPReductionEVLSC, R.getRecurrenceKind(), R.getFastMathFlags(), @@ -3537,7 +3653,8 @@ public: InductionOpcode(Opcode) {} 
VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV, - VPValue *Step, VPValue *VF, DebugLoc DL = {}) + VPValue *Step, VPValue *VF, + DebugLoc DL = DebugLoc::getUnknown()) : VPScalarIVStepsRecipe( IV, Step, VF, IndDesc.getInductionOpcode(), dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()) @@ -4142,7 +4259,7 @@ public: /// Returns an iterator range over all VFs of the plan. iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() const { - return {VFs.begin(), VFs.end()}; + return VFs; } bool hasScalarVFOnly() const { @@ -4299,9 +4416,8 @@ public: /// via the other early exit). bool hasEarlyExit() const { return count_if(ExitBlocks, - [](VPIRBasicBlock *EB) { - return EB->getNumPredecessors() != 0; - }) > 1 || + [](VPIRBasicBlock *EB) { return EB->hasPredecessors(); }) > + 1 || (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1); } @@ -4309,7 +4425,7 @@ public: /// that this relies on unneeded branches to the scalar tail loop being /// removed. bool hasScalarTail() const { - return !(getScalarPreheader()->getNumPredecessors() == 0 || + return !(!getScalarPreheader()->hasPredecessors() || getScalarPreheader()->getSinglePredecessor() == getEntry()); } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 747c6623aa22..d400ceff7797 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>( [this](const auto *R) { return inferScalarTypeForRecipe(R); }) - .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) { + .Case<VPInterleaveBase>([V](const auto *R) { // TODO: Use info from interleave group. 
return V->getUnderlyingValue()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 80b48de57b40..cef91c15dd87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -193,6 +193,9 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, } if (auto *SI = dyn_cast<SwitchInst>(Inst)) { + // Don't emit recipes for unconditional switch instructions. + if (SI->getNumCases() == 0) + continue; SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())}; for (auto Case : SI->cases()) Ops.push_back(getOrCreateVPOperand(Case.getCaseValue())); @@ -538,8 +541,7 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, } void VPlanTransforms::handleEarlyExits(VPlan &Plan, - bool HasUncountableEarlyExit, - VFRange &Range) { + bool HasUncountableEarlyExit) { auto *MiddleVPBB = cast<VPBasicBlock>( Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); @@ -559,8 +561,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan, assert(!HandledUncountableEarlyExit && "can handle exactly one uncountable early exit"); handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan, - cast<VPBasicBlock>(HeaderVPB), LatchVPBB, - Range); + cast<VPBasicBlock>(HeaderVPB), LatchVPBB); HandledUncountableEarlyExit = true; } else { for (VPRecipeBase &R : EB->phis()) @@ -671,6 +672,90 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, } } +void VPlanTransforms::addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, + bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, + const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) { + // Generate code to check if the loop's trip count 
is less than VF * UF, or + // equal to it in case a scalar epilogue is required; this implies that the + // vector trip count is zero. This check also covers the case where adding one + // to the backedge-taken count overflowed leading to an incorrect trip count + // of zero. In this case we will also jump to the scalar loop. + CmpInst::Predicate CmpPred = + RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + // If tail is to be folded, vector loop takes care of all iterations. + VPValue *TripCountVPV = Plan.getTripCount(); + const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE); + Type *TripCountTy = TripCount->getType(); + auto GetMinTripCount = [&]() -> const SCEV * { + // Compute max(MinProfitableTripCount, UF * VF) and return it. + const SCEV *VFxUF = + SE.getElementCount(TripCountTy, (VF * UF), SCEV::FlagNUW); + if (UF * VF.getKnownMinValue() >= + MinProfitableTripCount.getKnownMinValue()) { + // TODO: SCEV should be able to simplify test. + return VFxUF; + } + const SCEV *MinProfitableTripCountSCEV = + SE.getElementCount(TripCountTy, MinProfitableTripCount, SCEV::FlagNUW); + return SE.getUMaxExpr(MinProfitableTripCountSCEV, VFxUF); + }; + + VPBasicBlock *EntryVPBB = Plan.getEntry(); + VPBuilder Builder(EntryVPBB); + VPValue *TripCountCheck = Plan.getFalse(); + const SCEV *Step = GetMinTripCount(); + if (TailFolded) { + if (CheckNeededWithTailFolding) { + // vscale is not necessarily a power-of-2, which means we cannot guarantee + // an overflow to zero when updating induction variables and so an + // additional overflow check is required before entering the vector loop. + + // Get the maximum unsigned value for the type. 
+ VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( + TripCountTy, cast<IntegerType>(TripCountTy)->getMask())); + VPValue *DistanceToMax = Builder.createNaryOp( + Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, + DebugLoc::getUnknown()); + + // Don't execute the vector loop if (UMax - n) < (VF * UF). + // FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF, + // minProfitableTripCount). + TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax, + Builder.createExpandSCEV(Step), DL); + } else { + // TripCountCheck = false, folding tail implies positive vector trip + // count. + } + } else { + // TODO: Emit unconditional branch to vector preheader instead of + // conditional branch with known condition. + TripCount = SE.applyLoopGuards(TripCount, OrigLoop); + // Check if the trip count is < the step. + if (SE.isKnownPredicate(CmpPred, TripCount, Step)) { + // TODO: Ensure step is at most the trip count when determining max VF and + // UF, w/o tail folding. + TripCountCheck = Plan.getTrue(); + } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(CmpPred), + TripCount, Step)) { + // Generate the minimum iteration check only if we cannot prove the + // check is known to be true, or known to be false. + VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step); + TripCountCheck = Builder.createICmp( + CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check"); + } // else step known to be < trip count, use TripCountCheck preset to false. 
+ } + VPInstruction *Term = + Builder.createNaryOp(VPInstruction::BranchOnCond, {TripCountCheck}, DL); + if (MinItersBypassWeights) { + MDBuilder MDB(Plan.getContext()); + MDNode *BranchWeights = MDB.createBranchWeights( + ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false); + Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + } +} + bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>( diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 1ec6ae677374..109156c1469c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -145,6 +145,16 @@ inline int_pred_ty<is_all_ones> m_AllOnes() { return int_pred_ty<is_all_ones>(); } +struct is_zero_int { + bool isValue(const APInt &C) const { return C.isZero(); } +}; + +/// Match an integer 0 or a vector with all elements equal to 0. +/// For vectors, this includes constants with undefined elements. 
+inline int_pred_ty<is_zero_int> m_ZeroInt() { + return int_pred_ty<is_zero_int>(); +} + /// Matching combinators template <typename LTy, typename RTy> struct match_combine_or { LTy L; @@ -218,9 +228,12 @@ struct Recipe_match { if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...)) return false; - assert(R->getNumOperands() == std::tuple_size<Ops_t>::value && - "recipe with matched opcode does not have the expected number of " - "operands"); + if (R->getNumOperands() != std::tuple_size<Ops_t>::value) { + assert(Opcode == Instruction::PHI && + "non-variadic recipe with matched opcode does not have the " + "expected number of operands"); + return false; + } auto IdxSeq = std::make_index_sequence<std::tuple_size<Ops_t>::value>(); if (all_of_tuple_elements(IdxSeq, [R](auto Op, unsigned Idx) { @@ -302,14 +315,21 @@ m_Broadcast(const Op0_t &Op0) { } template <typename Op0_t> +inline VPInstruction_match<VPInstruction::ExplicitVectorLength, Op0_t> +m_EVL(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::ExplicitVectorLength>(Op0); +} + +template <typename Op0_t> inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t> m_ExtractLastElement(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0); } -template <typename Op0_t, typename Op1_t> -inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t> -m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) { - return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1); + +template <typename Op0_t, typename Op1_t, typename Op2_t> +inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t> +m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { + return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2); } template <typename Op0_t, typename Op1_t> @@ -345,6 +365,12 @@ m_ZExtOrSExt(const Op0_t &Op0) { return m_CombineOr(m_ZExt(Op0), m_SExt(Op0)); } +template <typename Op0_t> +inline 
match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>, Op0_t> +m_ZExtOrSelf(const Op0_t &Op0) { + return m_CombineOr(m_ZExt(Op0), Op0); +} + template <unsigned Opcode, typename Op0_t, typename Op1_t> inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0, const Op1_t &Op1) { @@ -381,6 +407,13 @@ m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } +/// Match a binary AND operation. +template <typename Op0_t, typename Op1_t> +inline AllRecipe_commutative_match<Instruction::And, Op0_t, Op1_t> +m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1) { + return m_c_Binary<Instruction::And, Op0_t, Op1_t>(Op0, Op1); +} + /// Match a binary OR operation. Note that while conceptually the operands can /// be matched commutatively, \p Commutative defaults to false in line with the /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index cdadc33e3088..0c27d535b680 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -14,11 +14,13 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanCFG.h" +#include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; +using namespace VPlanPatternMatch; namespace { class VPPredicator { @@ -246,6 +248,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { "Distinct incoming values with one having a full mask"); break; } + OperandsWithMask.push_back(EdgeMask); } PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c4fdcccc6d62..bf5148954309 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ 
b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const { return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); + case VPInterleaveEVLSC: case VPInterleaveSC: - return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; + return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: case VPWidenStoreSC: return true; @@ -142,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return false; } default: + // FIXME: Return false if the recipe represents an interleaved store. return true; } } @@ -183,6 +185,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPInterleaveEVLSC: case VPInterleaveSC: return mayWriteToMemory(); case VPWidenLoadEVLSC: @@ -255,7 +258,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { Instruction *UI = nullptr; if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); - else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) + else if (auto *IG = dyn_cast<VPInterleaveBase>(this)) UI = IG->getInsertPos(); else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) UI = &WidenMem->getIngredient(); @@ -389,6 +392,42 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPIRFlags::intersectFlags(const VPIRFlags &Other) { + assert(OpType == Other.OpType && "OpType must match"); + switch (OpType) { + case OperationType::OverflowingBinOp: + WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; + WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; + break; + case OperationType::Trunc: + TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; + TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; + break; + case OperationType::DisjointOp: + DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; + break; + 
case OperationType::PossiblyExactOp: + ExactFlags.IsExact &= Other.ExactFlags.IsExact; + break; + case OperationType::GEPOp: + GEPFlags &= Other.GEPFlags; + break; + case OperationType::FPMathOp: + FMFs.NoNaNs &= Other.FMFs.NoNaNs; + FMFs.NoInfs &= Other.FMFs.NoInfs; + break; + case OperationType::NonNegOp: + NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; + break; + case OperationType::Cmp: + assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate"); + break; + case OperationType::Other: + assert(AllFlags == Other.AllFlags && "Cannot drop other flags"); + break; + } +} + FastMathFlags VPIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -471,7 +510,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case Instruction::ICmp: case Instruction::FCmp: case Instruction::Store: - case VPInstruction::ActiveLaneMask: case VPInstruction::BranchOnCount: case VPInstruction::ComputeReductionResult: case VPInstruction::FirstOrderRecurrenceSplice: @@ -481,6 +519,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::WideIVStep: return 2; case Instruction::Select: + case VPInstruction::ActiveLaneMask: case VPInstruction::ComputeAnyOfResult: case VPInstruction::ReductionStartVector: return 3; @@ -620,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) { Name); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = VectorType::get(Int1Ty, State.VF); + auto PredTy = VectorType::get( + Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue()) + ->getZExtValue()); return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, Name); @@ -875,9 +916,9 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); } case VPInstruction::AnyOf: { - Value *Res = 
State.get(getOperand(0)); + Value *Res = Builder.CreateFreeze(State.get(getOperand(0))); for (VPValue *Op : drop_begin(operands())) - Res = Builder.CreateOr(Res, State.get(Op)); + Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op))); return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } case VPInstruction::ExtractLane: { @@ -919,8 +960,15 @@ Value *VPInstruction::generate(VPTransformState &State) { unsigned LastOpIdx = getNumOperands() - 1; Value *Res = nullptr; for (int Idx = LastOpIdx; Idx >= 0; --Idx) { - Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( - Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name); + Value *TrailingZeros = + State.VF.isScalar() + ? Builder.CreateZExt( + Builder.CreateICmpEQ(State.get(getOperand(Idx)), + Builder.getFalse()), + Builder.getInt64Ty()) + : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), + State.get(getOperand(Idx)), + true, Name); Value *Current = Builder.CreateAdd( Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); if (Res) { @@ -1027,8 +1075,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } switch (getOpcode()) { + case Instruction::Select: { + // TODO: It may be possible to improve this by analyzing where the + // condition operand comes from. + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + auto *CondTy = Ctx.Types.inferScalarType(getOperand(0)); + auto *VecTy = Ctx.Types.inferScalarType(getOperand(1)); + if (!vputils::onlyFirstLaneUsed(this)) { + CondTy = toVectorTy(CondTy, VF); + VecTy = toVectorTy(VecTy, VF); + } + return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred, + Ctx.CostKind); + } case Instruction::ExtractElement: case VPInstruction::ExtractLane: { + if (VF.isScalar()) { + // ExtractLane with VF=1 takes care of handling extracting across multiple + // parts. + return 0; + } + // Add on the cost of extracting the element. 
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, @@ -1040,8 +1107,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind); } case VPInstruction::FirstActiveLane: { + Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); + if (VF.isScalar()) + return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy), + CmpInst::ICMP_EQ, Ctx.CostKind); // Calculate the cost of determining the lane index. - auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + auto *PredTy = toVectorTy(ScalarTy, VF); IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx), {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); @@ -1060,7 +1132,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } case VPInstruction::ActiveLaneMask: { Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0)); - Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); + unsigned Multiplier = + cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue(); + Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier); IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, {ArgTy, ArgTy}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); @@ -1684,18 +1758,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { State.set(this, V); } -InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. +static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, + ArrayRef<const VPValue *> Operands, + const VPRecipeWithIRFlags &R, + ElementCount VF, + VPCostContext &Ctx) { // Some backends analyze intrinsic arguments to determine cost. 
Use the // underlying value for the operand if it has one. Otherwise try to use the // operand of the underlying call instruction, if there is one. Otherwise // clear Arguments. // TODO: Rework TTI interface to be independent of concrete IR values. SmallVector<const Value *> Arguments; - for (const auto &[Idx, Op] : enumerate(operands())) { + for (const auto &[Idx, Op] : enumerate(Operands)) { auto *V = Op->getUnderlyingValue(); if (!V) { - if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) { + if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) { Arguments.push_back(UI->getArgOperand(Idx)); continue; } @@ -1705,21 +1783,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); + Type *ScalarRetTy = Ctx.Types.inferScalarType(&R); + Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy; SmallVector<Type *> ParamTys; - for (unsigned I = 0; I != getNumOperands(); ++I) - ParamTys.push_back( - toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); + for (const VPValue *Op : Operands) { + ParamTys.push_back(VF.isVector() + ? toVectorTy(Ctx.Types.inferScalarType(Op), VF) + : Ctx.Types.inferScalarType(Op)); + } // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. - FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); + FastMathFlags FMF = + R.hasFastMathFlags() ? 
R.getFastMathFlags() : FastMathFlags(); IntrinsicCostAttributes CostAttrs( - VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, - dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()), + ID, RetTy, Arguments, ParamTys, FMF, + dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()), InstructionCost::getInvalid(), &Ctx.TLI); return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); } +InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + SmallVector<const VPValue *> ArgOps(operands()); + return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx); +} + StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } @@ -2110,8 +2198,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: - // More complex computation, let the legacy cost-model handle this for now. - return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF); + // If the div/rem operation isn't safe to speculate and requires + // predication, then the only way we can even create a vplan is to insert + // a select on the second input operand to ensure we use the value of 1 + // for the inactive lanes. The select will be costed separately. case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -2174,7 +2264,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { if (VF.isScalar()) return TTI::CastContextHint::Normal; - if (isa<VPInterleaveRecipe>(R)) + if (isa<VPInterleaveBase>(R)) return TTI::CastContextHint::Interleave; if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) return ReplicateRecipe->isPredicated() ? 
TTI::CastContextHint::Masked @@ -2756,10 +2846,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)); assert(RedTy->isIntegerTy() && "VPExpressionRecipe only supports integer types currently."); + unsigned Opcode = RecurrenceDescriptor::getOpcode( + cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind()); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { - unsigned Opcode = RecurrenceDescriptor::getOpcode( - cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind()); return Ctx.TTI.getExtendedReductionCost( Opcode, cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == @@ -2767,13 +2857,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); } case ExpressionTypes::MulAccReduction: - return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind); + return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, + Ctx.CostKind); case ExpressionTypes::ExtMulAccReduction: return Ctx.TTI.getMulAccReductionCost( cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, - RedTy, SrcVecTy, Ctx.CostKind); + Opcode, RedTy, SrcVecTy, Ctx.CostKind); } llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); } @@ -3014,23 +3105,75 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // instruction cost. return 0; case Instruction::Call: { - if (!isSingleScalar()) { - // TODO: Handle remaining call costs here as well. 
- if (VF.isScalable()) - return InstructionCost::getInvalid(); - break; - } - auto *CalledFn = cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); - if (CalledFn->isIntrinsic()) - break; + SmallVector<const VPValue *> ArgOps(drop_end(operands())); SmallVector<Type *, 4> Tys; - for (VPValue *ArgOp : drop_end(operands())) + for (const VPValue *ArgOp : ArgOps) Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); + + if (CalledFn->isIntrinsic()) + // Various pseudo-intrinsics with costs of 0 are scalarized instead of + // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early. + switch (CalledFn->getIntrinsicID()) { + case Intrinsic::assume: + case Intrinsic::lifetime_end: + case Intrinsic::lifetime_start: + case Intrinsic::sideeffect: + case Intrinsic::pseudoprobe: + case Intrinsic::experimental_noalias_scope_decl: { + assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, + ElementCount::getFixed(1), Ctx) == 0 && + "scalarizing intrinsic should be free"); + return InstructionCost(0); + } + default: + break; + } + Type *ResultTy = Ctx.Types.inferScalarType(this); - return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + InstructionCost ScalarCallCost = + Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + if (isSingleScalar()) { + if (CalledFn->isIntrinsic()) + ScalarCallCost = std::min( + ScalarCallCost, + getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, + ElementCount::getFixed(1), Ctx)); + return ScalarCallCost; + } + + if (VF.isScalable()) + return InstructionCost::getInvalid(); + + // Compute the cost of scalarizing the result and operands if needed. 
+ InstructionCost ScalarizationCost = 0; + if (VF.isVector()) { + if (!ResultTy->isVoidTy()) { + for (Type *VectorTy : + to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) { + ScalarizationCost += Ctx.TTI.getScalarizationOverhead( + cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert=*/true, + /*Extract=*/false, Ctx.CostKind); + } + } + // Skip operands that do not require extraction/scalarization and do not + // incur any overhead. + SmallPtrSet<const VPValue *, 4> UniqueOperands; + Tys.clear(); + for (auto *Op : ArgOps) { + if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) || + !UniqueOperands.insert(Op).second) + continue; + Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF)); + } + ScalarizationCost += + Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind); + } + + return ScalarCallCost * VF.getFixedValue() + ScalarizationCost; } case Instruction::Add: case Instruction::Sub: @@ -3045,10 +3188,29 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: { + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx) * (isSingleScalar() ? 1 : VF.getFixedValue()); + case Instruction::Load: + case Instruction::Store: { + if (isSingleScalar()) { + bool IsLoad = UI->getOpcode() == Instruction::Load; + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 
0 : 1)); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); + return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); + } + // TODO: See getMemInstScalarizationCost for how to handle replicating and + // predicated cases. + break; } } @@ -3181,10 +3343,17 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, // TODO: Using the original IR may not be accurate. // Currently, ARM will use the underlying IR to calculate gather/scatter // instruction cost. - const Value *Ptr = getLoadStorePointerOperand(&Ingredient); - Type *PtrTy = toVectorTy(Ptr->getType(), VF); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + Type *PtrTy = Ptr->getType(); + + // If the address value is uniform across all lanes, then the address can be + // calculated with scalar type and broadcast. 
+ if (!vputils::isSingleScalar(getAddr())) + PtrTy = toVectorTy(PtrTy, VF); + return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, Ctx.CostKind) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, @@ -3532,9 +3701,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Lane && "Interleave group being replicated."); - assert((!NeedsMaskForGaps || !State.VF.isScalable()) && + assert((!needsMaskForGaps() || !State.VF.isScalable()) && "Masking gaps for scalable vectors is not yet supported."); - const InterleaveGroup<Instruction> *Group = IG; + const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); Instruction *Instr = Group->getInsertPos(); // Prepare for the vector type of the interleaved load/store. @@ -3574,7 +3743,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Vectorize the interleaved load group. if (isa<LoadInst>(Instr)) { Value *MaskForGaps = nullptr; - if (NeedsMaskForGaps) { + if (needsMaskForGaps()) { MaskForGaps = createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); @@ -3651,7 +3820,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Vectorize the interleaved store group. Value *MaskForGaps = createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); - assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) && + assert(((MaskForGaps != nullptr) == needsMaskForGaps()) && "Mismatch between NeedsMaskForGaps and MaskForGaps"); ArrayRef<VPValue *> StoredValues = getStoredValues(); // Collect the stored vector from each member. 
@@ -3702,6 +3871,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { + const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; @@ -3730,8 +3900,152 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +void VPInterleaveEVLRecipe::execute(VPTransformState &State) { + assert(!State.Lane && "Interleave group being replicated."); + assert(State.VF.isScalable() && + "Only support scalable VF for EVL tail-folding."); + assert(!needsMaskForGaps() && + "Masking gaps for scalable vectors is not yet supported."); + const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); + Instruction *Instr = Group->getInsertPos(); + + // Prepare for the vector type of the interleaved load/store. 
+ Type *ScalarTy = getLoadStoreType(Instr); + unsigned InterleaveFactor = Group->getFactor(); + assert(InterleaveFactor <= 8 && + "Unsupported deinterleave/interleave factor for scalable vectors"); + ElementCount WideVF = State.VF * InterleaveFactor; + auto *VecTy = VectorType::get(ScalarTy, WideVF); + + VPValue *Addr = getAddr(); + Value *ResAddr = State.get(Addr, VPLane(0)); + Value *EVL = State.get(getEVL(), VPLane(0)); + Value *InterleaveEVL = State.Builder.CreateMul( + EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl", + /* NUW= */ true, /* NSW= */ true); + LLVMContext &Ctx = State.Builder.getContext(); + + Value *GroupMask = nullptr; + if (VPValue *BlockInMask = getMask()) { + SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask)); + GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask"); + } else { + GroupMask = + State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue()); + } + + // Vectorize the interleaved load group. + if (isa<LoadInst>(Instr)) { + CallInst *NewLoad = State.Builder.CreateIntrinsic( + VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr, + "wide.vp.load"); + NewLoad->addParamAttr(0, + Attribute::getWithAlignment(Ctx, Group->getAlign())); + + applyMetadata(*NewLoad); + // TODO: Also manage existing metadata using VPIRMetadata. + Group->addMetadata(NewLoad); + + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + NewLoad = State.Builder.CreateIntrinsic( + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + + const DataLayout &DL = Instr->getDataLayout(); + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + // Skip the gaps in the group. 
+ if (!Member) + continue; + + Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); + } + + State.set(getVPValue(J), StridedVec); + ++J; + } + return; + } // End for interleaved load. + + // The sub vector type for current instruction. + auto *SubVT = VectorType::get(ScalarTy, State.VF); + // Vectorize the interleaved store group. + ArrayRef<VPValue *> StoredValues = getStoredValues(); + // Collect the stored vector from each member. + SmallVector<Value *, 4> StoredVecs; + const DataLayout &DL = Instr->getDataLayout(); + for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) { + Instruction *Member = Group->getMember(I); + // Skip the gaps in the group. + if (!Member) { + StoredVecs.push_back(PoisonValue::get(SubVT)); + continue; + } + + Value *StoredVec = State.get(StoredValues[StoredIdx]); + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) + StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); + + StoredVecs.push_back(StoredVec); + ++StoredIdx; + } + + // Interleave all the smaller vectors into one wider vector. + Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); + CallInst *NewStore = + State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store, + {IVec, ResAddr, GroupMask, InterleaveEVL}); + NewStore->addParamAttr(1, + Attribute::getWithAlignment(Ctx, Group->getAlign())); + + applyMetadata(*NewStore); + // TODO: Also manage existing metadata using VPIRMetadata. 
+ Group->addMetadata(NewStore); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + IG->getInsertPos()->printAsOperand(O, false); + O << ", "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", "; + getEVL()->printAsOperand(O, SlotTracker); + if (VPValue *Mask = getMask()) { + O << ", "; + Mask->printAsOperand(O, SlotTracker); + } + + unsigned OpIdx = 0; + for (unsigned i = 0; i < IG->getFactor(); ++i) { + if (!IG->getMember(i)) + continue; + if (getNumStoreOperands() > 0) { + O << "\n" << Indent << " vp.store "; + getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); + O << " to index " << i; + } else { + O << "\n" << Indent << " "; + getVPValue(OpIdx)->printAsOperand(O, SlotTracker); + O << " = vp.load from index " << i; + } + ++OpIdx; + } +} +#endif + +InstructionCost VPInterleaveBase::computeCost(ElementCount VF, + VPCostContext &Ctx) const { Instruction *InsertPos = getInsertPos(); // Find the VPValue index of the interleave group. We need to skip gaps. 
unsigned InsertPosIdx = 0; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e0bf241c73fd..2cac5557daee 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" @@ -39,6 +40,10 @@ using namespace llvm; using namespace VPlanPatternMatch; +cl::opt<bool> EnableWideActiveLaneMask( + "enable-wide-lane-mask", cl::init(false), cl::Hidden, + cl::desc("Enable use of wide get active lane mask instructions")); + bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlanPtr &Plan, function_ref<const InductionDescriptor *(PHINode *)> @@ -142,7 +147,7 @@ static bool sinkScalarOperands(VPlan &Plan) { for (VPValue *Op : Recipe.operands()) if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert(std::make_pair(VPBB, Def)); + WorkList.insert({VPBB, Def}); } } @@ -206,7 +211,7 @@ static bool sinkScalarOperands(VPlan &Plan) { for (VPValue *Op : SinkCandidate->operands()) if (auto *Def = dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert(std::make_pair(SinkTo, Def)); + WorkList.insert({SinkTo, Def}); Changed = true; } return Changed; @@ -344,7 +349,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, auto *BlockInMask = PredRecipe->getMask(); auto *MaskDef = BlockInMask->getDefiningRecipe(); auto *BOMRecipe = new VPBranchOnMaskRecipe( - BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc()); + BlockInMask, MaskDef ? 
MaskDef->getDebugLoc() : DebugLoc::getUnknown()); auto *Entry = Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); @@ -859,8 +864,8 @@ static VPValue *optimizeLatchExitInductionUser( Type *StepTy = TypeInfo.inferScalarType(Step); auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); return B.createPtrAdd(EndValue, - B.createNaryOp(Instruction::Sub, {Zero, Step}), {}, - "ind.escape"); + B.createNaryOp(Instruction::Sub, {Zero, Step}), + DebugLoc::getUnknown(), "ind.escape"); } if (ScalarTy->isFloatingPointTy()) { const auto &ID = WideIV->getInductionDescriptor(); @@ -910,10 +915,10 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { if (!ExpR) continue; - auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR}); - if (I.second) + const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR); + if (Inserted) continue; - ExpR->replaceAllUsesWith(I.first->second); + ExpR->replaceAllUsesWith(V->second); ExpR->eraseFromParent(); } } @@ -1067,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X // && (Y || Z) and (X || !X) into true. This requires queuing newly created // recipes to be visited during simplification. - VPValue *X, *Y; + VPValue *X, *Y, *Z; if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) { @@ -1076,13 +1081,37 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - // OR x, 1 -> 1. - if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) { - Def->replaceAllUsesWith(Def->getOperand(0) == X ? 
Def->getOperand(1) - : Def->getOperand(0)); - Def->eraseFromParent(); - return; - } + // x | 1 -> 1 + if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) + return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X)); + + // x | 0 -> x + if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) + return Def->replaceAllUsesWith(X); + + // x & 0 -> 0 + if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) + return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X)); + + // x && false -> false + if (match(Def, m_LogicalAnd(m_VPValue(X), m_False()))) + return Def->replaceAllUsesWith(Def->getOperand(1)); + + // (x && y) || (x && z) -> x && (y || z) + VPBuilder Builder(Def); + if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) && + // Simplify only if one of the operands has one use to avoid creating an + // extra recipe. + (!Def->getOperand(0)->hasMoreThanOneUniqueUser() || + !Def->getOperand(1)->hasMoreThanOneUniqueUser())) + return Def->replaceAllUsesWith( + Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); + + // x && !x -> 0 + if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + return Def->replaceAllUsesWith(Plan->getOrAddLiveIn( + ConstantInt::getFalse(VPTypeAnalysis(*Plan).inferScalarType(Def)))); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) return Def->replaceAllUsesWith(X); @@ -1096,6 +1125,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With + // tail folding it is likely that x is a header mask and can be simplified + // further. 
+ if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_VPValue(Z))) && + X->hasMoreThanOneUniqueUser()) + return Def->replaceAllUsesWith( + Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z))); + if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return Def->replaceAllUsesWith(A); @@ -1150,7 +1188,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { m_VPValue(X), m_SpecificInt(1)))) { Type *WideStepTy = TypeInfo.inferScalarType(Def); if (TypeInfo.inferScalarType(X) != WideStepTy) - X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy); + X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy); Def->replaceAllUsesWith(X); return; } @@ -1232,11 +1270,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - VPInstruction *OpVPI; - if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) && - OpVPI->isVectorToScalar()) { - Def->replaceAllUsesWith(OpVPI); - return; + if (match(Def, + m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) && + vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) { + return U->usesScalars(A) || Def == U; + })) { + return Def->replaceAllUsesWith(A); } } @@ -1269,11 +1308,29 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { continue; auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R); + if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) && + vputils::isSingleScalar(RepR->getOperand(1))) { + auto *Clone = new VPReplicateRecipe( + RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(), + true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/); + Clone->insertBefore(RepOrWidenR); + auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement, + {Clone->getOperand(0)}); + Ext->insertBefore(Clone); + Clone->setOperand(0, Ext); + RepR->eraseFromParent(); + continue; + } + // Skip recipes that aren't single scalars or don't have only their // scalar results used. 
In the latter case, we would introduce extra // broadcasts. if (!vputils::isSingleScalar(RepOrWidenR) || - !vputils::onlyScalarValuesUsed(RepOrWidenR)) + !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) { + return U->usesScalars(RepOrWidenR) || + match(cast<VPRecipeBase>(U), + m_ExtractLastElement(m_VPValue())); + })) continue; auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(), @@ -1285,6 +1342,23 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { } } +/// Try to see if all of \p Blend's masks share a common value logically and'ed +/// and remove it from the masks. +static void removeCommonBlendMask(VPBlendRecipe *Blend) { + if (Blend->isNormalized()) + return; + VPValue *CommonEdgeMask; + if (!match(Blend->getMask(0), + m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue()))) + return; + for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++) + if (!match(Blend->getMask(I), + m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue()))) + return; + for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++) + Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1)); +} + /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes /// to make sure the masks are simplified. static void simplifyBlends(VPlan &Plan) { @@ -1295,6 +1369,8 @@ static void simplifyBlends(VPlan &Plan) { if (!Blend) continue; + removeCommonBlendMask(Blend); + // Try to remove redundant blend recipes. SmallPtrSet<VPValue *, 4> UniqueValues; if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) @@ -1467,6 +1543,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C); } +/// Try to replace multiple active lane masks used for control flow with +/// a single, wide active lane mask instruction followed by multiple +/// extract subvector intrinsics. 
This applies to the active lane mask +/// instructions both in the loop and in the preheader. +/// Incoming values of all ActiveLaneMaskPHIs are updated to use the +/// new extracts from the first active lane mask, which has its last +/// operand (multiplier) set to UF. +static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, + unsigned UF) { + if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1) + return false; + + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); + auto *Term = &ExitingVPBB->back(); + + using namespace llvm::VPlanPatternMatch; + if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( + m_VPValue(), m_VPValue(), m_VPValue()))))) + return false; + + auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); + LLVMContext &Ctx = Plan.getContext(); + + auto ExtractFromALM = [&](VPInstruction *ALM, + SmallVectorImpl<VPValue *> &Extracts) { + DebugLoc DL = ALM->getDebugLoc(); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<VPValue *> Ops; + Ops.append({ALM, Plan.getOrAddLiveIn( + ConstantInt::get(IntegerType::getInt64Ty(Ctx), + VF.getKnownMinValue() * Part))}); + auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, + IntegerType::getInt1Ty(Ctx), DL); + Extracts[Part] = Ext; + Ext->insertAfter(ALM); + } + }; + + // Create a list of each active lane mask phi, ordered by unroll part. 
+ SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr); + for (VPRecipeBase &R : Header->phis()) { + auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R); + if (!Phi) + continue; + VPValue *Index = nullptr; + match(Phi->getBackedgeValue(), + m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue())); + assert(Index && "Expected index from ActiveLaneMask instruction"); + + auto *II = dyn_cast<VPInstruction>(Index); + if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) { + auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue()); + Phis[Part->getZExtValue()] = Phi; + } else + // Anything other than a CanonicalIVIncrementForPart is part 0 + Phis[0] = Phi; + } + + assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) && + "Expected one VPActiveLaneMaskPHIRecipe for each unroll part"); + + auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue()); + auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue()); + + assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask && + LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) && + "Expected incoming values of Phi to be ActiveLaneMasks"); + + // When using wide lane masks, the return type of the get.active.lane.mask + // intrinsic is VF x UF (last operand). + VPValue *ALMMultiplier = + Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + EntryALM->setOperand(2, ALMMultiplier); + LoopALM->setOperand(2, ALMMultiplier); + + // Create UF x extract vectors and insert into preheader. + SmallVector<VPValue *> EntryExtracts(UF); + ExtractFromALM(EntryALM, EntryExtracts); + + // Create UF x extract vectors and insert before the loop compare & branch, + // updating the compare to use the first extract. 
+ SmallVector<VPValue *> LoopExtracts(UF); + ExtractFromALM(LoopALM, LoopExtracts); + VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0)); + Not->setOperand(0, LoopExtracts[0]); + + // Update the incoming values of active lane mask phis. + for (unsigned Part = 0; Part < UF; ++Part) { + Phis[Part]->setStartValue(EntryExtracts[Part]); + Phis[Part]->setBackedgeValue(LoopExtracts[Part]); + } + + return true; +} + /// Try to simplify the branch condition of \p Plan. This may restrict the /// resulting plan to \p BestVF and \p BestUF. static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, @@ -1478,8 +1650,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, VPValue *Cond; ScalarEvolution &SE = *PSE.getSE(); if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || - match(Term, m_BranchOnCond( - m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) { + match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( + m_VPValue(), m_VPValue(), m_VPValue()))))) { // Try to simplify the branch condition if TC <= VF * UF when the latch // terminator is BranchOnCount or BranchOnCond where the input is // Not(ActiveLaneMask). 
@@ -1558,8 +1730,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); - bool MadeChange = - simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); + bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF); + MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); if (MadeChange) { @@ -1792,6 +1964,110 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } +namespace { +struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { + static bool isSentinel(const VPSingleDefRecipe *Def) { + return Def == getEmptyKey() || Def == getTombstoneKey(); + } + + /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. + /// Returns an optional pair, where the first element indicates whether it is + /// an intrinsic ID. + static std::optional<std::pair<bool, unsigned>> + getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { + return TypeSwitch<const VPSingleDefRecipe *, + std::optional<std::pair<bool, unsigned>>>(R) + .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, + VPWidenSelectRecipe, VPReplicateRecipe>( + [](auto *I) { return std::make_pair(false, I->getOpcode()); }) + .Case<VPWidenIntrinsicRecipe>([](auto *I) { + return std::make_pair(true, I->getVectorIntrinsicID()); + }) + .Default([](auto *) { return std::nullopt; }); + } + + /// Returns true if recipe \p Def can be safely handled for CSE. + static bool canHandle(const VPSingleDefRecipe *Def) { + // We can extend the list of handled recipes in the future, + // provided we account for the data embedded in them while checking for + // equality or hashing. 
+ auto C = getOpcodeOrIntrinsicID(Def); + + // The issue with (Insert|Extract)Value is that the index of the + // insert/extract is not a proper operand in LLVM IR, and hence also not in + // VPlan. + if (!C || (!C->first && (C->second == Instruction::InsertValue || + C->second == Instruction::ExtractValue))) + return false; + + // During CSE, we can only handle recipes that don't read from memory: if + // they read from memory, there could be an intervening write to memory + // before the next instance is CSE'd, leading to an incorrect result. + return !Def->mayReadFromMemory(); + } + + /// Hash the underlying data of \p Def. + static unsigned getHashValue(const VPSingleDefRecipe *Def) { + const VPlan *Plan = Def->getParent()->getPlan(); + VPTypeAnalysis TypeInfo(*Plan); + hash_code Result = hash_combine( + Def->getVPDefID(), getOpcodeOrIntrinsicID(Def), + TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def), + hash_combine_range(Def->operands())); + if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def)) + if (RFlags->hasPredicate()) + return hash_combine(Result, RFlags->getPredicate()); + return Result; + } + + /// Check equality of underlying data of \p L and \p R. 
+ static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) { + if (isSentinel(L) || isSentinel(R)) + return L == R; + if (L->getVPDefID() != R->getVPDefID() || + getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) || + vputils::isSingleScalar(L) != vputils::isSingleScalar(R) || + !equal(L->operands(), R->operands())) + return false; + if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L)) + if (LFlags->hasPredicate() && + LFlags->getPredicate() != + cast<VPRecipeWithIRFlags>(R)->getPredicate()) + return false; + const VPlan *Plan = L->getParent()->getPlan(); + VPTypeAnalysis TypeInfo(*Plan); + return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R); + } +}; +} // end anonymous namespace + +/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p +/// Plan. +void VPlanTransforms::cse(VPlan &Plan) { + VPDominatorTree VPDT(Plan); + DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap; + + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_deep(Plan.getEntry()))) { + for (VPRecipeBase &R : *VPBB) { + auto *Def = dyn_cast<VPSingleDefRecipe>(&R); + if (!Def || !VPCSEDenseMapInfo::canHandle(Def)) + continue; + if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) { + // V must dominate Def for a valid replacement. + if (!VPDT.dominates(V->getParent(), VPBB)) + continue; + // Only keep flags present on both V and Def. + if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V)) + RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def)); + Def->replaceAllUsesWith(V); + continue; + } + CSEMap[Def] = Def; + } + } +} + /// Move loop-invariant recipes out of the vector loop region in \p Plan. 
static void licm(VPlan &Plan) { VPBasicBlock *Preheader = Plan.getVectorPreheader(); @@ -1953,10 +2229,10 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeRedundantInductionCasts, Plan); runPass(simplifyRecipes, Plan); - runPass(simplifyBlends, Plan); runPass(removeDeadRecipes, Plan); - runPass(narrowToSingleScalarRecipes, Plan); + runPass(simplifyBlends, Plan); runPass(legalizeAndOptimizeInductions, Plan); + runPass(narrowToSingleScalarRecipes, Plan); runPass(removeRedundantExpandSCEVRecipes, Plan); runPass(simplifyRecipes, Plan); runPass(removeBranchOnConst, Plan); @@ -2042,13 +2318,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - auto *EntryALM = - Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, - DL, "active.lane.mask.entry"); + VPValue *ALMMultiplier = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, + {EntryIncrement, TC, ALMMultiplier}, DL, + "active.lane.mask.entry"); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. 
- auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + auto *LaneMaskPhi = + new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown()); LaneMaskPhi->insertAfter(CanonicalIVPHI); // Create the active lane mask for the next iteration of the loop before the @@ -2059,8 +2338,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, {IncrementValue}, {false, false}, DL); auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, - {InLoopIncrement, TripCount}, DL, - "active.lane.mask.next"); + {InLoopIncrement, TripCount, ALMMultiplier}, + DL, "active.lane.mask.next"); LaneMaskPhi->addOperand(ALM); // Replace the original terminator with BranchOnCond. We have to invert the @@ -2077,12 +2356,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( /// for the header-mask pattern manually. static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) { SmallVector<VPValue *> WideCanonicalIVs; - auto *FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), + IsaPred<VPWidenCanonicalIVRecipe>); assert(count_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <= - 1 && + IsaPred<VPWidenCanonicalIVRecipe>) <= 1 && "Must have at most one VPWideCanonicalIVRecipe"); if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { auto *WideCanonicalIV = @@ -2125,9 +2402,8 @@ void VPlanTransforms::addActiveLaneMask( "DataAndControlFlowWithoutRuntimeCheck implies " "UseActiveLaneMaskForControlFlow"); - auto *FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), + 
IsaPred<VPWidenCanonicalIVRecipe>); assert(FoundWidenCanonicalIVUser && "Must have widened canonical IV when tail folding!"); VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan); @@ -2139,9 +2415,12 @@ void VPlanTransforms::addActiveLaneMask( Plan, DataAndControlFlowWithoutRuntimeCheck); } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); - LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, - {WideCanonicalIV, Plan.getTripCount()}, nullptr, - "active.lane.mask"); + VPValue *ALMMultiplier = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + LaneMask = + B.createNaryOp(VPInstruction::ActiveLaneMask, + {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, + nullptr, "active.lane.mask"); } // Walk users of WideCanonicalIV and replace the header mask of the form @@ -2205,6 +2484,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPValue *NewAddr = GetNewAddr(S->getAddr()); return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); }) + .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { + VPValue *NewMask = GetNewMask(IR->getMask()); + return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); + }) .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { VPValue *NewMask = GetNewMask(Red->getCondOp()); return new VPReductionEVLRecipe(*Red, EVL, NewMask); @@ -2271,11 +2554,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPBuilder Builder(LoopRegion->getPreheaderVPBB()); MaxEVL = Builder.createScalarZExtOrTrunc( MaxEVL, Type::getInt32Ty(Plan.getContext()), - TypeInfo.inferScalarType(MaxEVL), DebugLoc()); + TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown()); Builder.setInsertPoint(Header, Header->getFirstNonPhi()); - VPValue *PrevEVL = - Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl"); + VPValue *PrevEVL = Builder.createScalarPhi( + {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( 
vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) { @@ -2327,16 +2610,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { if (!EVLRecipe) continue; - [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); + unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); assert(NumDefVal == CurRecipe->getNumDefinedValues() && "New recipe must define the same number of values as the " "original."); - assert(NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); + if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>( + EVLRecipe)) { + for (unsigned I = 0; I < NumDefVal; ++I) { + VPValue *CurVPV = CurRecipe->getVPValue(I); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I)); + } } ToErase.push_back(CurRecipe); } @@ -2404,7 +2688,7 @@ void VPlanTransforms::addExplicitVectorLength( VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. 
- auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); + auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown()); EVLPhi->insertAfter(CanonicalIVPHI); VPBuilder Builder(Header, Header->getFirstNonPhi()); // Create the AVL (application vector length), starting from TC -> 0 in steps @@ -2418,10 +2702,11 @@ void VPlanTransforms::addExplicitVectorLength( VPValue *AVLSafe = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); - AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl"); + AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), + "safe_avl"); } auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, - DebugLoc()); + DebugLoc::getUnknown()); auto *CanonicalIVIncrement = cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); @@ -2473,6 +2758,22 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { VPBasicBlock *HeaderVPBB = EVLPhi->getParent(); VPValue *EVLIncrement = EVLPhi->getBackedgeValue(); + VPValue *AVL; + [[maybe_unused]] bool FoundAVL = + match(EVLIncrement, + m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))); + assert(FoundAVL && "Didn't find AVL?"); + + // The AVL may be capped to a safe distance. + VPValue *SafeAVL; + if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue()))) + AVL = SafeAVL; + + VPValue *AVLNext; + [[maybe_unused]] bool FoundAVLNext = + match(AVL, m_VPInstruction<Instruction::PHI>( + m_Specific(Plan.getTripCount()), m_VPValue(AVLNext))); + assert(FoundAVLNext && "Didn't find AVL backedge?"); // Convert EVLPhi to concrete recipe. auto *ScalarR = @@ -2496,7 +2797,7 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { // Replace the use of VectorTripCount in the latch-exiting block. 
// Before: (branch-on-count EVLIVInc, VectorTripCount) - // After: (branch-on-count EVLIVInc, TripCount) + // After: (branch-on-cond eq AVLNext, 0) VPBasicBlock *LatchExiting = HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock(); @@ -2509,7 +2810,54 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { m_BranchOnCount(m_VPValue(EVLIncrement), m_Specific(&Plan.getVectorTripCount()))) && "Unexpected terminator in EVL loop"); - LatchExitingBr->setOperand(1, Plan.getTripCount()); + + Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); + VPBuilder Builder(LatchExitingBr); + VPValue *Cmp = + Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); + LatchExitingBr->eraseFromParent(); +} + +void VPlanTransforms::replaceSymbolicStrides( + VPlan &Plan, PredicatedScalarEvolution &PSE, + const DenseMap<Value *, const SCEV *> &StridesMap) { + // Replace VPValues for known constant strides guaranteed by predicate scalar + // evolution. + auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { + auto *R = cast<VPRecipeBase>(&U); + return R->getParent()->getParent() || + R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor(); + }; + for (const SCEV *Stride : StridesMap.values()) { + using namespace SCEVPatternMatch; + auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); + const APInt *StrideConst; + if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst))) + // Only handle constant strides for now. + continue; + + auto *CI = + Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) + StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); + + // The versioned value may not be used in the loop directly but through a + // sext/zext. Add new live-ins in those cases. 
+ for (Value *U : StrideV->users()) { + if (!isa<SExtInst, ZExtInst>(U)) + continue; + VPValue *StrideVPV = Plan.getLiveIn(U); + if (!StrideVPV) + continue; + unsigned BW = U->getType()->getScalarSizeInBits(); + APInt C = + isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW); + VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); + } + } } void VPlanTransforms::dropPoisonGeneratingRecipes( @@ -2785,8 +3133,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step); Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags); - Init = - Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction"); + Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, + DebugLoc::getUnknown(), "induction"); // Create the widened phi of the vector IV. auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr, @@ -2983,9 +3331,11 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { R->eraseFromParent(); } -void VPlanTransforms::handleUncountableEarlyExit( - VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, - VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) { +void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, + VPBasicBlock *EarlyExitVPBB, + VPlan &Plan, + VPBasicBlock *HeaderVPBB, + VPBasicBlock *LatchVPBB) { VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0]; if (!EarlyExitVPBB->getSinglePredecessor() && EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) { @@ -3038,13 +3388,7 @@ void VPlanTransforms::handleUncountableEarlyExit( } VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx); - auto IsVector = [](ElementCount VF) { return VF.isVector(); }; - // When the VFs are vectors, need to add `extract` to get the incoming value - // from early exit. 
When the range contains scalar VF, limit the range to - // scalar VF to prevent mis-compilation for the range containing both scalar - // and vector VFs. - if (!IncomingFromEarlyExit->isLiveIn() && - LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) { + if (!IncomingFromEarlyExit->isLiveIn()) { // Update the incoming value from the early exit. VPValue *FirstActiveLane = EarlyExitB.createNaryOp( VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr, @@ -3125,7 +3469,7 @@ static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); - if (Opcode != Instruction::Add) + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return nullptr; Type *RedTy = Ctx.Types.inferScalarType(Red); @@ -3140,8 +3484,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Type *SrcTy = Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF)); - InstructionCost MulAccCost = - Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind); + InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost( + isZExt, Opcode, RedTy, SrcVecTy, CostKind); InstructionCost MulCost = Mul->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); InstructionCost ExtCost = 0; @@ -3506,6 +3850,21 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { Plan.resetTripCount(Exp); ExpSCEV->eraseFromParent(); } + assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) && + "VPExpandSCEVRecipes must be at the beginning of the entry block, " + "after any VPIRInstructions"); + // Add IR instructions in the entry basic block but not in the VPIRBasicBlock + // to the VPIRBasicBlock. 
+ auto EI = Entry->begin(); + for (Instruction &I : drop_end(*EntryBB)) { + if (EI != Entry->end() && isa<VPIRInstruction>(*EI) && + &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) { + EI++; + continue; + } + VPIRInstruction::create(I)->insertBefore(*Entry, EI); + } + return ExpandedSCEVs; } @@ -3574,12 +3933,12 @@ static bool isAlreadyNarrow(VPValue *VPV) { void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); - if (VF.isScalable() || !VectorLoop) + if (!VectorLoop) return; VPTypeAnalysis TypeInfo(Plan); - unsigned FixedVF = VF.getFixedValue(); + unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { if (isa<VPCanonicalIVPHIRecipe>(&R) || @@ -3615,7 +3974,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, continue; // Bail out on non-consecutive interleave groups. - if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo, + if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo, VectorRegWidth)) return; @@ -3672,9 +4031,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, return; // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. 
- auto NarrowOp = [](VPValue *V) -> VPValue * { + SmallPtrSet<VPValue *, 4> NarrowedOps; + auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * { auto *R = V->getDefiningRecipe(); - if (!R) + if (!R || NarrowedOps.contains(V)) return V; if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { // Narrow interleave group to wide load, as transformed VPlan will only @@ -3684,6 +4044,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); L->insertBefore(LoadGroup); + NarrowedOps.insert(L); return L; } @@ -3691,6 +4052,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, assert(RepR->isSingleScalar() && isa<LoadInst>(RepR->getUnderlyingInstr()) && "must be a single scalar load"); + NarrowedOps.insert(RepR); return RepR; } auto *WideLoad = cast<VPWidenLoadRecipe>(R); @@ -3704,6 +4066,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, /*IsUniform*/ true, /*Mask*/ nullptr, *WideLoad); N->insertBefore(WideLoad); + NarrowedOps.insert(N); return N; }; @@ -3734,10 +4097,21 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // original iteration. 
auto *CanIV = Plan.getCanonicalIV(); auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue()); - Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get( - CanIV->getScalarType(), 1 * Plan.getUF()))); - Plan.getVF().replaceAllUsesWith( - Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + VPBuilder PHBuilder(Plan.getVectorPreheader()); + + VPValue *UF = Plan.getOrAddLiveIn( + ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + if (VF.isScalable()) { + VPValue *VScale = PHBuilder.createElementCount( + CanIV->getScalarType(), ElementCount::getScalable(1)); + VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); + Inc->setOperand(1, VScaleUF); + Plan.getVF().replaceAllUsesWith(VScale); + } else { + Inc->setOperand(1, UF); + Plan.getVF().replaceAllUsesWith( + Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 700b94621d5f..1957428fab79 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -62,16 +62,47 @@ struct VPlanTransforms { /// The created loop is wrapped in an initial skeleton to facilitate /// vectorization, consisting of a vector pre-header, an exit block for the /// main vector loop (middle.block) and a new block as preheader of the scalar - /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p - /// InductionTy and \p IVDL, and creates a VPValue expression for the original - /// trip count. + /// loop (scalar.ph). See below for an illustration. It also adds a canonical + /// IV and its increment, using \p InductionTy and \p IVDL, and creates a + /// VPValue expression for the original trip count. + /// + /// [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's + /// / \ old preheader. 
Will contain iteration number check and SCEV + /// | | expansions. + /// | | + /// / v + /// | [ ] <-- vector loop bypass (may consist of multiple blocks) will be + /// | / | added later. + /// | / v + /// || [ ] <-- vector pre header. + /// |/ | + /// | v + /// | [ ] \ <-- plain CFG loop wrapping original loop to be vectorized. + /// | [ ]_| + /// | | + /// | v + /// | [ ] <--- middle-block with the branch to successors + /// | / | + /// | / | + /// | | v + /// \--->[ ] <--- scalar preheader (initially a VPBasicBlock, which will be + /// | | replaced later by a VPIRBasicBlock wrapping the scalar + /// | | preheader basic block). + /// | | + /// v <-- edge from middle to exit iff epilogue is not required. + /// | [ ] \ + /// | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, + /// | | header wrapped in VPIRBasicBlock). + /// \ | + /// \ v + /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks. LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan> buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE); /// Update \p Plan to account for all early exits. - LLVM_ABI_FOR_TEST static void - handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range); + LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, + bool HasUncountableExit); /// If a check is needed to guard executing the scalar epilogue loop, it will /// be added to the middle block. @@ -79,6 +110,13 @@ struct VPlanTransforms { bool RequiresScalarEpilogueCheck, bool TailFolded); + // Create a check to \p Plan to see if the vector loop should be executed. 
+ static void addMinimumIterationCheck( + VPlan &Plan, ElementCount VF, unsigned UF, + ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, + bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, + const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE); + /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's /// flat CFG into a hierarchical CFG. LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan); @@ -161,6 +199,12 @@ struct VPlanTransforms { truncateToMinimalBitwidths(VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs); + /// Replace symbolic strides from \p StridesMap in \p Plan with constants when + /// possible. + static void + replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, + const DenseMap<Value *, const SCEV *> &StridesMap); + /// Drop poison flags from recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those /// recipes meet the following conditions: @@ -207,8 +251,7 @@ struct VPlanTransforms { static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, - VPBasicBlock *LatchVPBB, - VFRange &Range); + VPBasicBlock *LatchVPBB); /// Replace loop regions with explicit CFG. static void dissolveLoopRegions(VPlan &Plan); @@ -220,9 +263,10 @@ struct VPlanTransforms { /// variable vector lengths instead of fixed lengths. This transformation: /// * Makes EVL-Phi concrete. // * Removes CanonicalIV and increment. - /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc, - /// VectorTripCount) with variable-length stepping (branch-on-cond - /// EVLIVInc, TripCount). 
+ /// * Replaces the exit condition from + /// (branch-on-count CanonicalIVInc, VectorTripCount) + /// to + /// (branch-on-cond eq AVLNext, 0) static void canonicalizeEVLLoops(VPlan &Plan); /// Lower abstract recipes to concrete ones, that can be codegen'd. @@ -242,6 +286,9 @@ struct VPlanTransforms { /// removing dead edges to their successors. static void removeBranchOnConst(VPlan &Plan); + /// Perform common-subexpression-elimination on \p Plan. + static void cse(VPlan &Plan); + /// If there's a single exit block, optimize its phi recipes that use exiting /// IV values by feeding them precomputed end values instead, possibly taken /// one step backwards. diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 4bcde8cd5d42..443df167378b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -92,18 +92,18 @@ public: void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR, unsigned Part) { for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) { - auto Ins = VPV2Parts.insert({VPV, {}}); - assert(Ins.first->second.size() == Part - 1 && "earlier parts not set"); - Ins.first->second.push_back(CopyR->getVPValue(Idx)); + const auto &[V, _] = VPV2Parts.try_emplace(VPV); + assert(V->second.size() == Part - 1 && "earlier parts not set"); + V->second.push_back(CopyR->getVPValue(Idx)); } } /// Given a uniform recipe \p R, add it for all parts. 
void addUniformForAllParts(VPSingleDefRecipe *R) { - auto Ins = VPV2Parts.insert({R, {}}); - assert(Ins.second && "uniform value already added"); + const auto &[V, Inserted] = VPV2Parts.try_emplace(R); + assert(Inserted && "uniform value already added"); for (unsigned Part = 0; Part != UF; ++Part) - Ins.first->second.push_back(R); + V->second.push_back(R); } bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); } @@ -536,16 +536,9 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { VPBuilder Builder(RepR); if (RepR->getNumUsers() == 0) { - if (isa<StoreInst>(RepR->getUnderlyingInstr()) && - vputils::isSingleScalar(RepR->getOperand(1))) { - // Stores to invariant addresses need to store the last lane only. - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF), - Def2LaneDefs); - } else { - // Create single-scalar version of RepR for all lanes. - for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) - cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); - } + // Create single-scalar version of RepR for all lanes. 
+ for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) + cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs); RepR->eraseFromParent(); continue; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 700a733bf9f2..c6c1ef336982 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { VPValue *A, *B; using namespace VPlanPatternMatch; - if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B)))) + if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1)))) return B == Plan.getTripCount() && (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_SpecificInt(1), diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 9e1d325a4d8d..77c099b27171 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -49,6 +49,8 @@ inline bool isSingleScalar(const VPValue *VPV) { case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::FCmp: + case Instruction::Select: + case VPInstruction::Not: case VPInstruction::Broadcast: case VPInstruction::PtrAdd: return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 24f6d61512ef..85c6c2c8d796 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -38,7 +38,7 @@ struct VPDoubleValueDef; class VPSlotTracker; class VPUser; class VPRecipeBase; -class VPInterleaveRecipe; +class VPInterleaveBase; class VPPhiAccessors; // This is the base class of the VPlan Def/Use graph, used for modeling the data @@ -48,7 +48,7 @@ class VPPhiAccessors; class LLVM_ABI_FOR_TEST VPValue { friend class VPDef; friend struct VPDoubleValueDef; - friend class VPInterleaveRecipe; + friend class VPInterleaveBase; 
friend class VPlan; friend class VPExpressionRecipe; @@ -335,6 +335,7 @@ public: VPExpressionSC, VPIRInstructionSC, VPInstructionSC, + VPInterleaveEVLSC, VPInterleaveSC, VPReductionEVLSC, VPReductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index e25ffe135418..99f3bc367a54 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { } return VerifyEVLUse(*R, 2); }) - .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>( + .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe, + VPInterleaveEVLRecipe>( [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); }) .Case<VPInstructionWithType>( [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); }) @@ -412,7 +413,7 @@ bool VPlanVerifier::verifyRegion(const VPRegionBlock *Region) { const VPBlockBase *Exiting = Region->getExiting(); // Entry and Exiting shouldn't have any predecessor/successor, respectively. - if (Entry->getNumPredecessors() != 0) { + if (Entry->hasPredecessors()) { errs() << "region entry block has predecessors\n"; return false; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 092a3a87954f..17cb18a22336 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -99,6 +99,10 @@ private: InstructionWorklist Worklist; + /// Next instruction to iterate. It will be updated when it is erased by + /// RecursivelyDeleteTriviallyDeadInstructions. + Instruction *NextInst; + // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" // parameter. That should be updated to specific sub-classes because the // run loop was changed to dispatch on opcode. 
@@ -118,6 +122,7 @@ private: bool foldInsExtBinop(Instruction &I); bool foldInsExtVectorToShuffle(Instruction &I); bool foldBitOpOfCastops(Instruction &I); + bool foldBitOpOfCastConstant(Instruction &I); bool foldBitcastShuffle(Instruction &I); bool scalarizeOpOrCmp(Instruction &I); bool scalarizeVPIntrinsic(Instruction &I); @@ -169,13 +174,16 @@ private: // further folds that were hindered by OneUse limits. SmallPtrSet<Value *, 4> Visited; for (Value *Op : Ops) { - if (Visited.insert(Op).second) { + if (!Visited.contains(Op)) { if (auto *OpI = dyn_cast<Instruction>(Op)) { if (RecursivelyDeleteTriviallyDeadInstructions( - OpI, nullptr, nullptr, [this](Value *V) { - if (auto I = dyn_cast<Instruction>(V)) { + OpI, nullptr, nullptr, [&](Value *V) { + if (auto *I = dyn_cast<Instruction>(V)) { LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n'); Worklist.remove(I); + if (I == NextInst) + NextInst = NextInst->getNextNode(); + Visited.insert(I); } })) continue; @@ -862,14 +870,17 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { if (LHSSrc->getType() != RHSSrc->getType()) return false; - // Only handle vector types with integer elements - auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType()); - auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType()); - if (!SrcVecTy || !DstVecTy) + auto *SrcTy = LHSSrc->getType(); + auto *DstTy = I.getType(); + // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>. + // Other casts only handle vector types with integer elements. + if (CastOpcode != Instruction::BitCast && + (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy))) return false; - if (!SrcVecTy->getScalarType()->isIntegerTy() || - !DstVecTy->getScalarType()->isIntegerTy()) + // Only integer scalar/vector values are legal for bitwise logic operations. 
+ if (!SrcTy->getScalarType()->isIntegerTy() || + !DstTy->getScalarType()->isIntegerTy()) return false; // Cost Check : @@ -877,23 +888,21 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { // NewCost = bitlogic + cast // Calculate specific costs for each cast with instruction context - InstructionCost LHSCastCost = - TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy, - TTI::CastContextHint::None, CostKind, LHSCast); - InstructionCost RHSCastCost = - TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy, - TTI::CastContextHint::None, CostKind, RHSCast); + InstructionCost LHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast); + InstructionCost RHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast); InstructionCost OldCost = - TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) + + TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) + LHSCastCost + RHSCastCost; // For new cost, we can't provide an instruction (it doesn't exist yet) InstructionCost GenericCastCost = TTI.getCastInstrCost( - CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind); + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind); InstructionCost NewCost = - TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) + + TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) + GenericCastCost; // Account for multi-use casts using specific costs @@ -930,6 +939,102 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) { return true; } +/// Match: +// bitop(castop(x), C) -> +// bitop(castop(x), castop(InvC)) -> +// castop(bitop(x, InvC)) +// Supports: bitcast +bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) { + Instruction *LHS; + Constant *C; + + // Check if this is a bitwise logic operation + if (!match(&I, m_c_BitwiseLogic(m_Instruction(LHS), m_Constant(C)))) + return false; + + // Get the cast 
instructions + auto *LHSCast = dyn_cast<CastInst>(LHS); + if (!LHSCast) + return false; + + Instruction::CastOps CastOpcode = LHSCast->getOpcode(); + + // Only handle supported cast operations + switch (CastOpcode) { + case Instruction::BitCast: + break; + default: + return false; + } + + Value *LHSSrc = LHSCast->getOperand(0); + + auto *SrcTy = LHSSrc->getType(); + auto *DstTy = I.getType(); + // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>. + // Other casts only handle vector types with integer elements. + if (CastOpcode != Instruction::BitCast && + (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy))) + return false; + + // Only integer scalar/vector values are legal for bitwise logic operations. + if (!SrcTy->getScalarType()->isIntegerTy() || + !DstTy->getScalarType()->isIntegerTy()) + return false; + + // Find the constant InvC, such that castop(InvC) equals to C. + PreservedCastFlags RHSFlags; + Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags); + if (!InvC) + return false; + + // Cost Check : + // OldCost = bitlogic + cast + // NewCost = bitlogic + cast + + // Calculate specific costs for each cast with instruction context + InstructionCost LHSCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast); + + InstructionCost OldCost = + TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost; + + // For new cost, we can't provide an instruction (it doesn't exist yet) + InstructionCost GenericCastCost = TTI.getCastInstrCost( + CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind); + + InstructionCost NewCost = + TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) + + GenericCastCost; + + // Account for multi-use casts using specific costs + if (!LHSCast->hasOneUse()) + NewCost += LHSCastCost; + + LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost + << " NewCost=" << NewCost << "\n"); + + if (NewCost > OldCost) 
+ return false; + + // Create the operation on the source type + Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), + LHSSrc, InvC, I.getName() + ".inner"); + if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp)) + NewBinOp->copyIRFlags(&I); + + Worklist.pushValue(NewOp); + + // Create the cast operation directly to ensure we get a new instruction + Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType()); + + // Insert the new instruction + Value *Result = Builder.Insert(NewCast); + + replaceValue(I, *Result); + return true; +} + /// If this is a bitcast of a shuffle, try to bitcast the source vector to the /// destination type followed by shuffle. This can enable further transforms by /// moving bitcasts or shuffles together. @@ -1461,8 +1566,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::CastContextHint::None, CostKind, RedOp); CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost; - CostAfterReduction = - TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind); + CostAfterReduction = TTI.getMulAccReductionCost( + IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind); return; } CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy, @@ -3753,6 +3858,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { unsigned MaxVectorSize = TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); unsigned MaxElementsInVector = MaxVectorSize / ElementSize; + if (MaxElementsInVector == 0) + return false; // When there are multiple shufflevector operations on the same input, // especially when the vector length is larger than the register size, // identical shuffle patterns may occur across different groups of elements. 
@@ -4467,6 +4574,8 @@ bool VectorCombine::run() { case Instruction::Xor: if (foldBitOpOfCastops(I)) return true; + if (foldBitOpOfCastConstant(I)) + return true; break; case Instruction::PHI: if (shrinkPhiOfShuffles(I)) @@ -4519,13 +4628,21 @@ bool VectorCombine::run() { if (!DT.isReachableFromEntry(&BB)) continue; // Use early increment range so that we can erase instructions in loop. - for (Instruction &I : make_early_inc_range(BB)) { - if (I.isDebugOrPseudoInst()) - continue; - MadeChange |= FoldInst(I); + // make_early_inc_range is not applicable here, as the next iterator may + // be invalidated by RecursivelyDeleteTriviallyDeadInstructions. + // We manually maintain the next instruction and update it when it is about + // to be deleted. + Instruction *I = &BB.front(); + while (I) { + NextInst = I->getNextNode(); + if (!I->isDebugOrPseudoInst()) + MadeChange |= FoldInst(*I); + I = NextInst; } } + NextInst = nullptr; + while (!Worklist.isEmpty()) { Instruction *I = Worklist.removeOne(); if (!I) |
