diff options
68 files changed, 1636 insertions, 434 deletions
diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 613c03ce4ad6..379495c65882 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -1277,7 +1277,7 @@ public: }; // A structure to stand in for the recipe on a reduction. RecipeDecl is the -// 'main' declaration used for initializaiton, which is fixed. +// 'main' declaration used for initializaiton, which is fixed. struct OpenACCReductionRecipe { VarDecl *AllocaDecl; @@ -1297,45 +1297,72 @@ struct OpenACCReductionRecipe { // -For a struct without the operator, this will be 1 element per field, which // should be the combiner for that element. // -For an array of any of the above, it will be the above for the element. - llvm::SmallVector<CombinerRecipe, 1> CombinerRecipes; + // Note: These are necessarily stored in either Trailing Storage (when in the + // AST), or in a separate collection when being semantically analyzed. + llvm::ArrayRef<CombinerRecipe> CombinerRecipes; OpenACCReductionRecipe(VarDecl *A, llvm::ArrayRef<CombinerRecipe> Combiners) : AllocaDecl(A), CombinerRecipes(Combiners) {} bool isSet() const { return AllocaDecl; } - static OpenACCReductionRecipe Empty() { - return OpenACCReductionRecipe(/*AllocaDecl=*/nullptr, {}); +}; + +// A version of the above that is used for semantic analysis, at a time before +// the OpenACCReductionClause node has been created. This one has storage for +// the CombinerRecipe, since Trailing storage for it doesn't exist yet. +struct OpenACCReductionRecipeWithStorage : OpenACCReductionRecipe { + llvm::SmallVector<CombinerRecipe, 1> CombinerRecipeStorage; + + OpenACCReductionRecipeWithStorage(VarDecl *A, + llvm::ArrayRef<CombinerRecipe> Combiners) + : OpenACCReductionRecipe(A, {}), CombinerRecipeStorage(Combiners) { + CombinerRecipes = CombinerRecipeStorage; + } + static OpenACCReductionRecipeWithStorage Empty() { + return OpenACCReductionRecipeWithStorage(/*AllocaDecl=*/nullptr, {}); } }; class OpenACCReductionClause final : public OpenACCClauseWithVarList, private llvm::TrailingObjects<OpenACCReductionClause, Expr *, - OpenACCReductionRecipe> { + OpenACCReductionRecipe, + OpenACCReductionRecipe::CombinerRecipe> { friend TrailingObjects; OpenACCReductionOperator Op; OpenACCReductionClause(SourceLocation BeginLoc, SourceLocation LParenLoc, OpenACCReductionOperator Operator, ArrayRef<Expr *> VarList, - ArrayRef<OpenACCReductionRecipe> Recipes, + ArrayRef<OpenACCReductionRecipeWithStorage> Recipes, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::Reduction, BeginLoc, LParenLoc, EndLoc), Op(Operator) { - assert(VarList.size() == Recipes.size()); + assert(VarList.size() == Recipes.size()); setExprs(getTrailingObjects<Expr *>(VarList.size()), VarList); - llvm::uninitialized_copy(Recipes, - getTrailingObjects<OpenACCReductionRecipe>()); - } -public: - ~OpenACCReductionClause() { - for (unsigned I = 0; I < getExprs().size(); ++I) { - getTrailingObjects<OpenACCReductionRecipe>()[I].~OpenACCReductionRecipe(); + // Since we're using trailing storage on this node to store the 'combiner' + // recipes of the Reduction Recipes (which have a 1:M relationship), we need + // to ensure we get the ArrayRef of each of our combiner 'correct'. + OpenACCReductionRecipe::CombinerRecipe *CurCombinerLoc = + getTrailingObjects<OpenACCReductionRecipe::CombinerRecipe>(); + for (const auto &[Idx, R] : llvm::enumerate(Recipes)) { + + // ArrayRef to the 'correct' data location in trailing storage. + llvm::MutableArrayRef<OpenACCReductionRecipe::CombinerRecipe> + NewCombiners{CurCombinerLoc, R.CombinerRecipes.size()}; + CurCombinerLoc += R.CombinerRecipes.size(); + + llvm::uninitialized_copy(R.CombinerRecipes, NewCombiners.begin()); + + // Placement new into the correct location in trailng storage. + new (&getTrailingObjects<OpenACCReductionRecipe>()[Idx]) + OpenACCReductionRecipe(R.AllocaDecl, NewCombiners); } } +public: static bool classof(const OpenACCClause *C) { return C->getClauseKind() == OpenACCClauseKind::Reduction; } @@ -1353,13 +1380,17 @@ public: static OpenACCReductionClause * Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc, OpenACCReductionOperator Operator, ArrayRef<Expr *> VarList, - ArrayRef<OpenACCReductionRecipe> Recipes, SourceLocation EndLoc); + ArrayRef<OpenACCReductionRecipeWithStorage> Recipes, + SourceLocation EndLoc); OpenACCReductionOperator getReductionOp() const { return Op; } size_t numTrailingObjects(OverloadToken<Expr *>) const { return getExprs().size(); } + size_t numTrailingObjects(OverloadToken<OpenACCReductionRecipe>) const { + return getExprs().size(); + } }; class OpenACCLinkClause final diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 46b088c0174b..f9d3a4ea9480 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -215,7 +215,6 @@ public: bool diagnosePositionType(QualType T, const ParsedAttr &AL); bool CanPerformScalarCast(QualType SrcTy, QualType DestTy); - bool ContainsBitField(QualType BaseTy); bool CanPerformElementwiseCast(Expr *Src, QualType DestType); bool CanPerformAggregateSplatCast(Expr *Src, QualType DestType); ExprResult ActOnOutParamExpr(ParmVarDecl *Param, Expr *Arg); diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 16e7f1bc8301..f751e985ae0f 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -245,7 +245,7 @@ public: OpenACCPrivateRecipe CreatePrivateInitRecipe(const Expr *VarExpr); OpenACCFirstPrivateRecipe CreateFirstPrivateInitRecipe(const Expr *VarExpr); - OpenACCReductionRecipe + OpenACCReductionRecipeWithStorage CreateReductionInitRecipe(OpenACCReductionOperator ReductionOperator, const Expr *VarExpr); @@ -951,12 +951,14 @@ public: ArrayRef<Expr *> IntExprs, SourceLocation EndLoc); // Does the checking for a 'reduction ' clause that needs to be done in // dependent and not dependent cases. - OpenACCClause *CheckReductionClause( - ArrayRef<const OpenACCClause *> ExistingClauses, - OpenACCDirectiveKind DirectiveKind, SourceLocation BeginLoc, - SourceLocation LParenLoc, OpenACCReductionOperator ReductionOp, - ArrayRef<Expr *> Vars, ArrayRef<OpenACCReductionRecipe> Recipes, - SourceLocation EndLoc); + OpenACCClause * + CheckReductionClause(ArrayRef<const OpenACCClause *> ExistingClauses, + OpenACCDirectiveKind DirectiveKind, + SourceLocation BeginLoc, SourceLocation LParenLoc, + OpenACCReductionOperator ReductionOp, + ArrayRef<Expr *> Vars, + ArrayRef<OpenACCReductionRecipeWithStorage> Recipes, + SourceLocation EndLoc); ExprResult BuildOpenACCAsteriskSizeExpr(SourceLocation AsteriskLoc); ExprResult ActOnOpenACCAsteriskSizeExpr(SourceLocation AsteriskLoc); diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index 6c4bc7c274ea..17c6bece44c8 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -506,11 +506,17 @@ OpenACCDeviceTypeClause *OpenACCDeviceTypeClause::Create( OpenACCReductionClause *OpenACCReductionClause::Create( const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc, OpenACCReductionOperator Operator, ArrayRef<Expr *> VarList, - ArrayRef<OpenACCReductionRecipe> Recipes, + ArrayRef<OpenACCReductionRecipeWithStorage> Recipes, SourceLocation EndLoc) { - void *Mem = C.Allocate( - OpenACCReductionClause::totalSizeToAlloc<Expr *, OpenACCReductionRecipe>( - VarList.size(), Recipes.size())); + size_t NumCombiners = llvm::accumulate( + Recipes, 0, [](size_t Num, const OpenACCReductionRecipe &R) { + return Num + R.CombinerRecipes.size(); + }); + + void *Mem = C.Allocate(OpenACCReductionClause::totalSizeToAlloc< + Expr *, OpenACCReductionRecipe, + OpenACCReductionRecipe::CombinerRecipe>( + VarList.size(), Recipes.size(), NumCombiners)); return new (Mem) OpenACCReductionClause(BeginLoc, LParenLoc, Operator, VarList, Recipes, EndLoc); } diff --git a/clang/lib/CIR/CodeGen/Address.h b/clang/lib/CIR/CodeGen/Address.h index fb74aa0f3bb0..a67cbad7033a 100644 --- a/clang/lib/CIR/CodeGen/Address.h +++ b/clang/lib/CIR/CodeGen/Address.h @@ -17,6 +17,7 @@ #include "mlir/IR/Value.h" #include "clang/AST/CharUnits.h" #include "clang/CIR/Dialect/IR/CIRTypes.h" +#include "clang/CIR/MissingFeatures.h" #include "llvm/ADT/PointerIntPair.h" namespace clang::CIRGen { @@ -90,6 +91,13 @@ public: return getPointer(); } + /// Return the pointer contained in this class after authenticating it and + /// adding offset to it if necessary. + mlir::Value emitRawPointer() const { + assert(!cir::MissingFeatures::addressPointerAuthInfo()); + return getBasePointer(); + } + mlir::Type getType() const { assert(mlir::cast<cir::PointerType>( pointerAndKnownNonNull.getPointer().getType()) diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h index 06f41cd8fcfd..6d3741c41735 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h +++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h @@ -191,6 +191,15 @@ public: virtual void emitVTableDefinitions(CIRGenVTables &cgvt, const CXXRecordDecl *rd) = 0; + using DeleteOrMemberCallExpr = + llvm::PointerUnion<const CXXDeleteExpr *, const CXXMemberCallExpr *>; + + virtual mlir::Value emitVirtualDestructorCall(CIRGenFunction &cgf, + const CXXDestructorDecl *dtor, + CXXDtorType dtorType, + Address thisAddr, + DeleteOrMemberCallExpr e) = 0; + /// Emit any tables needed to implement virtual inheritance. For Itanium, /// this emits virtual table tables. virtual void emitVirtualInheritanceTables(const CXXRecordDecl *rd) = 0; diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 485b2c86cbc5..dd357ce69f1b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -895,6 +895,26 @@ void CIRGenFunction::destroyCXXObject(CIRGenFunction &cgf, Address addr, } namespace { +mlir::Value loadThisForDtorDelete(CIRGenFunction &cgf, + const CXXDestructorDecl *dd) { + if (Expr *thisArg = dd->getOperatorDeleteThisArg()) + return cgf.emitScalarExpr(thisArg); + return cgf.loadCXXThis(); +} + +/// Call the operator delete associated with the current destructor. +struct CallDtorDelete final : EHScopeStack::Cleanup { + CallDtorDelete() {} + + void emit(CIRGenFunction &cgf) override { + const CXXDestructorDecl *dtor = cast<CXXDestructorDecl>(cgf.curFuncDecl); + const CXXRecordDecl *classDecl = dtor->getParent(); + cgf.emitDeleteCall(dtor->getOperatorDelete(), + loadThisForDtorDelete(cgf, dtor), + cgf.getContext().getCanonicalTagType(classDecl)); + } +}; + class DestroyField final : public EHScopeStack::Cleanup { const FieldDecl *field; CIRGenFunction::Destroyer *destroyer; @@ -932,7 +952,18 @@ void CIRGenFunction::enterDtorCleanups(const CXXDestructorDecl *dd, // The deleting-destructor phase just needs to call the appropriate // operator delete that Sema picked up. if (dtorType == Dtor_Deleting) { - cgm.errorNYI(dd->getSourceRange(), "deleting destructor cleanups"); + assert(dd->getOperatorDelete() && + "operator delete missing - EnterDtorCleanups"); + if (cxxStructorImplicitParamValue) { + cgm.errorNYI(dd->getSourceRange(), "deleting destructor with vtt"); + } else { + if (dd->getOperatorDelete()->isDestroyingOperatorDelete()) { + cgm.errorNYI(dd->getSourceRange(), + "deleting destructor with destroying operator delete"); + } else { + ehStack.pushCleanup<CallDtorDelete>(NormalAndEHCleanup); + } + } return; } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 97c0944fca33..b1e9e768ff1e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -130,13 +130,11 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorMemberCallExpr( const CXXMethodDecl *calleeDecl = devirtualizedMethod ? devirtualizedMethod : md; const CIRGenFunctionInfo *fInfo = nullptr; - if (isa<CXXDestructorDecl>(calleeDecl)) { - cgm.errorNYI(ce->getSourceRange(), - "emitCXXMemberOrOperatorMemberCallExpr: destructor call"); - return RValue::get(nullptr); - } - - fInfo = &cgm.getTypes().arrangeCXXMethodDeclaration(calleeDecl); + if (const auto *dtor = dyn_cast<CXXDestructorDecl>(calleeDecl)) + fInfo = &cgm.getTypes().arrangeCXXStructorDeclaration( + GlobalDecl(dtor, Dtor_Complete)); + else + fInfo = &cgm.getTypes().arrangeCXXMethodDeclaration(calleeDecl); cir::FuncType ty = cgm.getTypes().getFunctionType(*fInfo); @@ -151,9 +149,34 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorMemberCallExpr( // because then we know what the type is. bool useVirtualCall = canUseVirtualCall && !devirtualizedMethod; - if (isa<CXXDestructorDecl>(calleeDecl)) { - cgm.errorNYI(ce->getSourceRange(), - "emitCXXMemberOrOperatorMemberCallExpr: destructor call"); + if (const auto *dtor = dyn_cast<CXXDestructorDecl>(calleeDecl)) { + assert(ce->arg_begin() == ce->arg_end() && + "Destructor shouldn't have explicit parameters"); + assert(returnValue.isNull() && "Destructor shouldn't have return value"); + if (useVirtualCall) { + cgm.getCXXABI().emitVirtualDestructorCall(*this, dtor, Dtor_Complete, + thisPtr.getAddress(), + cast<CXXMemberCallExpr>(ce)); + } else { + GlobalDecl globalDecl(dtor, Dtor_Complete); + CIRGenCallee callee; + assert(!cir::MissingFeatures::appleKext()); + if (!devirtualizedMethod) { + callee = CIRGenCallee::forDirect( + cgm.getAddrOfCXXStructor(globalDecl, fInfo, ty), globalDecl); + } else { + cgm.errorNYI(ce->getSourceRange(), "devirtualized destructor call"); + return RValue::get(nullptr); + } + + QualType thisTy = + isArrow ? base->getType()->getPointeeType() : base->getType(); + // CIRGen does not pass CallOrInvoke here (different from OG LLVM codegen) + // because in practice it always null even in OG. + emitCXXDestructorCall(globalDecl, callee, thisPtr.getPointer(), thisTy, + /*implicitParam=*/nullptr, + /*implicitParamTy=*/QualType(), ce); + } return RValue::get(nullptr); } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 7a774e0441bb..01a43a997637 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -678,7 +678,13 @@ void CIRGenFunction::emitDestructorBody(FunctionArgList &args) { // possible to delegate the destructor body to the complete // destructor. Do so. if (dtorType == Dtor_Deleting) { - cgm.errorNYI(dtor->getSourceRange(), "deleting destructor"); + RunCleanupsScope dtorEpilogue(*this); + enterDtorCleanups(dtor, Dtor_Deleting); + if (haveInsertPoint()) { + QualType thisTy = dtor->getFunctionObjectParameterType(); + emitCXXDestructorCall(dtor, Dtor_Complete, /*forVirtualBase=*/false, + /*delegating=*/false, loadCXXThisAddress(), thisTy); + } return; } diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index 9e490c6d054a..d30c975a8ffb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -95,7 +95,10 @@ public: clang::GlobalDecl gd, Address thisAddr, mlir::Type ty, SourceLocation loc) override; - + mlir::Value emitVirtualDestructorCall(CIRGenFunction &cgf, + const CXXDestructorDecl *dtor, + CXXDtorType dtorType, Address thisAddr, + DeleteOrMemberCallExpr e) override; mlir::Value getVTableAddressPoint(BaseSubobject base, const CXXRecordDecl *vtableClass) override; mlir::Value getVTableAddressPointInStructorWithVTT( @@ -465,6 +468,29 @@ void CIRGenItaniumCXXABI::emitVTableDefinitions(CIRGenVTables &cgvt, } } +mlir::Value CIRGenItaniumCXXABI::emitVirtualDestructorCall( + CIRGenFunction &cgf, const CXXDestructorDecl *dtor, CXXDtorType dtorType, + Address thisAddr, DeleteOrMemberCallExpr expr) { + auto *callExpr = dyn_cast<const CXXMemberCallExpr *>(expr); + auto *delExpr = dyn_cast<const CXXDeleteExpr *>(expr); + assert((callExpr != nullptr) ^ (delExpr != nullptr)); + assert(callExpr == nullptr || callExpr->arg_begin() == callExpr->arg_end()); + assert(dtorType == Dtor_Deleting || dtorType == Dtor_Complete); + + GlobalDecl globalDecl(dtor, dtorType); + const CIRGenFunctionInfo *fnInfo = + &cgm.getTypes().arrangeCXXStructorDeclaration(globalDecl); + const cir::FuncType &fnTy = cgm.getTypes().getFunctionType(*fnInfo); + auto callee = CIRGenCallee::forVirtual(callExpr, globalDecl, thisAddr, fnTy); + + QualType thisTy = + callExpr ? callExpr->getObjectType() : delExpr->getDestroyedType(); + + cgf.emitCXXDestructorCall(globalDecl, callee, thisAddr.emitRawPointer(), + thisTy, nullptr, QualType(), nullptr); + return nullptr; +} + void CIRGenItaniumCXXABI::emitVirtualInheritanceTables( const CXXRecordDecl *rd) { CIRGenVTables &vtables = cgm.getVTables(); diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index e65896a9ff10..2ab1ea0c8ff8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -619,10 +619,8 @@ const CIRGenFunctionInfo &CIRGenTypes::arrangeGlobalDeclaration(GlobalDecl gd) { const auto *fd = cast<FunctionDecl>(gd.getDecl()); if (isa<CXXConstructorDecl>(gd.getDecl()) || - isa<CXXDestructorDecl>(gd.getDecl())) { - cgm.errorNYI(SourceLocation(), - "arrangeGlobalDeclaration for C++ constructor or destructor"); - } + isa<CXXDestructorDecl>(gd.getDecl())) + return arrangeCXXStructorDeclaration(gd); return arrangeFunctionDeclaration(fd); } diff --git a/clang/lib/CIR/CodeGen/CIRGenVTables.cpp b/clang/lib/CIR/CodeGen/CIRGenVTables.cpp index 84f59773757b..36bab625c4dd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenVTables.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenVTables.cpp @@ -120,12 +120,6 @@ mlir::Attribute CIRGenVTables::getVTableComponent( assert(!cir::MissingFeatures::vtableRelativeLayout()); switch (component.getKind()) { - case VTableComponent::CK_CompleteDtorPointer: - cgm.errorNYI("getVTableComponent: CompleteDtorPointer"); - return mlir::Attribute(); - case VTableComponent::CK_DeletingDtorPointer: - cgm.errorNYI("getVTableComponent: DeletingDtorPointer"); - return mlir::Attribute(); case VTableComponent::CK_UnusedFunctionPointer: cgm.errorNYI("getVTableComponent: UnusedFunctionPointer"); return mlir::Attribute(); @@ -148,7 +142,9 @@ mlir::Attribute CIRGenVTables::getVTableComponent( "expected GlobalViewAttr or ConstPtrAttr"); return rtti; - case VTableComponent::CK_FunctionPointer: { + case VTableComponent::CK_FunctionPointer: + case VTableComponent::CK_CompleteDtorPointer: + case VTableComponent::CK_DeletingDtorPointer: { GlobalDecl gd = component.getGlobalDecl(); assert(!cir::MissingFeatures::cudaSupport()); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index fa946921d468..1ff2be756452 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1762,8 +1762,11 @@ void CGOpenMPRuntime::emitDeclareTargetFunction(const FunctionDecl *FD, // access its value. llvm::GlobalValue *Addr = GV; if (CGM.getLangOpts().OpenMPIsTargetDevice) { + llvm::PointerType *FnPtrTy = llvm::PointerType::get( + CGM.getLLVMContext(), + CGM.getModule().getDataLayout().getProgramAddressSpace()); Addr = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, + CGM.getModule(), FnPtrTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, GV, Name, nullptr, llvm::GlobalValue::NotThreadLocal, CGM.getModule().getDataLayout().getDefaultGlobalsAddressSpace()); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 3613b6a143d4..fddeba98adcc 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This provides a generalized class for OpenMP runtime code generation -// specialized by GPU targets NVPTX and AMDGCN. +// specialized by GPU targets NVPTX, AMDGCN and SPIR-V. // //===----------------------------------------------------------------------===// @@ -1242,12 +1242,13 @@ void CGOpenMPRuntimeGPU::emitParallelCall( CGBuilderTy &Bld = CGF.Builder; llvm::Value *NumThreadsVal = NumThreads; llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; - llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy); - if (WFn) - ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); - llvm::Type *FnPtrTy = llvm::PointerType::get( + llvm::PointerType *FnPtrTy = llvm::PointerType::get( CGF.getLLVMContext(), CGM.getDataLayout().getProgramAddressSpace()); + llvm::Value *ID = llvm::ConstantPointerNull::get(FnPtrTy); + if (WFn) + ID = Bld.CreateBitOrPointerCast(WFn, FnPtrTy); + llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, FnPtrTy); // Create a private scope that will globalize the arguments diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 17cb1e4f153c..72b2ac99ec53 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -3544,40 +3544,6 @@ bool SemaHLSL::CanPerformScalarCast(QualType SrcTy, QualType DestTy) { llvm_unreachable("Unhandled scalar cast"); } -// Detect if a type contains a bitfield. Will be removed when -// bitfield support is added to HLSLElementwiseCast and HLSLAggregateSplatCast -bool SemaHLSL::ContainsBitField(QualType BaseTy) { - llvm::SmallVector<QualType, 16> WorkList; - WorkList.push_back(BaseTy); - while (!WorkList.empty()) { - QualType T = WorkList.pop_back_val(); - T = T.getCanonicalType().getUnqualifiedType(); - // only check aggregate types - if (const auto *AT = dyn_cast<ConstantArrayType>(T)) { - WorkList.push_back(AT->getElementType()); - continue; - } - if (const auto *RT = dyn_cast<RecordType>(T)) { - const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); - if (RD->isUnion()) - continue; - - const CXXRecordDecl *CXXD = dyn_cast<CXXRecordDecl>(RD); - - if (CXXD && CXXD->isStandardLayout()) - RD = CXXD->getStandardLayoutBaseWithFields(); - - for (const auto *FD : RD->fields()) { - if (FD->isBitField()) - return true; - WorkList.push_back(FD->getType()); - } - continue; - } - } - return false; -} - // Can perform an HLSL Aggregate splat cast if the Dest is an aggregate and the // Src is a scalar or a vector of length 1 // Or if Dest is a vector and Src is a vector of length 1 diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 779b6e9cb6b4..ca99834ce826 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -2883,12 +2883,12 @@ SemaOpenACC::CreateFirstPrivateInitRecipe(const Expr *VarExpr) { return OpenACCFirstPrivateRecipe(AllocaDecl, Temporary); } -OpenACCReductionRecipe SemaOpenACC::CreateReductionInitRecipe( +OpenACCReductionRecipeWithStorage SemaOpenACC::CreateReductionInitRecipe( OpenACCReductionOperator ReductionOperator, const Expr *VarExpr) { // We don't strip bounds here, so that we are doing our recipe init at the // 'lowest' possible level. Codegen is going to have to do its own 'looping'. if (!VarExpr || VarExpr->getType()->isDependentType()) - return OpenACCReductionRecipe::Empty(); + return OpenACCReductionRecipeWithStorage::Empty(); QualType VarTy = VarExpr->getType().getNonReferenceType().getUnqualifiedType(); @@ -2905,7 +2905,7 @@ OpenACCReductionRecipe SemaOpenACC::CreateReductionInitRecipe( // at any of the combiners. if (CreateReductionCombinerRecipe(VarExpr->getBeginLoc(), ReductionOperator, VarTy, CombinerRecipes)) - return OpenACCReductionRecipe::Empty(); + return OpenACCReductionRecipeWithStorage::Empty(); VarDecl *AllocaDecl = CreateAllocaDecl( getASTContext(), SemaRef.getCurContext(), VarExpr->getBeginLoc(), @@ -2956,7 +2956,7 @@ OpenACCReductionRecipe SemaOpenACC::CreateReductionInitRecipe( AllocaDecl->setInitStyle(VarDecl::CallInit); } - return OpenACCReductionRecipe(AllocaDecl, CombinerRecipes); + return OpenACCReductionRecipeWithStorage(AllocaDecl, CombinerRecipes); } bool SemaOpenACC::CreateReductionCombinerRecipe( diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp index 881e960e5a24..ead97816defe 100644 --- a/clang/lib/Sema/SemaOpenACCClause.cpp +++ b/clang/lib/Sema/SemaOpenACCClause.cpp @@ -1772,7 +1772,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitReductionClause( } SmallVector<Expr *> ValidVars; - SmallVector<OpenACCReductionRecipe> Recipes; + SmallVector<OpenACCReductionRecipeWithStorage> Recipes; for (Expr *Var : Clause.getVarList()) { ExprResult Res = SemaRef.CheckReductionVar(Clause.getDirectiveKind(), @@ -2196,7 +2196,7 @@ OpenACCClause *SemaOpenACC::CheckReductionClause( ArrayRef<const OpenACCClause *> ExistingClauses, OpenACCDirectiveKind DirectiveKind, SourceLocation BeginLoc, SourceLocation LParenLoc, OpenACCReductionOperator ReductionOp, - ArrayRef<Expr *> Vars, ArrayRef<OpenACCReductionRecipe> Recipes, + ArrayRef<Expr *> Vars, ArrayRef<OpenACCReductionRecipeWithStorage> Recipes, SourceLocation EndLoc) { if (DirectiveKind == OpenACCDirectiveKind::Loop || isOpenACCCombinedDirectiveKind(DirectiveKind)) { diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 940324bbc5e4..0dcfe0ecbad4 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -12374,7 +12374,7 @@ void OpenACCClauseTransform<Derived>::VisitReductionClause( const OpenACCReductionClause &C) { SmallVector<Expr *> TransformedVars = VisitVarList(C.getVarList()); SmallVector<Expr *> ValidVars; - llvm::SmallVector<OpenACCReductionRecipe> Recipes; + llvm::SmallVector<OpenACCReductionRecipeWithStorage> Recipes; for (const auto [Var, OrigRecipe] : llvm::zip(TransformedVars, C.getRecipes())) { @@ -12384,7 +12384,7 @@ void OpenACCClauseTransform<Derived>::VisitReductionClause( ValidVars.push_back(Res.get()); if (OrigRecipe.isSet()) - Recipes.push_back(OrigRecipe); + Recipes.emplace_back(OrigRecipe.AllocaDecl, OrigRecipe.CombinerRecipes); else Recipes.push_back(Self.getSema().OpenACC().CreateReductionInitRecipe( C.getReductionOp(), Res.get())); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 868f0cc8b1da..8ebf909fb4df 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -13006,7 +13006,7 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { SourceLocation LParenLoc = readSourceLocation(); OpenACCReductionOperator Op = readEnum<OpenACCReductionOperator>(); llvm::SmallVector<Expr *> VarList = readOpenACCVarList(); - llvm::SmallVector<OpenACCReductionRecipe> RecipeList; + llvm::SmallVector<OpenACCReductionRecipeWithStorage> RecipeList; for (unsigned I = 0; I < VarList.size(); ++I) { VarDecl *Recipe = readDeclAs<VarDecl>(); diff --git a/clang/test/CIR/CodeGen/virtual-destructor-calls.cpp b/clang/test/CIR/CodeGen/virtual-destructor-calls.cpp new file mode 100644 index 000000000000..08a6b21ca91d --- /dev/null +++ b/clang/test/CIR/CodeGen/virtual-destructor-calls.cpp @@ -0,0 +1,129 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -std=c++20 -mconstructor-aliases -O0 -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -std=c++20 -mconstructor-aliases -O0 -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -std=c++20 -mconstructor-aliases -O0 -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +// TODO(cir): Try to emit base destructor as an alias at O1 or higher. + +// FIXME: LLVM IR dialect does not yet support function ptr globals, which precludes +// a lot of the proper semantics for properly representing alias functions in LLVM +// (see the note on LLVM_O1 below). + +struct Member { + ~Member(); +}; + +struct A { + virtual ~A(); +}; + +struct B : A { + Member m; + virtual ~B(); +}; + +B::~B() { } + +// Aliases are inserted before the function definitions in LLVM IR +// FIXME: These should have unnamed_addr set. +// LLVM: @_ZN1BD1Ev = alias void (ptr), ptr @_ZN1BD2Ev +// LLVM: @_ZN1CD1Ev = alias void (ptr), ptr @_ZN1CD2Ev + +// OGCG: @_ZN1BD1Ev = unnamed_addr alias void (ptr), ptr @_ZN1BD2Ev +// OGCG: @_ZN1CD1Ev = unnamed_addr alias void (ptr), ptr @_ZN1CD2Ev + + +// Base (D2) dtor for B: calls A's base dtor. + +// CIR: cir.func{{.*}} @_ZN1BD2Ev +// CIR: cir.call @_ZN6MemberD1Ev +// CIR: cir.call @_ZN1AD2Ev + +// LLVM: define{{.*}} void @_ZN1BD2Ev +// LLVM: call void @_ZN6MemberD1Ev +// LLVM: call void @_ZN1AD2Ev + +// OGCG: define{{.*}} @_ZN1BD2Ev +// OGCG: call void @_ZN6MemberD1Ev +// OGCG: call void @_ZN1AD2Ev + +// Complete (D1) dtor for B: just an alias because there are no virtual bases. + +// CIR: cir.func{{.*}} @_ZN1BD1Ev(!cir.ptr<!rec_B>) alias(@_ZN1BD2Ev) +// This is defined above for LLVM and OGCG. + +// Deleting (D0) dtor for B: defers to the complete dtor but also calls operator delete. + +// CIR: cir.func{{.*}} @_ZN1BD0Ev +// CIR: cir.call @_ZN1BD1Ev(%[[THIS:.*]]) nothrow : (!cir.ptr<!rec_B>) -> () +// CIR: %[[THIS_VOID:.*]] = cir.cast bitcast %[[THIS]] : !cir.ptr<!rec_B> -> !cir.ptr<!void> +// CIR: %[[SIZE:.*]] = cir.const #cir.int<16> +// CIR: cir.call @_ZdlPvm(%[[THIS_VOID]], %[[SIZE]]) + +// LLVM: define{{.*}} void @_ZN1BD0Ev +// LLVM: call void @_ZN1BD1Ev(ptr %[[THIS:.*]]) +// LLVM: call void @_ZdlPvm(ptr %[[THIS]], i64 16) + +// OGCG: define{{.*}} @_ZN1BD0Ev +// OGCG: call void @_ZN1BD1Ev(ptr{{.*}} %[[THIS:.*]]) +// OGCG: call void @_ZdlPvm(ptr{{.*}} %[[THIS]], i64{{.*}} 16) + +struct C : B { + ~C(); +}; + +C::~C() { } + +// Base (D2) dtor for C: calls B's base dtor. + +// CIR: cir.func{{.*}} @_ZN1CD2Ev +// CIR: %[[B:.*]] = cir.base_class_addr %[[THIS:.*]] : !cir.ptr<!rec_C> nonnull [0] -> !cir.ptr<!rec_B> +// CIR: cir.call @_ZN1BD2Ev(%[[B]]) + +// LLVM: define{{.*}} void @_ZN1CD2Ev +// LLVM: call void @_ZN1BD2Ev + +// OGCG: define{{.*}} @_ZN1CD2Ev +// OGCG: call void @_ZN1BD2Ev + +// Complete (D1) dtor for C: just an alias because there are no virtual bases. + +// CIR: cir.func{{.*}} @_ZN1CD1Ev(!cir.ptr<!rec_C>) alias(@_ZN1CD2Ev) +// This is defined above for LLVM and OGCG. + + +// Deleting (D0) dtor for C: defers to the complete dtor but also calls operator delete. + +// CIR: cir.func{{.*}} @_ZN1CD0Ev +// CIR: cir.call @_ZN1CD1Ev(%[[THIS:.*]]) nothrow : (!cir.ptr<!rec_C>) -> () +// CIR: %[[THIS_VOID:.*]] = cir.cast bitcast %[[THIS]] : !cir.ptr<!rec_C> -> !cir.ptr<!void> +// CIR: %[[SIZE:.*]] = cir.const #cir.int<16> +// CIR: cir.call @_ZdlPvm(%[[THIS_VOID]], %[[SIZE]]) + +// LLVM: define{{.*}} void @_ZN1CD0Ev +// LLVM: call void @_ZN1CD1Ev(ptr %[[THIS:.*]]) +// LLVM: call void @_ZdlPvm(ptr %[[THIS]], i64 16) + +// OGCG: define{{.*}} @_ZN1CD0Ev +// OGCG: call void @_ZN1CD1Ev(ptr{{.*}} %[[THIS:.*]]) +// OGCG: call void @_ZdlPvm(ptr{{.*}} %[[THIS]], i64{{.*}} 16) + +namespace PR12798 { + // A qualified call to a base class destructor should not undergo virtual + // dispatch. Template instantiation used to lose the qualifier. + struct A { virtual ~A(); }; + template<typename T> void f(T *p) { p->A::~A(); } + + // CIR: cir.func{{.*}} @_ZN7PR127981fINS_1AEEEvPT_ + // CIR: cir.call @_ZN7PR127981AD1Ev + + // LLVM: define{{.*}} @_ZN7PR127981fINS_1AEEEvPT_ + // LLVM: call void @_ZN7PR127981AD1Ev + + // OGCG: define{{.*}} @_ZN7PR127981fINS_1AEEEvPT_ + // OGCG: call void @_ZN7PR127981AD1Ev + + template void f(A*); +} diff --git a/clang/test/OpenMP/target_indirect_codegen.cpp b/clang/test/OpenMP/target_indirect_codegen.cpp index 20a36c293551..ec249dd43b93 100644 --- a/clang/test/OpenMP/target_indirect_codegen.cpp +++ b/clang/test/OpenMP/target_indirect_codegen.cpp @@ -4,6 +4,12 @@ // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple amdgcn-amd-amdhsa %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - | FileCheck %s --check-prefix=DEVICE +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm %s -o - | FileCheck %s --check-prefix=HOST +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-spirv-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-spirv-host.bc -o - | FileCheck %s --check-prefix=DEVICE +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple spirv64-intel %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-spirv-host.bc -emit-pch -o %t +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-spirv-host.bc -include-pch %t -o - | FileCheck %s --check-prefix=DEVICE + // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -19,10 +25,10 @@ // HOST: @[[BAR_ENTRY_NAME:.+]] = internal unnamed_addr constant [{{[0-9]+}} x i8] c"[[BAR_NAME:__omp_offloading_[0-9a-z]+_[0-9a-z]+_bar_l[0-9]+]]\00" // HOST: @.offloading.entry.[[BAR_NAME]] = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 8, ptr @_ZL3barv, ptr @[[BAR_ENTRY_NAME]], i64 8, i64 0, ptr null } //. -// DEVICE: @[[FOO_NAME:__omp_offloading_[0-9a-z]+_[0-9a-z]+_foo_l[0-9]+]] = protected addrspace(1) constant ptr @_Z3foov -// DEVICE: @[[BAZ_NAME:__omp_offloading_[0-9a-z]+_[0-9a-z]+_baz_l[0-9]+]] = protected addrspace(1) constant ptr @_Z3bazv +// DEVICE: @[[FOO_NAME:__omp_offloading_[0-9a-z]+_[0-9a-z]+_foo_l[0-9]+]] = protected addrspace(1) constant {{ptr|ptr addrspace\(9\)}} @_Z3foov +// DEVICE: @[[BAZ_NAME:__omp_offloading_[0-9a-z]+_[0-9a-z]+_baz_l[0-9]+]] = protected addrspace(1) constant {{ptr|ptr addrspace\(9\)}} @_Z3bazv // DEVICE: @var = protected addrspace(1) global i8 0, align 1 -// DEVICE: @[[BAR_NAME:__omp_offloading_[0-9a-z]+_[0-9a-z]+_bar_l[0-9]+]] = protected addrspace(1) constant ptr @_ZL3barv +// DEVICE: @[[BAR_NAME:__omp_offloading_[0-9a-z]+_[0-9a-z]+_bar_l[0-9]+]] = protected addrspace(1) constant {{ptr|ptr addrspace\(9\)}} @_ZL3barv //. void foo() { } #pragma omp declare target to(foo) indirect diff --git a/clang/test/OpenMP/amdgcn_parallel_num_threads_strict_messages.cpp b/clang/test/OpenMP/target_parallel_num_threads_strict_messages.cpp index 513754b0bbad..8ceff02d4ecf 100644 --- a/clang/test/OpenMP/amdgcn_parallel_num_threads_strict_messages.cpp +++ b/clang/test/OpenMP/target_parallel_num_threads_strict_messages.cpp @@ -5,6 +5,13 @@ // RUN: %clang_cc1 -DF3 -verify -fopenmp -fopenmp-version=60 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host-ppc.bc // RUN: %clang_cc1 -DF3 -DTARGET -verify -fopenmp -fopenmp-version=60 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host-ppc.bc -o /dev/null +// RUN: %clang_cc1 -DF1 -verify -fopenmp -fopenmp-version=60 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-spirv-ppc-host-ppc.bc +// RUN: %clang_cc1 -DF1 -DTARGET -verify -fopenmp -fopenmp-version=60 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-spirv-ppc-host-ppc.bc -o /dev/null +// RUN: %clang_cc1 -DF2 -verify -fopenmp -fopenmp-version=60 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-spirv-ppc-host-ppc.bc +// RUN: %clang_cc1 -DF2 -DTARGET -verify -fopenmp -fopenmp-version=60 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-spirv-ppc-host-ppc.bc -o /dev/null +// RUN: %clang_cc1 -DF3 -verify -fopenmp -fopenmp-version=60 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-spirv-ppc-host-ppc.bc +// RUN: %clang_cc1 -DF3 -DTARGET -verify -fopenmp -fopenmp-version=60 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-spirv-ppc-host-ppc.bc -o /dev/null + #ifndef TARGET // expected-no-diagnostics #endif diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 2adfd6f2510d..c3cd119b9617 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -459,6 +459,7 @@ struct IntrinsicLibrary { mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 5fe2a76128e0..e07baafcef0d 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1027,6 +1027,10 @@ static constexpr IntrinsicHandler handlers[]{ {"dst", asAddr}, {"nbytes", asValue}}}, /*isElemental=*/false}, + {"tma_bulk_s2g", + &I::genTMABulkS2G, + {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, + /*isElemental=*/false}, {"tma_bulk_wait_group", &I::genTMABulkWaitGroup, {{}}, @@ -9227,6 +9231,17 @@ void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); } +// TMA_BULK_S2G (CUDA) +void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), + mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( + builder, loc, dst, src, fir::getBase(args[2]), {}, {}); +} + // TMA_BULK_WAIT_GROUP (CUDA) void IntrinsicLibrary::genTMABulkWaitGroup( llvm::ArrayRef<fir::ExtendedValue> args) { diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 609a1fc9fb02..e5c5ba908242 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -558,6 +558,7 @@ static mlir::Value emboxSrc(mlir::PatternRewriter &rewriter, if (srcTy.isInteger(1)) { // i1 is not a supported type in the descriptor and it is actually coming // from a LOGICAL constant. Use the destination type to avoid mismatch. + assert(dstEleTy && "expect dst element type to be set"); srcTy = dstEleTy; src = createConvertOp(rewriter, loc, srcTy, src); addr = builder.createTemporary(loc, srcTy); @@ -652,7 +653,8 @@ struct CUFDataTransferOpConversion // Initialization of an array from a scalar value should be implemented // via a kernel launch. Use the flang runtime via the Assign function // until we have more infrastructure. - mlir::Value src = emboxSrc(rewriter, op, symtab); + mlir::Type dstEleTy = fir::unwrapInnerType(fir::unwrapRefType(dstTy)); + mlir::Value src = emboxSrc(rewriter, op, symtab, dstEleTy); mlir::Value dst = emboxDst(rewriter, op, symtab); mlir::func::FuncOp func = fir::runtime::getRuntimeFunc<mkRTKey(CUFDataTransferCstDesc)>( diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index a8b9aa8b57ef..22df9cdf410d 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -2034,6 +2034,15 @@ implicit none end subroutine end interface + interface + attributes(device) subroutine tma_bulk_s2g(src, dst, nbytes) + !dir$ ignore_tkr src, dst + integer(4), shared :: src(*) + integer(4), device :: dst(*) + integer(4), value :: nbytes + end subroutine + end interface + contains attributes(device) subroutine syncthreads() diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir index 669300cf6473..5d3215dd07fc 100644 --- a/flang/test/Fir/CUDA/cuda-data-transfer.fir +++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir @@ -651,5 +651,45 @@ func.func @_QPsub28() { // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref<!fir.box<!fir.logical<8>>>) -> !fir.ref<!fir.box<none>> // CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> () +func.func @_QPtesti4(%arg0: !fir.ref<i32> {fir.bindc_name = "n1"}, %arg1: !fir.ref<i32> {fir.bindc_name = "n2"}, %arg2: !fir.ref<i32> {fir.bindc_name = "n3"}, %arg3: !fir.ref<i32> {fir.bindc_name = "n4"}) { + %true = arith.constant true + %c0 = arith.constant 0 : index + %c2_i32 = arith.constant 2 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtesti4En1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + %2:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFtesti4En2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + %3:2 = hlfir.declare %arg2 dummy_scope %0 {uniq_name = "_QFtesti4En3"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + %4:2 = hlfir.declare %arg3 dummy_scope %0 {uniq_name = "_QFtesti4En4"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.load %1#0 : !fir.ref<i32> + %6 = arith.divsi %5, %c2_i32 : i32 + %7 = fir.convert %6 : (i32) -> index + %8 = arith.cmpi sgt, %7, %c0 : index + %9 = arith.select %8, %7, %c0 : index + %10 = fir.load %2#0 : !fir.ref<i32> + %11 = arith.divsi %10, %c2_i32 : i32 + %12 = fir.convert %11 : (i32) -> index + %13 = arith.cmpi sgt, %12, %c0 : index + %14 = arith.select %13, %12, %c0 : index + %15 = fir.load %3#0 : !fir.ref<i32> + %16 = arith.divsi %15, %c2_i32 : i32 + %17 = fir.convert %16 : (i32) -> index + %18 = arith.cmpi sgt, %17, %c0 : index + %19 = arith.select %18, %17, %c0 : index + %20 = fir.load %4#0 : !fir.ref<i32> + %21 = arith.divsi %20, %c2_i32 : i32 + %22 = fir.convert %21 : (i32) -> index + %23 = arith.cmpi sgt, %22, %c0 : index + %24 = arith.select %23, %22, %c0 : index + %25 = cuf.alloc !fir.array<?x?x?x?x!fir.logical<4>>, %9, %14, %19, %24 : index, index, index, index {bindc_name = "lma", data_attr = #cuf.cuda<managed>, uniq_name = "_QFtesti4Elma"} -> !fir.ref<!fir.array<?x?x?x?x!fir.logical<4>>> + %26 = fir.shape %9, %14, %19, %24 : (index, index, index, index) -> !fir.shape<4> + %27:2 = hlfir.declare %25(%26) {data_attr = #cuf.cuda<managed>, uniq_name = "_QFtesti4Elma"} : (!fir.ref<!fir.array<?x?x?x?x!fir.logical<4>>>, !fir.shape<4>) -> (!fir.box<!fir.array<?x?x?x?x!fir.logical<4>>>, !fir.ref<!fir.array<?x?x?x?x!fir.logical<4>>>) + cuf.data_transfer %true to %27#1, %26 : !fir.shape<4> {transfer_kind = #cuf.cuda_transfer<host_device>} : i1, !fir.ref<!fir.array<?x?x?x?x!fir.logical<4>>> + cuf.free %27#1 : !fir.ref<!fir.array<?x?x?x?x!fir.logical<4>>> {data_attr = #cuf.cuda<managed>} + return +} + +// CHECK-LABEL: func.func @_QPtesti4 +// CHECK: fir.call @_FortranACUFDataTransferCstDesc + } // end of module diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 83ee0118638b..29c348c5260a 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -438,7 +438,7 @@ end subroutine ! CHECK: nvvm.cp.async.bulk.commit.group ! CHECK: nvvm.cp.async.bulk.wait_group 0 -attributes(global) subroutine test_bulk_g2s(c, a, b, n) +attributes(global) subroutine test_bulk_g2s(a) real(8), device :: a(*) real(8), shared :: tmpa(1024) integer(8), shared :: barrier1 @@ -448,3 +448,13 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_g2s ! CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1> + +attributes(global) subroutine test_bulk_s2g(a) + real(8), device :: a(*) + real(8), shared :: tmpa(1024) + integer(4) :: tx_count + call tma_bulk_s2g(tmpa, a(j), tx_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_bulk_s2g +! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h index e552601fbb70..b5c3bb74e741 100644 --- a/libc/src/string/memory_utils/op_aarch64.h +++ b/libc/src/string/memory_utils/op_aarch64.h @@ -84,8 +84,7 @@ template <size_t Size> struct Bcmp { uint8x16_t a = vld1q_u8(_p1); uint8x16_t n = vld1q_u8(_p2); uint8x16_t an = veorq_u8(a, n); - uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an)); - return vmaxv_u32(an_reduced); + return vmaxvq_u32(vreinterpretq_u32_u8(an)); } else if constexpr (Size == 32) { auto _p1 = as_u8(p1); auto _p2 = as_u8(p2); @@ -97,12 +96,9 @@ template <size_t Size> struct Bcmp { uint8x16_t bo = veorq_u8(b, o); // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is // a difference between the two buffers. We reduce this value down to 4 - // bytes in two steps. First, calculate the saturated move value when - // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get - // a single 32 bit nonzero value if a mismatch occurred. + // bytes using the UMAXV instruction to compute the max across the vector. uint8x16_t anbo = vorrq_u8(an, bo); - uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo)); - return vmaxv_u32(anbo_reduced); + return vmaxvq_u32(vreinterpretq_u32_u8(anbo)); } else if constexpr ((Size % BlockSize) == 0) { for (size_t offset = 0; offset < Size; offset += BlockSize) if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset)) @@ -129,8 +125,7 @@ template <size_t Size> struct Bcmp { uint8x16_t bo = veorq_u8(b, o); // anbo = (a ^ n) | (b ^ o) uint8x16_t anbo = vorrq_u8(an, bo); - uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo)); - return vmaxv_u32(anbo_reduced); + return vmaxvq_u32(vreinterpretq_u32_u8(anbo)); } else if constexpr (Size == 32) { auto _p1 = as_u8(p1); auto _p2 = as_u8(p2); @@ -150,9 +145,8 @@ template <size_t Size> struct Bcmp { uint8x16_t cpdq = vorrq_u8(cp, dq); // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to // a nonzero 32 bit value if a mismatch occurred. - uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq); - uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq); - return vmaxv_u32(abnocpdq_reduced); + uint8x16_t abnocpdq = anbo | cpdq; + return vmaxvq_u32(vreinterpretq_u32_u8(abnocpdq)); } else { static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented"); } diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp index f4284efee8d4..10bc898244a4 100644 --- a/lld/COFF/DLL.cpp +++ b/lld/COFF/DLL.cpp @@ -320,16 +320,17 @@ static const uint8_t thunkARM64[] = { }; static const uint8_t tailMergeARM64[] = { - 0xfd, 0x7b, 0xb3, 0xa9, // stp x29, x30, [sp, #-208]! + 0xfd, 0x7b, 0xb2, 0xa9, // stp x29, x30, [sp, #-224]! 0xfd, 0x03, 0x00, 0x91, // mov x29, sp 0xe0, 0x07, 0x01, 0xa9, // stp x0, x1, [sp, #16] 0xe2, 0x0f, 0x02, 0xa9, // stp x2, x3, [sp, #32] 0xe4, 0x17, 0x03, 0xa9, // stp x4, x5, [sp, #48] 0xe6, 0x1f, 0x04, 0xa9, // stp x6, x7, [sp, #64] - 0xe0, 0x87, 0x02, 0xad, // stp q0, q1, [sp, #80] - 0xe2, 0x8f, 0x03, 0xad, // stp q2, q3, [sp, #112] - 0xe4, 0x97, 0x04, 0xad, // stp q4, q5, [sp, #144] - 0xe6, 0x9f, 0x05, 0xad, // stp q6, q7, [sp, #176] + 0xe8, 0x2b, 0x00, 0xf9, // str x8, [sp, #80] + 0xe0, 0x07, 0x03, 0xad, // stp q0, q1, [sp, #96] + 0xe2, 0x0f, 0x04, 0xad, // stp q2, q3, [sp, #128] + 0xe4, 0x17, 0x05, 0xad, // stp q4, q5, [sp, #160] + 0xe6, 0x1f, 0x06, 0xad, // stp q6, q7, [sp, #192] 0xe1, 0x03, 0x11, 0xaa, // mov x1, x17 0x00, 0x00, 0x00, 0x90, // adrp x0, #0 DELAY_IMPORT_DESCRIPTOR 0x00, 0x00, 0x00, 0x91, // add x0, x0, #0 :lo12:DELAY_IMPORT_DESCRIPTOR @@ -337,15 +338,16 @@ static const uint8_t tailMergeARM64[] = { 0x42, 0x00, 0x00, 0x91, // add x2, x2, #0 :lo12:__delayLoadHelper2 0x40, 0x00, 0x3f, 0xd6, // blr x2 0xf0, 0x03, 0x00, 0xaa, // mov x16, x0 - 0xe6, 0x9f, 0x45, 0xad, // ldp q6, q7, [sp, #176] - 0xe4, 0x97, 0x44, 0xad, // ldp q4, q5, [sp, #144] - 0xe2, 0x8f, 0x43, 0xad, // ldp q2, q3, [sp, #112] - 0xe0, 0x87, 0x42, 0xad, // ldp q0, q1, [sp, #80] + 0xe6, 0x1f, 0x46, 0xad, // ldp q6, q7, [sp, #192] + 0xe4, 0x17, 0x45, 0xad, // ldp q4, q5, [sp, #160] + 0xe2, 0x0f, 0x44, 0xad, // ldp q2, q3, [sp, #128] + 0xe0, 0x07, 0x43, 0xad, // ldp q0, q1, [sp, #96] + 0xe8, 0x2b, 0x40, 0xf9, // ldr x8, [sp, #80] 0xe6, 0x1f, 0x44, 0xa9, // ldp x6, x7, [sp, #64] 0xe4, 0x17, 0x43, 0xa9, // ldp x4, x5, [sp, #48] 0xe2, 0x0f, 0x42, 0xa9, // ldp x2, x3, [sp, #32] 0xe0, 0x07, 0x41, 0xa9, // ldp x0, x1, [sp, #16] - 0xfd, 0x7b, 0xcd, 0xa8, // ldp x29, x30, [sp], #208 + 0xfd, 0x7b, 0xce, 0xa8, // ldp x29, x30, [sp], #224 0x00, 0x02, 0x1f, 0xd6, // br x16 }; @@ -556,11 +558,11 @@ public: void writeTo(uint8_t *buf) const override { memcpy(buf, tailMergeARM64, sizeof(tailMergeARM64)); - applyArm64Addr(buf + 44, desc->getRVA(), rva + 44, 12); - applyArm64Imm(buf + 48, desc->getRVA() & 0xfff, 0); + applyArm64Addr(buf + 48, desc->getRVA(), rva + 48, 12); + applyArm64Imm(buf + 52, desc->getRVA() & 0xfff, 0); if (helper) { - applyArm64Addr(buf + 52, helper->getRVA(), rva + 52, 12); - applyArm64Imm(buf + 56, helper->getRVA() & 0xfff, 0); + applyArm64Addr(buf + 56, helper->getRVA(), rva + 56, 12); + applyArm64Imm(buf + 60, helper->getRVA() & 0xfff, 0); } } diff --git a/lld/test/COFF/arm64-delayimport.yaml b/lld/test/COFF/arm64-delayimport.yaml index 7090206dea38..5d26978db8be 100644 --- a/lld/test/COFF/arm64-delayimport.yaml +++ b/lld/test/COFF/arm64-delayimport.yaml @@ -8,33 +8,35 @@ # DISASM: 140001014: d0000011 adrp x17, 0x140003000 # DISASM: 140001018: 91002231 add x17, x17, #8 # DISASM: 14000101c: 14000001 b 0x140001020 <.text+0x20> -# DISASM: 140001020: a9b37bfd stp x29, x30, [sp, #-208]! +# DISASM: 140001020: a9b27bfd stp x29, x30, [sp, #-224]! # DISASM: 140001024: 910003fd mov x29, sp # DISASM: 140001028: a90107e0 stp x0, x1, [sp, #16] # DISASM: 14000102c: a9020fe2 stp x2, x3, [sp, #32] # DISASM: 140001030: a90317e4 stp x4, x5, [sp, #48] # DISASM: 140001034: a9041fe6 stp x6, x7, [sp, #64] -# DISASM: 140001038: ad0287e0 stp q0, q1, [sp, #80] -# DISASM: 14000103c: ad038fe2 stp q2, q3, [sp, #112] -# DISASM: 140001040: ad0497e4 stp q4, q5, [sp, #144] -# DISASM: 140001044: ad059fe6 stp q6, q7, [sp, #176] -# DISASM: 140001048: aa1103e1 mov x1, x17 -# DISASM: 14000104c: b0000000 adrp x0, 0x140002000 -# DISASM: 140001050: 91000000 add x0, x0, #0 -# DISASM: 140001054: 90000002 adrp x2, 0x140001000 <.text> -# DISASM: 140001058: 91000042 add x2, x2, #0 -# DISASM: 14000105c: d63f0040 blr x2 -# DISASM: 140001060: aa0003f0 mov x16, x0 -# DISASM: 140001064: ad459fe6 ldp q6, q7, [sp, #176] -# DISASM: 140001068: ad4497e4 ldp q4, q5, [sp, #144] -# DISASM: 14000106c: ad438fe2 ldp q2, q3, [sp, #112] -# DISASM: 140001070: ad4287e0 ldp q0, q1, [sp, #80] -# DISASM: 140001074: a9441fe6 ldp x6, x7, [sp, #64] -# DISASM: 140001078: a94317e4 ldp x4, x5, [sp, #48] -# DISASM: 14000107c: a9420fe2 ldp x2, x3, [sp, #32] -# DISASM: 140001080: a94107e0 ldp x0, x1, [sp, #16] -# DISASM: 140001084: a8cd7bfd ldp x29, x30, [sp], #208 -# DISASM: 140001088: d61f0200 br x16 +# DISASM: 140001038: f9002be8 str x8, [sp, #80] +# DISASM: 14000103c: ad0307e0 stp q0, q1, [sp, #96] +# DISASM: 140001040: ad040fe2 stp q2, q3, [sp, #128] +# DISASM: 140001044: ad0517e4 stp q4, q5, [sp, #160] +# DISASM: 140001048: ad061fe6 stp q6, q7, [sp, #192] +# DISASM: 14000104c: aa1103e1 mov x1, x17 +# DISASM: 140001050: b0000000 adrp x0, 0x140002000 +# DISASM: 140001054: 91000000 add x0, x0, #0 +# DISASM: 140001058: 90000002 adrp x2, 0x140001000 <.text> +# DISASM: 14000105c: 91000042 add x2, x2, #0 +# DISASM: 140001060: d63f0040 blr x2 +# DISASM: 140001064: aa0003f0 mov x16, x0 +# DISASM: 140001068: ad461fe6 ldp q6, q7, [sp, #192] +# DISASM: 14000106c: ad4517e4 ldp q4, q5, [sp, #160] +# DISASM: 140001070: ad440fe2 ldp q2, q3, [sp, #128] +# DISASM: 140001074: ad4307e0 ldp q0, q1, [sp, #96] +# DISASM: 140001078: f9402be8 ldr x8, [sp, #80] +# DISASM: 14000107c: a9441fe6 ldp x6, x7, [sp, #64] +# DISASM: 140001080: a94317e4 ldp x4, x5, [sp, #48] +# DISASM: 140001084: a9420fe2 ldp x2, x3, [sp, #32] +# DISASM: 140001088: a94107e0 ldp x0, x1, [sp, #16] +# DISASM: 14000108c: a8ce7bfd ldp x29, x30, [sp], #224 +# DISASM: 140001090: d61f0200 br x16 # IMPORTS: Format: COFF-ARM64 # IMPORTS: Arch: aarch64 diff --git a/lld/test/COFF/arm64x-delayimport.test b/lld/test/COFF/arm64x-delayimport.test index e22cc6d5c42f..e705fb0efc45 100644 --- a/lld/test/COFF/arm64x-delayimport.test +++ b/lld/test/COFF/arm64x-delayimport.test @@ -61,33 +61,35 @@ DISASM-NEXT: 180001010: d61f0200 br x16 DISASM-NEXT: 180001014: b0000031 adrp x17, 0x180006000 DISASM-NEXT: 180001018: 91022231 add x17, x17, #0x88 DISASM-NEXT: 18000101c: 14000001 b 0x180001020 <.text+0x20> -DISASM-NEXT: 180001020: a9b37bfd stp x29, x30, [sp, #-0xd0]! +DISASM-NEXT: 180001020: a9b27bfd stp x29, x30, [sp, #-0xe0]! DISASM-NEXT: 180001024: 910003fd mov x29, sp DISASM-NEXT: 180001028: a90107e0 stp x0, x1, [sp, #0x10] DISASM-NEXT: 18000102c: a9020fe2 stp x2, x3, [sp, #0x20] DISASM-NEXT: 180001030: a90317e4 stp x4, x5, [sp, #0x30] DISASM-NEXT: 180001034: a9041fe6 stp x6, x7, [sp, #0x40] -DISASM-NEXT: 180001038: ad0287e0 stp q0, q1, [sp, #0x50] -DISASM-NEXT: 18000103c: ad038fe2 stp q2, q3, [sp, #0x70] -DISASM-NEXT: 180001040: ad0497e4 stp q4, q5, [sp, #0x90] -DISASM-NEXT: 180001044: ad059fe6 stp q6, q7, [sp, #0xb0] -DISASM-NEXT: 180001048: aa1103e1 mov x1, x17 -DISASM-NEXT: 18000104c: f0000000 adrp x0, 0x180004000 -DISASM-NEXT: 180001050: 910d2000 add x0, x0, #0x348 -DISASM-NEXT: 180001054: 90000002 adrp x2, 0x180001000 <.text> -DISASM-NEXT: 180001058: 91000042 add x2, x2, #0x0 -DISASM-NEXT: 18000105c: d63f0040 blr x2 -DISASM-NEXT: 180001060: aa0003f0 mov x16, x0 -DISASM-NEXT: 180001064: ad459fe6 ldp q6, q7, [sp, #0xb0] -DISASM-NEXT: 180001068: ad4497e4 ldp q4, q5, [sp, #0x90] -DISASM-NEXT: 18000106c: ad438fe2 ldp q2, q3, [sp, #0x70] -DISASM-NEXT: 180001070: ad4287e0 ldp q0, q1, [sp, #0x50] -DISASM-NEXT: 180001074: a9441fe6 ldp x6, x7, [sp, #0x40] -DISASM-NEXT: 180001078: a94317e4 ldp x4, x5, [sp, #0x30] -DISASM-NEXT: 18000107c: a9420fe2 ldp x2, x3, [sp, #0x20] -DISASM-NEXT: 180001080: a94107e0 ldp x0, x1, [sp, #0x10] -DISASM-NEXT: 180001084: a8cd7bfd ldp x29, x30, [sp], #0xd0 -DISASM-NEXT: 180001088: d61f0200 br x16 +DISASM-NEXT: 180001038: f9002be8 str x8, [sp, #0x50] +DISASM-NEXT: 18000103c: ad0307e0 stp q0, q1, [sp, #0x60] +DISASM-NEXT: 180001040: ad040fe2 stp q2, q3, [sp, #0x80] +DISASM-NEXT: 180001044: ad0517e4 stp q4, q5, [sp, #0xa0] +DISASM-NEXT: 180001048: ad061fe6 stp q6, q7, [sp, #0xc0] +DISASM-NEXT: 18000104c: aa1103e1 mov x1, x17 +DISASM-NEXT: 180001050: f0000000 adrp x0, 0x180004000 +DISASM-NEXT: 180001054: 910d2000 add x0, x0, #0x348 +DISASM-NEXT: 180001058: 90000002 adrp x2, 0x180001000 <.text> +DISASM-NEXT: 18000105c: 91000042 add x2, x2, #0x0 +DISASM-NEXT: 180001060: d63f0040 blr x2 +DISASM-NEXT: 180001064: aa0003f0 mov x16, x0 +DISASM-NEXT: 180001068: ad461fe6 ldp q6, q7, [sp, #0xc0] +DISASM-NEXT: 18000106c: ad4517e4 ldp q4, q5, [sp, #0xa0] +DISASM-NEXT: 180001070: ad440fe2 ldp q2, q3, [sp, #0x80] +DISASM-NEXT: 180001074: ad4307e0 ldp q0, q1, [sp, #0x60] +DISASM-NEXT: 180001078: f9402be8 ldr x8, [sp, #0x50] +DISASM-NEXT: 18000107c: a9441fe6 ldp x6, x7, [sp, #0x40] +DISASM-NEXT: 180001080: a94317e4 ldp x4, x5, [sp, #0x30] +DISASM-NEXT: 180001084: a9420fe2 ldp x2, x3, [sp, #0x20] +DISASM-NEXT: 180001088: a94107e0 ldp x0, x1, [sp, #0x10] +DISASM-NEXT: 18000108c: a8ce7bfd ldp x29, x30, [sp], #0xe0 +DISASM-NEXT: 180001090: d61f0200 br x16 DISASM-NEXT: ... DISASM-NEXT: 180002000: 52800040 mov w0, #0x2 // =2 DISASM-NEXT: 180002004: d65f03c0 ret @@ -186,33 +188,35 @@ NATIVE-DISASM-NEXT: 180001010: d61f0200 br x16 NATIVE-DISASM-NEXT: 180001014: 90000031 adrp x17, 0x180005000 NATIVE-DISASM-NEXT: 180001018: 91022231 add x17, x17, #0x88 NATIVE-DISASM-NEXT: 18000101c: 14000001 b 0x180001020 <.text+0x20> -NATIVE-DISASM-NEXT: 180001020: a9b37bfd stp x29, x30, [sp, #-0xd0]! +NATIVE-DISASM-NEXT: 180001020: a9b27bfd stp x29, x30, [sp, #-0xe0]! NATIVE-DISASM-NEXT: 180001024: 910003fd mov x29, sp NATIVE-DISASM-NEXT: 180001028: a90107e0 stp x0, x1, [sp, #0x10] NATIVE-DISASM-NEXT: 18000102c: a9020fe2 stp x2, x3, [sp, #0x20] NATIVE-DISASM-NEXT: 180001030: a90317e4 stp x4, x5, [sp, #0x30] NATIVE-DISASM-NEXT: 180001034: a9041fe6 stp x6, x7, [sp, #0x40] -NATIVE-DISASM-NEXT: 180001038: ad0287e0 stp q0, q1, [sp, #0x50] -NATIVE-DISASM-NEXT: 18000103c: ad038fe2 stp q2, q3, [sp, #0x70] -NATIVE-DISASM-NEXT: 180001040: ad0497e4 stp q4, q5, [sp, #0x90] -NATIVE-DISASM-NEXT: 180001044: ad059fe6 stp q6, q7, [sp, #0xb0] -NATIVE-DISASM-NEXT: 180001048: aa1103e1 mov x1, x17 -NATIVE-DISASM-NEXT: 18000104c: d0000000 adrp x0, 0x180003000 -NATIVE-DISASM-NEXT: 180001050: 910cc000 add x0, x0, #0x330 -NATIVE-DISASM-NEXT: 180001054: 90000002 adrp x2, 0x180001000 <.text> -NATIVE-DISASM-NEXT: 180001058: 91000042 add x2, x2, #0x0 -NATIVE-DISASM-NEXT: 18000105c: d63f0040 blr x2 -NATIVE-DISASM-NEXT: 180001060: aa0003f0 mov x16, x0 -NATIVE-DISASM-NEXT: 180001064: ad459fe6 ldp q6, q7, [sp, #0xb0] -NATIVE-DISASM-NEXT: 180001068: ad4497e4 ldp q4, q5, [sp, #0x90] -NATIVE-DISASM-NEXT: 18000106c: ad438fe2 ldp q2, q3, [sp, #0x70] -NATIVE-DISASM-NEXT: 180001070: ad4287e0 ldp q0, q1, [sp, #0x50] -NATIVE-DISASM-NEXT: 180001074: a9441fe6 ldp x6, x7, [sp, #0x40] -NATIVE-DISASM-NEXT: 180001078: a94317e4 ldp x4, x5, [sp, #0x30] -NATIVE-DISASM-NEXT: 18000107c: a9420fe2 ldp x2, x3, [sp, #0x20] -NATIVE-DISASM-NEXT: 180001080: a94107e0 ldp x0, x1, [sp, #0x10] -NATIVE-DISASM-NEXT: 180001084: a8cd7bfd ldp x29, x30, [sp], #0xd0 -NATIVE-DISASM-NEXT: 180001088: d61f0200 br x16 +NATIVE-DISASM-NEXT: 180001038: f9002be8 str x8, [sp, #0x50] +NATIVE-DISASM-NEXT: 18000103c: ad0307e0 stp q0, q1, [sp, #0x60] +NATIVE-DISASM-NEXT: 180001040: ad040fe2 stp q2, q3, [sp, #0x80] +NATIVE-DISASM-NEXT: 180001044: ad0517e4 stp q4, q5, [sp, #0xa0] +NATIVE-DISASM-NEXT: 180001048: ad061fe6 stp q6, q7, [sp, #0xc0] +NATIVE-DISASM-NEXT: 18000104c: aa1103e1 mov x1, x17 +NATIVE-DISASM-NEXT: 180001050: d0000000 adrp x0, 0x180003000 +NATIVE-DISASM-NEXT: 180001054: 910cc000 add x0, x0, #0x330 +NATIVE-DISASM-NEXT: 180001058: 90000002 adrp x2, 0x180001000 <.text> +NATIVE-DISASM-NEXT: 18000105c: 91000042 add x2, x2, #0x0 +NATIVE-DISASM-NEXT: 180001060: d63f0040 blr x2 +NATIVE-DISASM-NEXT: 180001064: aa0003f0 mov x16, x0 +NATIVE-DISASM-NEXT: 180001068: ad461fe6 ldp q6, q7, [sp, #0xc0] +NATIVE-DISASM-NEXT: 18000106c: ad4517e4 ldp q4, q5, [sp, #0xa0] +NATIVE-DISASM-NEXT: 180001070: ad440fe2 ldp q2, q3, [sp, #0x80] +NATIVE-DISASM-NEXT: 180001074: ad4307e0 ldp q0, q1, [sp, #0x60] +NATIVE-DISASM-NEXT: 180001078: f9402be8 ldr x8, [sp, #0x50] +NATIVE-DISASM-NEXT: 18000107c: a9441fe6 ldp x6, x7, [sp, #0x40] +NATIVE-DISASM-NEXT: 180001080: a94317e4 ldp x4, x5, [sp, #0x30] +NATIVE-DISASM-NEXT: 180001084: a9420fe2 ldp x2, x3, [sp, #0x20] +NATIVE-DISASM-NEXT: 180001088: a94107e0 ldp x0, x1, [sp, #0x10] +NATIVE-DISASM-NEXT: 18000108c: a8ce7bfd ldp x29, x30, [sp], #0xe0 +NATIVE-DISASM-NEXT: 180001090: d61f0200 br x16 RUN: llvm-readobj --coff-load-config out-native.dll | FileCheck --check-prefix=NATIVE-LOADCFG %s NATIVE-LOADCFG: AuxiliaryDelayloadIAT: 0x4000 diff --git a/lldb/bindings/python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig index 88b6cd9ef6b6..715914fe745f 100644 --- a/lldb/bindings/python/python-typemaps.swig +++ b/lldb/bindings/python/python-typemaps.swig @@ -233,6 +233,11 @@ AND call SWIG_fail at the same time, because it will result in a double free. } +// For lldb::SBFileSpec::GetPath +%typemap(in) (char *dst_path, size_t dst_len) = (char *dst_or_null, size_t dst_len); +%typemap(argout) (char *dst_path, size_t dst_len) = (char *dst_or_null, size_t dst_len); + + // typemap for an outgoing buffer // See also SBEvent::SBEvent(uint32_t event, const char *cstr, uint32_t cstr_len). // Ditto for SBProcess::PutSTDIN(const char *src, size_t src_len). diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index c6d15fc6be0a..252bee2b5d72 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -52,6 +52,14 @@ static ConstString GetSymbolOrFunctionName(const SymbolContext &sym_ctx) { return ConstString(); } +static bool CallFrameAddressIsValid(ABISP abi_sp, lldb::addr_t cfa) { + if (cfa == LLDB_INVALID_ADDRESS) + return false; + if (abi_sp) + return abi_sp->CallFrameAddressIsValid(cfa); + return cfa != 0 && cfa != 1; +} + RegisterContextUnwind::RegisterContextUnwind(Thread &thread, const SharedPtr &next_frame, SymbolContext &sym_ctx, @@ -448,7 +456,7 @@ void RegisterContextUnwind::InitializeNonZerothFrame() { ReadFrameAddress(row_register_kind, row->GetAFAValue(), m_afa); // A couple of sanity checks.. - if (m_cfa == LLDB_INVALID_ADDRESS || m_cfa == 0 || m_cfa == 1) { + if (!CallFrameAddressIsValid(abi_sp, m_cfa)) { UnwindLogMsg("could not find a valid cfa address"); m_frame_type = eNotAValidFrame; return; @@ -1847,9 +1855,11 @@ bool RegisterContextUnwind::TryFallbackUnwindPlan() { active_row->GetCFAValue().GetValueType() != UnwindPlan::Row::FAValue::unspecified) { addr_t new_cfa; + ProcessSP process_sp = m_thread.GetProcess(); + ABISP abi_sp = process_sp ? process_sp->GetABI() : nullptr; if (!ReadFrameAddress(m_fallback_unwind_plan_sp->GetRegisterKind(), - active_row->GetCFAValue(), new_cfa) || - new_cfa == 0 || new_cfa == 1 || new_cfa == LLDB_INVALID_ADDRESS) { + active_row->GetCFAValue(), new_cfa) || + !CallFrameAddressIsValid(abi_sp, new_cfa)) { UnwindLogMsg("failed to get cfa with fallback unwindplan"); m_fallback_unwind_plan_sp.reset(); m_full_unwind_plan_sp = original_full_unwind_plan_sp; @@ -1870,10 +1880,9 @@ bool RegisterContextUnwind::TryFallbackUnwindPlan() { if (ReadRegisterValueFromRegisterLocation(regloc, reg_info, reg_value)) { new_caller_pc_value = reg_value.GetAsUInt64(); - if (ProcessSP process_sp = m_thread.GetProcess()) { - if (ABISP abi_sp = process_sp->GetABI()) - new_caller_pc_value = abi_sp->FixCodeAddress(new_caller_pc_value); - } + if (process_sp) + new_caller_pc_value = + process_sp->FixCodeAddress(new_caller_pc_value); } } } @@ -1932,9 +1941,11 @@ bool RegisterContextUnwind::ForceSwitchToFallbackUnwindPlan() { active_row->GetCFAValue().GetValueType() != UnwindPlan::Row::FAValue::unspecified) { addr_t new_cfa; + ProcessSP process_sp = m_thread.GetProcess(); + ABISP abi_sp = process_sp ? process_sp->GetABI() : nullptr; if (!ReadFrameAddress(m_fallback_unwind_plan_sp->GetRegisterKind(), - active_row->GetCFAValue(), new_cfa) || - new_cfa == 0 || new_cfa == 1 || new_cfa == LLDB_INVALID_ADDRESS) { + active_row->GetCFAValue(), new_cfa) || + !CallFrameAddressIsValid(abi_sp, new_cfa)) { UnwindLogMsg("failed to get cfa with fallback unwindplan"); m_fallback_unwind_plan_sp.reset(); return false; @@ -2055,8 +2066,7 @@ bool RegisterContextUnwind::ReadFrameAddress( RegisterNumber cfa_reg(m_thread, row_register_kind, fa.GetRegisterNumber()); if (ReadGPRValue(cfa_reg, cfa_reg_contents)) { - if (cfa_reg_contents == LLDB_INVALID_ADDRESS || cfa_reg_contents == 0 || - cfa_reg_contents == 1) { + if (!CallFrameAddressIsValid(abi_sp, cfa_reg_contents)) { UnwindLogMsg( "Got an invalid CFA register value - reg %s (%d), value 0x%" PRIx64, cfa_reg.GetName(), cfa_reg.GetAsKind(eRegisterKindLLDB), diff --git a/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py b/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py index 10cbd26328f4..fc7bfe43e205 100644 --- a/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py +++ b/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py @@ -12,7 +12,7 @@ from lldbsuite.test import lldbutil class TestCortexMExceptionUnwind(TestBase): NO_DEBUG_INFO_TESTCASE = True - @skipIfRemote + @skipIfLLVMTargetMissing("ARM") def test_no_fpu(self): """Test that we can backtrace correctly through an ARM Cortex-M Exception return stack""" diff --git a/lldb/test/API/python_api/default-constructor/sb_filespec.py b/lldb/test/API/python_api/default-constructor/sb_filespec.py index 4ab5c49c37eb..5dd78b1ace98 100644 --- a/lldb/test/API/python_api/default-constructor/sb_filespec.py +++ b/lldb/test/API/python_api/default-constructor/sb_filespec.py @@ -10,5 +10,5 @@ def fuzz_obj(obj): obj.ResolveExecutableLocation() obj.GetFilename() obj.GetDirectory() - obj.GetPath(None, 0) + obj.GetPath(1) obj.GetDescription(lldb.SBStream()) diff --git a/llvm/include/llvm/ADT/StringExtras.h b/llvm/include/llvm/ADT/StringExtras.h index 7d81c63485be..2440e7678a83 100644 --- a/llvm/include/llvm/ADT/StringExtras.h +++ b/llvm/include/llvm/ADT/StringExtras.h @@ -529,13 +529,15 @@ inline std::string join_items(Sep Separator, Args &&... Items) { class ListSeparator { bool First = true; StringRef Separator; + StringRef Prefix; public: - ListSeparator(StringRef Separator = ", ") : Separator(Separator) {} + ListSeparator(StringRef Separator = ", ", StringRef Prefix = "") + : Separator(Separator), Prefix(Prefix) {} operator StringRef() { if (First) { First = false; - return {}; + return Prefix; } return Separator; } diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 8876e4ed6ae4..e5a6c8cc0a6a 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -2316,10 +2316,6 @@ private: /// an add rec on said loop. void getUsedLoops(const SCEV *S, SmallPtrSetImpl<const Loop *> &LoopsUsed); - /// Try to match the pattern generated by getURemExpr(A, B). If successful, - /// Assign A and B to LHS and RHS, respectively. - LLVM_ABI bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS); - /// Look for a SCEV expression with type `SCEVType` and operands `Ops` in /// `UniqueSCEVs`. Return if found, else nullptr. SCEV *findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef<const SCEV *> Ops); diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h index 07a482d4f166..871028de3163 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h +++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h @@ -252,6 +252,80 @@ m_scev_UDiv(const Op0_t &Op0, const Op1_t &Op1) { return m_scev_Binary<SCEVUDivExpr>(Op0, Op1); } +/// Match unsigned remainder pattern. +/// Matches patterns generated by getURemExpr. +template <typename Op0_t, typename Op1_t> struct SCEVURem_match { + Op0_t Op0; + Op1_t Op1; + ScalarEvolution &SE; + + SCEVURem_match(Op0_t Op0, Op1_t Op1, ScalarEvolution &SE) + : Op0(Op0), Op1(Op1), SE(SE) {} + + bool match(const SCEV *Expr) const { + if (Expr->getType()->isPointerTy()) + return false; + + // Try to match 'zext (trunc A to iB) to iY', which is used + // for URem with constant power-of-2 second operands. Make sure the size of + // the operand A matches the size of the whole expressions. + const SCEV *LHS; + if (SCEVPatternMatch::match(Expr, m_scev_ZExt(m_scev_Trunc(m_SCEV(LHS))))) { + Type *TruncTy = cast<SCEVZeroExtendExpr>(Expr)->getOperand()->getType(); + // Bail out if the type of the LHS is larger than the type of the + // expression for now. + if (SE.getTypeSizeInBits(LHS->getType()) > + SE.getTypeSizeInBits(Expr->getType())) + return false; + if (LHS->getType() != Expr->getType()) + LHS = SE.getZeroExtendExpr(LHS, Expr->getType()); + const SCEV *RHS = + SE.getConstant(APInt(SE.getTypeSizeInBits(Expr->getType()), 1) + << SE.getTypeSizeInBits(TruncTy)); + return Op0.match(LHS) && Op1.match(RHS); + } + const auto *Add = dyn_cast<SCEVAddExpr>(Expr); + if (Add == nullptr || Add->getNumOperands() != 2) + return false; + + const SCEV *A = Add->getOperand(1); + const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0)); + + if (Mul == nullptr) + return false; + + const auto MatchURemWithDivisor = [&](const SCEV *B) { + // (SomeExpr + (-(SomeExpr / B) * B)). + if (Expr == SE.getURemExpr(A, B)) + return Op0.match(A) && Op1.match(B); + return false; + }; + + // (SomeExpr + (-1 * (SomeExpr / B) * B)). + if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0))) + return MatchURemWithDivisor(Mul->getOperand(1)) || + MatchURemWithDivisor(Mul->getOperand(2)); + + // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)). + if (Mul->getNumOperands() == 2) + return MatchURemWithDivisor(Mul->getOperand(1)) || + MatchURemWithDivisor(Mul->getOperand(0)) || + MatchURemWithDivisor(SE.getNegativeSCEV(Mul->getOperand(1))) || + MatchURemWithDivisor(SE.getNegativeSCEV(Mul->getOperand(0))); + return false; + } +}; + +/// Match the mathematical pattern A - (A / B) * B, where A and B can be +/// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used +/// for URem with constant power-of-2 second operands. It's not always easy, as +/// A and B can be folded (imagine A is X / 2, and B is 4, A / B becomes X / 8). +template <typename Op0_t, typename Op1_t> +inline SCEVURem_match<Op0_t, Op1_t> m_scev_URem(Op0_t LHS, Op1_t RHS, + ScalarEvolution &SE) { + return SCEVURem_match<Op0_t, Op1_t>(LHS, RHS, SE); +} + inline class_match<const Loop> m_Loop() { return class_match<const Loop>(); } /// Match an affine SCEVAddRecExpr. diff --git a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h index fa21eba1377d..f06e7ceaa74c 100644 --- a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h +++ b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h @@ -10,6 +10,24 @@ namespace llvm { +namespace memprof { +// Represents the eligibility status of a global variable for section prefix +// annotation. Other than AnnotationOk, each enum value indicates a specific +// reason for ineligibility. +enum class AnnotationKind : uint8_t { + AnnotationOK, + DeclForLinker, + ExplicitSection, + ReservedName, +}; +/// Returns the annotation kind of the global variable \p GV. +AnnotationKind getAnnotationKind(const GlobalVariable &GV); + +/// Returns true if the annotation kind of the global variable \p GV is +/// AnnotationOK. +bool IsAnnotationOK(const GlobalVariable &GV); +} // namespace memprof + /// A class that holds the constants that represent static data and their /// profile information and provides methods to operate on them. class StaticDataProfileInfo { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 1694a33510d7..46b3d53a4b40 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -472,7 +472,7 @@ __OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentPtr, KernelLaunchEn __OMP_RTL(__kmpc_target_deinit, false, Void,) __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr) __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32, - FuncPtrTy, VoidPtr, VoidPtrPtr, SizeTy) + FuncPtrTy, FuncPtrTy, VoidPtrPtr, SizeTy) __OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8) __OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8) __OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 00c3dbbf3e80..3fab6b0572cb 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -1774,7 +1774,7 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, { const SCEV *LHS; const SCEV *RHS; - if (matchURem(Op, LHS, RHS)) + if (match(Op, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), *this))) return getURemExpr(getZeroExtendExpr(LHS, Ty, Depth + 1), getZeroExtendExpr(RHS, Ty, Depth + 1)); } @@ -2699,17 +2699,12 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, } // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y) - if (Ops.size() == 2) { - const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[0]); - if (Mul && Mul->getNumOperands() == 2 && - Mul->getOperand(0)->isAllOnesValue()) { - const SCEV *X; - const SCEV *Y; - if (matchURem(Mul->getOperand(1), X, Y) && X == Ops[1]) { - return getMulExpr(Y, getUDivExpr(X, Y)); - } - } - } + const SCEV *Y; + if (Ops.size() == 2 && + match(Ops[0], + m_scev_Mul(m_scev_AllOnes(), + m_scev_URem(m_scev_Specific(Ops[1]), m_SCEV(Y), *this)))) + return getMulExpr(Y, getUDivExpr(Ops[1], Y)); // Skip past any other cast SCEVs. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr) @@ -15410,65 +15405,6 @@ void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const { } } -// Match the mathematical pattern A - (A / B) * B, where A and B can be -// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used -// for URem with constant power-of-2 second operands. -// It's not always easy, as A and B can be folded (imagine A is X / 2, and B is -// 4, A / B becomes X / 8). -bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS, - const SCEV *&RHS) { - if (Expr->getType()->isPointerTy()) - return false; - - // Try to match 'zext (trunc A to iB) to iY', which is used - // for URem with constant power-of-2 second operands. Make sure the size of - // the operand A matches the size of the whole expressions. - if (match(Expr, m_scev_ZExt(m_scev_Trunc(m_SCEV(LHS))))) { - Type *TruncTy = cast<SCEVZeroExtendExpr>(Expr)->getOperand()->getType(); - // Bail out if the type of the LHS is larger than the type of the - // expression for now. - if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(Expr->getType())) - return false; - if (LHS->getType() != Expr->getType()) - LHS = getZeroExtendExpr(LHS, Expr->getType()); - RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1) - << getTypeSizeInBits(TruncTy)); - return true; - } - const auto *Add = dyn_cast<SCEVAddExpr>(Expr); - if (Add == nullptr || Add->getNumOperands() != 2) - return false; - - const SCEV *A = Add->getOperand(1); - const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0)); - - if (Mul == nullptr) - return false; - - const auto MatchURemWithDivisor = [&](const SCEV *B) { - // (SomeExpr + (-(SomeExpr / B) * B)). - if (Expr == getURemExpr(A, B)) { - LHS = A; - RHS = B; - return true; - } - return false; - }; - - // (SomeExpr + (-1 * (SomeExpr / B) * B)). - if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0))) - return MatchURemWithDivisor(Mul->getOperand(1)) || - MatchURemWithDivisor(Mul->getOperand(2)); - - // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)). - if (Mul->getNumOperands() == 2) - return MatchURemWithDivisor(Mul->getOperand(1)) || - MatchURemWithDivisor(Mul->getOperand(0)) || - MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(1))) || - MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(0))); - return false; -} - ScalarEvolution::LoopGuards ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { BasicBlock *Header = L->getHeader(); @@ -15689,20 +15625,18 @@ void ScalarEvolution::LoopGuards::collectFromBlock( if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) { // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to // explicitly express that. - const SCEV *URemLHS = nullptr; + const SCEVUnknown *URemLHS = nullptr; const SCEV *URemRHS = nullptr; - if (SE.matchURem(LHS, URemLHS, URemRHS)) { - if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) { - auto I = RewriteMap.find(LHSUnknown); - const SCEV *RewrittenLHS = - I != RewriteMap.end() ? I->second : LHSUnknown; - RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS); - const auto *Multiple = - SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS); - RewriteMap[LHSUnknown] = Multiple; - ExprsToRewrite.push_back(LHSUnknown); - return; - } + if (match(LHS, + m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) { + auto I = RewriteMap.find(URemLHS); + const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS; + RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS); + const auto *Multiple = + SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS); + RewriteMap[URemLHS] = Multiple; + ExprsToRewrite.push_back(URemLHS); + return; } } diff --git a/llvm/lib/Analysis/StaticDataProfileInfo.cpp b/llvm/lib/Analysis/StaticDataProfileInfo.cpp index b036b2dde770..1f751ee5e09d 100644 --- a/llvm/lib/Analysis/StaticDataProfileInfo.cpp +++ b/llvm/lib/Analysis/StaticDataProfileInfo.cpp @@ -6,6 +6,46 @@ #include "llvm/ProfileData/InstrProf.h" using namespace llvm; + +namespace llvm { +namespace memprof { +// Returns true iff the global variable has custom section either by +// __attribute__((section("name"))) +// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate) +// or #pragma clang section directives +// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section). +static bool hasExplicitSectionName(const GlobalVariable &GVar) { + if (GVar.hasSection()) + return true; + + auto Attrs = GVar.getAttributes(); + if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") || + Attrs.hasAttribute("relro-section") || + Attrs.hasAttribute("rodata-section")) + return true; + return false; +} + +AnnotationKind getAnnotationKind(const GlobalVariable &GV) { + if (GV.isDeclarationForLinker()) + return AnnotationKind::DeclForLinker; + // Skip 'llvm.'-prefixed global variables conservatively because they are + // often handled specially, + StringRef Name = GV.getName(); + if (Name.starts_with("llvm.")) + return AnnotationKind::ReservedName; + // Respect user-specified custom data sections. + if (hasExplicitSectionName(GV)) + return AnnotationKind::ExplicitSection; + return AnnotationKind::AnnotationOK; +} + +bool IsAnnotationOK(const GlobalVariable &GV) { + return getAnnotationKind(GV) == AnnotationKind::AnnotationOK; +} +} // namespace memprof +} // namespace llvm + void StaticDataProfileInfo::addConstantProfileCount( const Constant *C, std::optional<uint64_t> Count) { if (!Count) { diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 96428cd7ea84..1d54d7233686 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -862,48 +862,46 @@ static void printMI(raw_ostream &OS, MFPrintState &State, OS << TII->getName(MI.getOpcode()); - LS = ListSeparator(); + // Print a space after the opcode if any additional tokens are printed. + LS = ListSeparator(", ", " "); - if (I < E) { - OS << ' '; - for (; I < E; ++I) { - OS << LS; - printMIOperand(OS, State, MI, I, TRI, TII, ShouldPrintRegisterTies, - PrintedTypes, MRI, /*PrintDef=*/true); - } + for (; I < E; ++I) { + OS << LS; + printMIOperand(OS, State, MI, I, TRI, TII, ShouldPrintRegisterTies, + PrintedTypes, MRI, /*PrintDef=*/true); } // Print any optional symbols attached to this instruction as-if they were // operands. if (MCSymbol *PreInstrSymbol = MI.getPreInstrSymbol()) { - OS << LS << " pre-instr-symbol "; + OS << LS << "pre-instr-symbol "; MachineOperand::printSymbol(OS, *PreInstrSymbol); } if (MCSymbol *PostInstrSymbol = MI.getPostInstrSymbol()) { - OS << LS << " post-instr-symbol "; + OS << LS << "post-instr-symbol "; MachineOperand::printSymbol(OS, *PostInstrSymbol); } if (MDNode *HeapAllocMarker = MI.getHeapAllocMarker()) { - OS << LS << " heap-alloc-marker "; + OS << LS << "heap-alloc-marker "; HeapAllocMarker->printAsOperand(OS, State.MST); } if (MDNode *PCSections = MI.getPCSections()) { - OS << LS << " pcsections "; + OS << LS << "pcsections "; PCSections->printAsOperand(OS, State.MST); } if (MDNode *MMRA = MI.getMMRAMetadata()) { - OS << LS << " mmra "; + OS << LS << "mmra "; MMRA->printAsOperand(OS, State.MST); } if (uint32_t CFIType = MI.getCFIType()) - OS << LS << " cfi-type " << CFIType; + OS << LS << "cfi-type " << CFIType; if (auto Num = MI.peekDebugInstrNum()) - OS << LS << " debug-instr-number " << Num; + OS << LS << "debug-instr-number " << Num; if (PrintLocations) { if (const DebugLoc &DL = MI.getDebugLoc()) { - OS << LS << " debug-location "; + OS << LS << "debug-location "; DL->printAsOperand(OS, State.MST); } } diff --git a/llvm/lib/CodeGen/StaticDataAnnotator.cpp b/llvm/lib/CodeGen/StaticDataAnnotator.cpp index 53a9ab4dbda0..eac201201708 100644 --- a/llvm/lib/CodeGen/StaticDataAnnotator.cpp +++ b/llvm/lib/CodeGen/StaticDataAnnotator.cpp @@ -75,22 +75,11 @@ bool StaticDataAnnotator::runOnModule(Module &M) { bool Changed = false; for (auto &GV : M.globals()) { - if (GV.isDeclarationForLinker()) + if (!llvm::memprof::IsAnnotationOK(GV)) continue; - // The implementation below assumes prior passes don't set section prefixes, - // and specifically do 'assign' rather than 'update'. So report error if a - // section prefix is already set. - if (auto maybeSectionPrefix = GV.getSectionPrefix(); - maybeSectionPrefix && !maybeSectionPrefix->empty()) - llvm::report_fatal_error("Global variable " + GV.getName() + - " already has a section prefix " + - *maybeSectionPrefix); - StringRef SectionPrefix = SDPI->getConstantSectionPrefix(&GV, PSI); - if (SectionPrefix.empty()) - continue; - + // setSectionPrefix returns true if the section prefix is updated. Changed |= GV.setSectionPrefix(SectionPrefix); } diff --git a/llvm/lib/CodeGen/StaticDataSplitter.cpp b/llvm/lib/CodeGen/StaticDataSplitter.cpp index e22dc2507d54..1593a401bcb2 100644 --- a/llvm/lib/CodeGen/StaticDataSplitter.cpp +++ b/llvm/lib/CodeGen/StaticDataSplitter.cpp @@ -130,10 +130,8 @@ StaticDataSplitter::getConstant(const MachineOperand &Op, if (Op.isGlobal()) { // Find global variables with local linkage. const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal()); - // Skip 'llvm.'-prefixed global variables conservatively because they are - // often handled specially, and skip those not in static data - // sections. - if (!GV || GV->getName().starts_with("llvm.") || + // Skip those not eligible for annotation or not in static data sections. + if (!GV || !llvm::memprof::IsAnnotationOK(*GV) || !inStaticDataSection(*GV, TM)) return nullptr; return GV; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index dbe74b1b08f8..5700468e2420 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2394,15 +2394,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const { else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || TII->isTRANS(MI))) - Result = true; + Result = !MI.mayLoadOrStore(); else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && - TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) - Result = true; + TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) { + // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS). + // For our purposes, these shall not be classified as VALU as this results + // in unexpected behavior. + Result = !MI.mayLoadOrStore(); + } else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && TII->isSALU(MI)) - Result = true; + Result = !MI.mayLoadOrStore(); else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && TII->isMFMAorWMMA(MI)) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 278cf6dc1d78..1b7cb9bd2169 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -168,13 +168,13 @@ struct DemandedFields { // If this is true, we demand that VTYPE is set to some legal state, i.e. that // vill is unset. bool VILL = false; - bool UseTWiden = false; - bool UseAltFmt = false; + bool TWiden = false; + bool AltFmt = false; // Return true if any part of VTYPE was used bool usedVTYPE() const { return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL || - UseTWiden || UseAltFmt; + TWiden || AltFmt; } // Return true if any property of VL was used @@ -190,8 +190,8 @@ struct DemandedFields { TailPolicy = true; MaskPolicy = true; VILL = true; - UseTWiden = true; - UseAltFmt = true; + TWiden = true; + AltFmt = true; } // Mark all VL properties as demanded @@ -217,8 +217,8 @@ struct DemandedFields { TailPolicy |= B.TailPolicy; MaskPolicy |= B.MaskPolicy; VILL |= B.VILL; - UseAltFmt |= B.UseAltFmt; - UseTWiden |= B.UseTWiden; + AltFmt |= B.AltFmt; + TWiden |= B.TWiden; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -266,8 +266,8 @@ struct DemandedFields { OS << "TailPolicy=" << TailPolicy << ", "; OS << "MaskPolicy=" << MaskPolicy << ", "; OS << "VILL=" << VILL << ", "; - OS << "UseAltFmt=" << UseAltFmt << ", "; - OS << "UseTWiden=" << UseTWiden; + OS << "AltFmt=" << AltFmt << ", "; + OS << "TWiden=" << TWiden; OS << "}"; } #endif @@ -337,13 +337,13 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType, if (Used.MaskPolicy && RISCVVType::isMaskAgnostic(CurVType) != RISCVVType::isMaskAgnostic(NewVType)) return false; - if (Used.UseTWiden && (RISCVVType::hasXSfmmWiden(CurVType) != - RISCVVType::hasXSfmmWiden(NewVType) || - (RISCVVType::hasXSfmmWiden(CurVType) && - RISCVVType::getXSfmmWiden(CurVType) != - RISCVVType::getXSfmmWiden(NewVType)))) + if (Used.TWiden && (RISCVVType::hasXSfmmWiden(CurVType) != + RISCVVType::hasXSfmmWiden(NewVType) || + (RISCVVType::hasXSfmmWiden(CurVType) && + RISCVVType::getXSfmmWiden(CurVType) != + RISCVVType::getXSfmmWiden(NewVType)))) return false; - if (Used.UseAltFmt && + if (Used.AltFmt && RISCVVType::isAltFmt(CurVType) != RISCVVType::isAltFmt(NewVType)) return false; return true; @@ -497,10 +497,10 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) { Res.TailPolicy = false; } - Res.UseAltFmt = RISCVII::getAltFmtType(MI.getDesc().TSFlags) != - RISCVII::AltFmtType::DontCare; - Res.UseTWiden = RISCVII::hasTWidenOp(MI.getDesc().TSFlags) || - RISCVInstrInfo::isXSfmmVectorConfigInstr(MI); + Res.AltFmt = RISCVII::getAltFmtType(MI.getDesc().TSFlags) != + RISCVII::AltFmtType::DontCare; + Res.TWiden = RISCVII::hasTWidenOp(MI.getDesc().TSFlags) || + RISCVInstrInfo::isXSfmmVectorConfigInstr(MI); return Res; } @@ -1347,8 +1347,8 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info, IncomingInfo.getTailAgnostic(), (Demanded.MaskPolicy ? IncomingInfo : Info).getMaskAgnostic() || IncomingInfo.getMaskAgnostic(), - (Demanded.UseAltFmt ? IncomingInfo : Info).getAltFmt(), - Demanded.UseTWiden ? IncomingInfo.getTWiden() : 0); + (Demanded.AltFmt ? IncomingInfo : Info).getAltFmt(), + Demanded.TWiden ? IncomingInfo.getTWiden() : 0); // If we only knew the sew/lmul ratio previously, replace the VTYPE but keep // the AVL. @@ -1945,9 +1945,12 @@ bool RISCVInsertVSETVLI::insertVSETMTK(MachineBasicBlock &MBB, .addImm(Log2_32(CurrInfo.getTWiden()) + 1); Changed = true; + Register Reg = Op.getReg(); + Op.setReg(Register()); + Op.setIsKill(false); if (LIS) { LIS->InsertMachineInstrInMaps(*TmpMI); - LiveInterval &LI = LIS->getInterval(Op.getReg()); + LiveInterval &LI = LIS->getInterval(Reg); // Erase the AVL operand from the instruction. LIS->shrinkToUses(&LI); @@ -1955,9 +1958,6 @@ bool RISCVInsertVSETVLI::insertVSETMTK(MachineBasicBlock &MBB, // SmallVector<LiveInterval *> SplitLIs; // LIS->splitSeparateComponents(LI, SplitLIs); } - - Op.setReg(RISCV::NoRegister); - Op.setIsKill(false); } return Changed; } diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index c86092bd51ed..a6ec6c120776 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/StaticDataProfileInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" @@ -194,6 +195,30 @@ static bool isAllocationWithHotColdVariant(const Function *Callee, } } +static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar, + AnnotationKind Kind) { + assert(Kind != llvm::memprof::AnnotationKind::AnnotationOK && + "Should not handle AnnotationOK here"); + SmallString<32> Reason; + switch (Kind) { + case llvm::memprof::AnnotationKind::ExplicitSection: + ++NumOfMemProfExplicitSectionGlobalVars; + Reason.append("explicit section name"); + break; + case llvm::memprof::AnnotationKind::DeclForLinker: + Reason.append("linker declaration"); + break; + case llvm::memprof::AnnotationKind::ReservedName: + Reason.append("name starts with `llvm.`"); + break; + default: + llvm_unreachable("Unexpected annotation kind"); + } + LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to " + << Reason << ".\n"); + return; +} + struct AllocMatchInfo { uint64_t TotalSize = 0; AllocationType AllocType = AllocationType::None; @@ -775,29 +800,13 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { return PreservedAnalyses::none(); } -// Returns true iff the global variable has custom section either by -// __attribute__((section("name"))) -// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate) -// or #pragma clang section directives -// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section). -static bool hasExplicitSectionName(const GlobalVariable &GVar) { - if (GVar.hasSection()) - return true; - - auto Attrs = GVar.getAttributes(); - if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") || - Attrs.hasAttribute("relro-section") || - Attrs.hasAttribute("rodata-section")) - return true; - return false; -} - bool MemProfUsePass::annotateGlobalVariables( Module &M, const memprof::DataAccessProfData *DataAccessProf) { if (!AnnotateStaticDataSectionPrefix || M.globals().empty()) return false; if (!DataAccessProf) { + M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 0U); M.getContext().diagnose(DiagnosticInfoPGOProfile( MemoryProfileFileName.data(), StringRef("Data access profiles not found in memprof. Ignore " @@ -805,6 +814,7 @@ bool MemProfUsePass::annotateGlobalVariables( DS_Warning)); return false; } + M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 1U); bool Changed = false; // Iterate all global variables in the module and annotate them based on @@ -815,13 +825,9 @@ bool MemProfUsePass::annotateGlobalVariables( for (GlobalVariable &GVar : M.globals()) { assert(!GVar.getSectionPrefix().has_value() && "GVar shouldn't have section prefix yet"); - if (GVar.isDeclarationForLinker()) - continue; - - if (hasExplicitSectionName(GVar)) { - ++NumOfMemProfExplicitSectionGlobalVars; - LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName() - << " has explicit section name. Skip annotating.\n"); + auto Kind = llvm::memprof::getAnnotationKind(GVar); + if (Kind != llvm::memprof::AnnotationKind::AnnotationOK) { + HandleUnsupportedAnnotationKinds(GVar, Kind); continue; } @@ -831,7 +837,6 @@ bool MemProfUsePass::annotateGlobalVariables( // TODO: Track string content hash in the profiles and compute it inside the // compiler to categeorize the hotness string literals. if (Name.starts_with(".str")) { - LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n"); continue; } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 45cee1e7da62..9035e58a707c 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -526,7 +526,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { // Recognize the canonical representation of an unsimplifed urem. const SCEV *URemLHS = nullptr; const SCEV *URemRHS = nullptr; - if (SE.matchURem(S, URemLHS, URemRHS)) { + if (match(S, m_scev_URem(m_SCEV(URemLHS), m_SCEV(URemRHS), SE))) { Value *LHS = expand(URemLHS); Value *RHS = expand(URemRHS); return InsertBinop(Instruction::URem, LHS, RHS, SCEV::FlagAnyWrap, diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir new file mode 100644 index 000000000000..a4aad574aaaf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir @@ -0,0 +1,59 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s + +--- +name: buffer_load_lds_not_valu +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: buffer_load_lds_not_valu + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 0 + ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0 + ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 1 + ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0 + ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec + ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; CHECK-NEXT: S_ENDPGM 0 + $exec = IMPLICIT_DEF + %0:vgpr_32 = IMPLICIT_DEF + %1:sgpr_128 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0 + $m0 = S_MOV_B32 1 + BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0 + %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec + %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec + %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec + %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec + %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec + %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + SCHED_GROUP_BARRIER 2, 4, 0 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir index 1030917c8741..302f70fc1519 100644 --- a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir +++ b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck --strict-whitespace %s --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll index a0c243be0e7e..f3950b75a969 100644 --- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll +++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll @@ -1,16 +1,15 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -;; A minimal test case. llc will crash if global variables already has a section -;; prefix. Subsequent PRs will expand on this test case to test the hotness -;; reconciliation implementation. - -; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +;; A minimal test case. Subsequent PRs will expand on this test case +;; (e.g., with more functions, variables and profiles) and test the hotness +;; reconcillation implementation. +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ ; RUN: -partition-static-data-sections=true \ ; RUN: -data-sections=true -unique-section-names=false \ -; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=ERR +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=IR -; ERR: Global variable hot_bss already has a section prefix hot +; IR: .section .bss.hot.,"aw" @hot_bss = internal global i32 0, !section_prefix !17 diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll index ce06d1712f84..604b4fd5a96e 100644 --- a/llvm/test/CodeGen/X86/global-variable-partition.ll +++ b/llvm/test/CodeGen/X86/global-variable-partition.ll @@ -106,23 +106,31 @@ target triple = "x86_64-unknown-linux-gnu" ; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,8 ; AGG-NEXT: .section .data.unlikely.,"aw",@progbits +;; The `.section` directive is omitted for .data with -unique-section-names=false. +; See MCSectionELF::shouldOmitSectionDirective for the implementation details. + ; For @data_with_unknown_hotness ; SYM: .type .Ldata_with_unknown_hotness,@object # @data_with_unknown_hotness ; SYM: .section .data..Ldata_with_unknown_hotness,"aw",@progbits ; UNIQ: .section .data,"aw",@progbits,unique,9 -; The `.section` directive is omitted for .data with -unique-section-names=false. -; See MCSectionELF::shouldOmitSectionDirective for the implementation details. + ; AGG: .data ; COMMON: .Ldata_with_unknown_hotness: -; For @hot_data_custom_bar_section -; It has an explicit section attribute 'var' and shouldn't have hot or unlikely suffix. +; For variables that are not eligible for section prefix annotation ; COMMON: .type hot_data_custom_bar_section,@object ; SYM-NEXT: .section bar,"aw",@progbits ; SYM: hot_data_custom_bar_section ; UNIQ: .section bar,"aw",@progbits ; AGG: .section bar,"aw",@progbits +; SYM: .section .data.llvm.fake_var,"aw" +; UNIQ: .section .data,"aw" +; AGG: .data + +;; No section for linker declaration +; COMMON-NOT: qux + @.str = private unnamed_addr constant [5 x i8] c"hot\09\00", align 1 @.str.1 = private unnamed_addr constant [10 x i8] c"%d\09%d\09%d\0A\00", align 1 @hot_relro_array = internal constant [2 x ptr] [ptr @bss2, ptr @data3] @@ -137,6 +145,8 @@ target triple = "x86_64-unknown-linux-gnu" @data3 = internal global i32 3 @data_with_unknown_hotness = private global i32 5 @hot_data_custom_bar_section = internal global i32 101 #0 +@llvm.fake_var = internal global i32 123 +@qux = external global i64 define void @cold_func(i32 %0) !prof !15 { %2 = load i32, ptr @cold_bss diff --git a/llvm/test/Transforms/PGOProfile/data-access-profile.ll b/llvm/test/Transforms/PGOProfile/data-access-profile.ll index 29198f34ccbb..205184bdd715 100644 --- a/llvm/test/Transforms/PGOProfile/data-access-profile.ll +++ b/llvm/test/Transforms/PGOProfile/data-access-profile.ll @@ -3,55 +3,72 @@ ; RUN: rm -rf %t && split-file %s %t && cd %t -;; Read a text profile and merge it into indexed profile. +;; Read text profiles and merge them into indexed profiles. ; RUN: llvm-profdata merge --memprof-version=4 memprof.yaml -o memprof.profdata +; RUN: llvm-profdata merge --memprof-version=4 memprof-no-dap.yaml -o memprof-no-dap.profdata ;; Run optimizer pass on an IR module without IR functions, and test that global ;; variables in the module could be annotated (i.e., no early return), ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \ -; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT +; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT ;; Run optimizer pass on the IR, and check the section prefix. ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \ -; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT +; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT -;; Run optimizer pass without explicitly setting -memprof-annotate-static-data-prefix. -;; The output text IR shouldn't have `section_prefix` +;; Run memprof without providing memprof data. Test that IR has module flag +;; `EnableDataAccessProf` as 0. +; RUN: opt -passes='memprof-use<profile-filename=memprof-no-dap.profdata>' -memprof-annotate-static-data-prefix \ +; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefix=FLAG + +;; Run memprof without explicitly setting -memprof-annotate-static-data-prefix. +;; The output text IR shouldn't have `section_prefix` or EnableDataAccessProf module flag. ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' \ -; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --implicit-check-not="section_prefix" +; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --check-prefix=FLAGLESS --implicit-check-not="section_prefix" ; LOG: Skip annotating string literal .str ; LOG: Global variable var1 is annotated as hot ; LOG: Global variable var2.llvm.125 is annotated as hot ; LOG: Global variable bar is not annotated ; LOG: Global variable foo is annotated as unlikely -; LOG: Global variable var3 has explicit section name. Skip annotating. -; LOG: Global variable var4 has explicit section name. Skip annotating. +; LOG: Skip annotation for var3 due to explicit section name. +; LOG: Skip annotation for var4 due to explicit section name. +; LOG: Skip annotation for llvm.fake_var due to name starts with `llvm.`. +; LOG: Skip annotation for qux due to linker declaration. ;; String literals are not annotated. -; PREFIX: @.str = unnamed_addr constant [5 x i8] c"abcde" -; PREFIX-NOT: section_prefix -; PREFIX: @var1 = global i32 123, !section_prefix !0 +; IR: @.str = unnamed_addr constant [5 x i8] c"abcde" +; IR-NOT: section_prefix +; IR: @var1 = global i32 123, !section_prefix !0 ;; @var.llvm.125 will be canonicalized to @var2 for profile look-up. -; PREFIX-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0 +; IR-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0 ;; @bar is not seen in hot symbol or known symbol set, so it won't get a section ;; prefix. Test this by testing that there is no section_prefix between @bar and ;; @foo. -; PREFIX-NEXT: @bar = global i16 3 -; PREFIX-NOT: !section_prefix +; IR-NEXT: @bar = global i16 3 +; IR-NOT: !section_prefix ;; @foo is unlikely. -; PREFIX-NEXT: @foo = global i8 2, !section_prefix !1 +; IR-NEXT: @foo = global i8 2, !section_prefix !1 + +; IR-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1" +; IR-NEXT: @var4 = constant [1 x i64] [i64 98765] #0 + +; IR: @llvm.fake_var = global i32 123 +; IR-NOT: !section_prefix +; IR: @qux = external global i64 +; IR-NOT: !section_prefix -; PREFIX-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1" -; PREFIX-NEXT: @var4 = constant [1 x i64] [i64 98765] #0 +; IR: attributes #0 = { "rodata-section"="sec2" } -; PREFIX: attributes #0 = { "rodata-section"="sec2" } +; IR: !0 = !{!"section_prefix", !"hot"} +; IR-NEXT: !1 = !{!"section_prefix", !"unlikely"} +; IR-NEXT: !2 = !{i32 2, !"EnableDataAccessProf", i32 1} -; PREFIX: !0 = !{!"section_prefix", !"hot"} -; PREFIX-NEXT: !1 = !{!"section_prefix", !"unlikely"} +; FLAG: !{i32 2, !"EnableDataAccessProf", i32 0} +; FLAGLESS-NOT: EnableDataAccessProf ; STAT: 1 memprof - Number of global vars annotated with 'unlikely' section prefix. ; STAT: 2 memprof - Number of global vars with user-specified section (not annotated). @@ -72,6 +89,24 @@ DataAccessProfiles: - foo KnownColdStrHashes: [ 999, 1001 ] ... +;--- memprof-no-dap.yaml +--- +# A memprof file with without data access profiles. The heap records are simplified +# to pass profile parsing and don't need to match the IR. +HeapProfileRecords: + - GUID: 0xdeadbeef12345678 + AllocSites: + - Callstack: + - { Function: 0x1111111111111111, LineOffset: 11, Column: 10, IsInlineFrame: true } + MemInfoBlock: + AllocCount: 111 + TotalSize: 222 + TotalLifetime: 333 + TotalLifetimeAccessDensity: 444 + CallSites: + - Frames: + - { Function: 0x5555555555555555, LineOffset: 55, Column: 50, IsInlineFrame: true } +... ;--- input.ll target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" @@ -84,11 +119,14 @@ target triple = "x86_64-unknown-linux-gnu" @foo = global i8 2 @var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1" @var4 = constant [1 x i64][i64 98765] #0 +@llvm.fake_var = global i32 123 +@qux = external global i64 define i32 @func() { %a = load i32, ptr @var1 %b = load i32, ptr @var2.llvm.125 - %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b) + %c = load i32, ptr @llvm.fake_var + %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b, i32 %c) ret i32 %ret } @@ -108,5 +146,8 @@ target triple = "x86_64-unknown-linux-gnu" @foo = global i8 2 @var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1" @var4 = constant [1 x i64][i64 98765] #0 +@llvm.fake_var = global i32 123 +@qux = external global i64 + attributes #0 = { "rodata-section"="sec2" } diff --git a/llvm/unittests/ADT/StringExtrasTest.cpp b/llvm/unittests/ADT/StringExtrasTest.cpp index fbaed38da594..af88f88949b7 100644 --- a/llvm/unittests/ADT/StringExtrasTest.cpp +++ b/llvm/unittests/ADT/StringExtrasTest.cpp @@ -290,6 +290,12 @@ TEST(StringExtrasTest, ListSeparator) { EXPECT_EQ(S, ""); S = LS2; EXPECT_EQ(S, " "); + + ListSeparator LS3(",", "{"); + S = LS3; + EXPECT_EQ(S, "{"); + S = LS3; + EXPECT_EQ(S, ","); } TEST(StringExtrasTest, toStringAPInt) { diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp index 1a68823b4f25..5d7eded06a76 100644 --- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp +++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp @@ -11,6 +11,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Constants.h" @@ -26,6 +27,8 @@ namespace llvm { +using namespace SCEVPatternMatch; + // We use this fixture to ensure that we clean up ScalarEvolution before // deleting the PassManager. class ScalarEvolutionsTest : public testing::Test { @@ -64,11 +67,6 @@ static std::optional<APInt> computeConstantDifference(ScalarEvolution &SE, return SE.computeConstantDifference(LHS, RHS); } - static bool matchURem(ScalarEvolution &SE, const SCEV *Expr, const SCEV *&LHS, - const SCEV *&RHS) { - return SE.matchURem(Expr, LHS, RHS); - } - static bool isImpliedCond( ScalarEvolution &SE, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, @@ -1524,7 +1522,7 @@ TEST_F(ScalarEvolutionsTest, MatchURem) { auto *URemI = getInstructionByName(F, N); auto *S = SE.getSCEV(URemI); const SCEV *LHS, *RHS; - EXPECT_TRUE(matchURem(SE, S, LHS, RHS)); + EXPECT_TRUE(match(S, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE))); EXPECT_EQ(LHS, SE.getSCEV(URemI->getOperand(0))); EXPECT_EQ(RHS, SE.getSCEV(URemI->getOperand(1))); EXPECT_EQ(LHS->getType(), S->getType()); @@ -1537,7 +1535,7 @@ TEST_F(ScalarEvolutionsTest, MatchURem) { auto *URem1 = getInstructionByName(F, "rem4"); auto *S = SE.getSCEV(Ext); const SCEV *LHS, *RHS; - EXPECT_TRUE(matchURem(SE, S, LHS, RHS)); + EXPECT_TRUE(match(S, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE))); EXPECT_NE(LHS, SE.getSCEV(URem1->getOperand(0))); // RHS and URem1->getOperand(1) have different widths, so compare the // integer values. diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt index 8d9474bf3789..c301e0b40e8f 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt @@ -48,6 +48,10 @@ mlir_tablegen(LLVMIntrinsicFromLLVMIRConversions.inc -gen-intr-from-llvmir-conve mlir_tablegen(LLVMConvertibleLLVMIRIntrinsics.inc -gen-convertible-llvmir-intrinsics) add_mlir_dialect_tablegen_target(MLIRLLVMIntrinsicConversionsIncGen) +set(LLVM_TARGET_DEFINITIONS LLVMDialectBytecode.td) +mlir_tablegen(LLVMDialectBytecode.cpp.inc -gen-bytecode -bytecode-dialect="LLVM") +add_public_tablegen_target(MLIRLLVMDialectBytecodeIncGen) + set(LLVM_TARGET_DEFINITIONS BasicPtxBuilderInterface.td) mlir_tablegen(BasicPtxBuilderInterface.h.inc -gen-op-interface-decls) mlir_tablegen(BasicPtxBuilderInterface.cpp.inc -gen-op-interface-defs) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialectBytecode.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialectBytecode.td new file mode 100644 index 000000000000..e7b202cd4f63 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialectBytecode.td @@ -0,0 +1,353 @@ +//===-- LLVMDialectBytecode.td - LLVM bytecode defs --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the LLVM bytecode reader/writer definition file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DIALECT_BYTECODE +#define LLVM_DIALECT_BYTECODE + +include "mlir/IR/BytecodeBase.td" + +//===----------------------------------------------------------------------===// +// Bytecode classes for attributes and types. +//===----------------------------------------------------------------------===// + +def String : + WithParser <"succeeded($_reader.readString($_var))", + WithBuilder<"$_args", + WithPrinter<"$_writer.writeOwnedString($_getter)", + WithType <"StringRef">>>>; + +class Attr<string type> : WithType<type, Attribute>; + +class OptionalAttribute<string type> : + WithParser <"succeeded($_reader.readOptionalAttribute($_var))", + WithPrinter<"$_writer.writeOptionalAttribute($_getter)", + WithType<type, Attribute>>>; + +class OptionalInt<string type> : + WithParser <"succeeded(readOptionalInt($_reader, $_var))", + WithPrinter<"writeOptionalInt($_writer, $_getter)", + WithType<"std::optional<" # type # ">", VarInt>>>; + +class OptionalArrayRef<string eltType> : + WithParser <"succeeded(readOptionalArrayRef<" + # eltType # ">($_reader, $_var))", + WithPrinter<"writeOptionalArrayRef<" + # eltType # ">($_writer, $_getter)", + WithType<"SmallVector<" + # eltType # ">", Attribute>>>; + +class EnumClassFlag<string flag, string getter> : + WithParser<"succeeded($_reader.readVarInt($_var))", + WithBuilder<"(" # flag # ")$_args", + WithPrinter<"$_writer.writeVarInt((uint64_t)$_name." # getter # ")", + WithType<"uint64_t", VarInt>>>>; + +//===----------------------------------------------------------------------===// +// General notes +// - For each attribute or type entry, the argument names should match +// LLVMAttrDefs.td +// - The mnemonics are either LLVM or builtin MLIR attributes and types, but +// regular C++ types are also allowed to match builders and parsers. +// - DIScopeAttr and DINodeAttr are empty base classes, custom encoding not +// needed. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// DIBasicTypeAttr +//===----------------------------------------------------------------------===// + +def DIBasicTypeAttr : DialectAttribute<(attr + VarInt:$tag, + String:$name, + VarInt:$sizeInBits, + VarInt:$encoding +)>; + +//===----------------------------------------------------------------------===// +// DIExpressionAttr, DIExpressionElemAttr +//===----------------------------------------------------------------------===// + +def DIExpressionElemAttr : DialectAttribute<(attr + VarInt:$opcode, + OptionalArrayRef<"uint64_t">:$arguments +)>; + +def DIExpressionAttr : DialectAttribute<(attr + OptionalArrayRef<"DIExpressionElemAttr">:$operations +)>; + +//===----------------------------------------------------------------------===// +// DIFileAttr +//===----------------------------------------------------------------------===// + +def DIFileAttr : DialectAttribute<(attr + String:$name, + String:$directory +)>; + +//===----------------------------------------------------------------------===// +// DILocalVariableAttr +//===----------------------------------------------------------------------===// + +def DILocalVariableAttr : DialectAttribute<(attr + Attr<"DIScopeAttr">:$scope, + OptionalAttribute<"StringAttr">:$name, + OptionalAttribute<"DIFileAttr">:$file, + VarInt:$line, + VarInt:$arg, + VarInt:$alignInBits, + OptionalAttribute<"DITypeAttr">:$type, + EnumClassFlag<"DIFlags", "getFlags()">:$_rawflags, + LocalVar<"DIFlags", "(DIFlags)_rawflags">:$flags +)> { + // DILocalVariableAttr direct getter uses a `StringRef` for `name`. Since the + // more direct getter is prefered during bytecode reading, force the base one + // and prevent crashes for empty `StringAttr`. + let cBuilder = "$_resultType::get(context, $_args)"; +} + +//===----------------------------------------------------------------------===// +// DISubroutineTypeAttr +//===----------------------------------------------------------------------===// + +def DISubroutineTypeAttr : DialectAttribute<(attr + VarInt:$callingConvention, + OptionalArrayRef<"DITypeAttr">:$types +)>; + +//===----------------------------------------------------------------------===// +// DICompileUnitAttr +//===----------------------------------------------------------------------===// + +def DICompileUnitAttr : DialectAttribute<(attr + Attr<"DistinctAttr">:$id, + VarInt:$sourceLanguage, + Attr<"DIFileAttr">:$file, + OptionalAttribute<"StringAttr">:$producer, + Bool:$isOptimized, + EnumClassFlag<"DIEmissionKind", "getEmissionKind()">:$_rawEmissionKind, + LocalVar<"DIEmissionKind", "(DIEmissionKind)_rawEmissionKind">:$emissionKind, + EnumClassFlag<"DINameTableKind", "getNameTableKind()">:$_rawNameTableKind, + LocalVar<"DINameTableKind", + "(DINameTableKind)_rawNameTableKind">:$nameTableKind +)>; + +//===----------------------------------------------------------------------===// +// DISubprogramAttr +//===----------------------------------------------------------------------===// + +def DISubprogramAttr : DialectAttribute<(attr + OptionalAttribute<"DistinctAttr">:$recId, + Bool:$isRecSelf, + OptionalAttribute<"DistinctAttr">:$id, + OptionalAttribute<"DICompileUnitAttr">:$compileUnit, + OptionalAttribute<"DIScopeAttr">:$scope, + OptionalAttribute<"StringAttr">:$name, + OptionalAttribute<"StringAttr">:$linkageName, + OptionalAttribute<"DIFileAttr">:$file, + VarInt:$line, + VarInt:$scopeLine, + EnumClassFlag<"DISubprogramFlags", "getSubprogramFlags()">:$_rawflags, + LocalVar<"DISubprogramFlags", "(DISubprogramFlags)_rawflags">:$subprogramFlags, + OptionalAttribute<"DISubroutineTypeAttr">:$type, + OptionalArrayRef<"DINodeAttr">:$retainedNodes, + OptionalArrayRef<"DINodeAttr">:$annotations +)>; + +//===----------------------------------------------------------------------===// +// DICompositeTypeAttr +//===----------------------------------------------------------------------===// + +def DICompositeTypeAttr : DialectAttribute<(attr + OptionalAttribute<"DistinctAttr">:$recId, + Bool:$isRecSelf, + VarInt:$tag, + OptionalAttribute<"StringAttr">:$name, + OptionalAttribute<"DIFileAttr">:$file, + VarInt:$line, + OptionalAttribute<"DIScopeAttr">:$scope, + OptionalAttribute<"DITypeAttr">:$baseType, + EnumClassFlag<"DIFlags", "getFlags()">:$_rawflags, + LocalVar<"DIFlags", "(DIFlags)_rawflags">:$flags, + VarInt:$sizeInBits, + VarInt:$alignInBits, + OptionalAttribute<"DIExpressionAttr">:$dataLocation, + OptionalAttribute<"DIExpressionAttr">:$rank, + OptionalAttribute<"DIExpressionAttr">:$allocated, + OptionalAttribute<"DIExpressionAttr">:$associated, + OptionalArrayRef<"DINodeAttr">:$elements +)>; + +//===----------------------------------------------------------------------===// +// DIDerivedTypeAttr +//===----------------------------------------------------------------------===// + +def DIDerivedTypeAttr : DialectAttribute<(attr + VarInt:$tag, + OptionalAttribute<"StringAttr">:$name, + OptionalAttribute<"DITypeAttr">:$baseType, + VarInt:$sizeInBits, + VarInt:$alignInBits, + VarInt:$offsetInBits, + OptionalInt<"unsigned">:$dwarfAddressSpace, + OptionalAttribute<"DINodeAttr">:$extraData +)>; + +//===----------------------------------------------------------------------===// +// DIImportedEntityAttr +//===----------------------------------------------------------------------===// + +def DIImportedEntityAttr : DialectAttribute<(attr + VarInt:$tag, + Attr<"DIScopeAttr">:$scope, + Attr<"DINodeAttr">:$entity, + OptionalAttribute<"DIFileAttr">:$file, + VarInt:$line, + OptionalAttribute<"StringAttr">:$name, + OptionalArrayRef<"DINodeAttr">:$elements +)>; + +//===----------------------------------------------------------------------===// +// DIGlobalVariableAttr, DIGlobalVariableExpressionAttr +//===----------------------------------------------------------------------===// + +def DIGlobalVariableAttr : DialectAttribute<(attr + OptionalAttribute<"DIScopeAttr">:$scope, + OptionalAttribute<"StringAttr">:$name, + OptionalAttribute<"StringAttr">:$linkageName, + Attr<"DIFileAttr">:$file, + VarInt:$line, + Attr<"DITypeAttr">:$type, + Bool:$isLocalToUnit, + Bool:$isDefined, + VarInt:$alignInBits +)>; + +def DIGlobalVariableExpressionAttr : DialectAttribute<(attr + Attr<"DIGlobalVariableAttr">:$var, + OptionalAttribute<"DIExpressionAttr">:$expr +)>; + +//===----------------------------------------------------------------------===// +// DILabelAttr +//===----------------------------------------------------------------------===// + +def DILabelAttr : DialectAttribute<(attr + Attr<"DIScopeAttr">:$scope, + OptionalAttribute<"StringAttr">:$name, + OptionalAttribute<"DIFileAttr">:$file, + VarInt:$line +)> { + // DILabelAttr direct getter uses a `StringRef` for `name`. Since the + // more direct getter is prefered during bytecode reading, force the base one + // and prevent crashes for empty `StringAttr`. + let cBuilder = "$_resultType::get(context, $_args)"; +} + +//===----------------------------------------------------------------------===// +// DILexicalBlockAttr, DILexicalBlockFileAttr +//===----------------------------------------------------------------------===// + +def DILexicalBlockAttr : DialectAttribute<(attr + Attr<"DIScopeAttr">:$scope, + OptionalAttribute<"DIFileAttr">:$file, + VarInt:$line, + VarInt:$column +)>; + +def DILexicalBlockFileAttr : DialectAttribute<(attr + Attr<"DIScopeAttr">:$scope, + OptionalAttribute<"DIFileAttr">:$file, + VarInt:$discriminator +)>; + +//===----------------------------------------------------------------------===// +// DINamespaceAttr +//===----------------------------------------------------------------------===// + +def DINamespaceAttr : DialectAttribute<(attr + OptionalAttribute<"StringAttr">:$name, + OptionalAttribute<"DIScopeAttr">:$scope, + Bool:$exportSymbols +)>; + +//===----------------------------------------------------------------------===// +// DISubrangeAttr +//===----------------------------------------------------------------------===// + +def DISubrangeAttr : DialectAttribute<(attr + OptionalAttribute<"Attribute">:$count, + OptionalAttribute<"Attribute">:$lowerBound, + OptionalAttribute<"Attribute">:$upperBound, + OptionalAttribute<"Attribute">:$stride +)>; + +//===----------------------------------------------------------------------===// +// LoopAnnotationAttr +//===----------------------------------------------------------------------===// + +def LoopAnnotationAttr : DialectAttribute<(attr + OptionalAttribute<"BoolAttr">:$disableNonforced, + OptionalAttribute<"LoopVectorizeAttr">:$vectorize, + OptionalAttribute<"LoopInterleaveAttr">:$interleave, + OptionalAttribute<"LoopUnrollAttr">:$unroll, + OptionalAttribute<"LoopUnrollAndJamAttr">:$unrollAndJam, + OptionalAttribute<"LoopLICMAttr">:$licm, + OptionalAttribute<"LoopDistributeAttr">:$distribute, + OptionalAttribute<"LoopPipelineAttr">:$pipeline, + OptionalAttribute<"LoopPeeledAttr">:$peeled, + OptionalAttribute<"LoopUnswitchAttr">:$unswitch, + OptionalAttribute<"BoolAttr">:$mustProgress, + OptionalAttribute<"BoolAttr">:$isVectorized, + OptionalAttribute<"FusedLoc">:$startLoc, + OptionalAttribute<"FusedLoc">:$endLoc, + OptionalArrayRef<"AccessGroupAttr">:$parallelAccesses +)>; + +//===----------------------------------------------------------------------===// +// Attributes & Types with custom bytecode handling. +//===----------------------------------------------------------------------===// + +// All the attributes with custom bytecode handling. +def LLVMDialectAttributes : DialectAttributes<"LLVM"> { + let elems = [ + DIBasicTypeAttr, + DICompileUnitAttr, + DICompositeTypeAttr, + DIDerivedTypeAttr, + DIExpressionElemAttr, + DIExpressionAttr, + DIFileAttr, + DIGlobalVariableAttr, + DIGlobalVariableExpressionAttr, + DIImportedEntityAttr, + DILabelAttr, + DILexicalBlockAttr, + DILexicalBlockFileAttr, + DILocalVariableAttr, + DINamespaceAttr, + DISubprogramAttr, + DISubrangeAttr, + DISubroutineTypeAttr, + LoopAnnotationAttr + // Referenced attributes currently missing support: + // AccessGroupAttr, LoopVectorizeAttr, LoopInterleaveAttr, LoopUnrollAttr, + // LoopUnrollAndJamAttr, LoopLICMAttr, LoopDistributeAttr, LoopPipelineAttr, + // LoopPeeledAttr, LoopUnswitchAttr + ]; +} + +def LLVMDialectTypes : DialectTypes<"LLVM"> { + let elems = []; +} + +#endif // LLVM_DIALECT_BYTECODE diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt index ec581ac7277e..cc66face1c00 100644 --- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt @@ -8,11 +8,13 @@ add_mlir_dialect_library(MLIRLLVMDialect IR/LLVMMemorySlot.cpp IR/LLVMTypes.cpp IR/LLVMTypeSyntax.cpp + IR/LLVMDialectBytecode.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR DEPENDS + MLIRLLVMDialectBytecodeIncGen MLIRLLVMOpsIncGen MLIRLLVMTypesIncGen MLIRLLVMIntrinsicOpsIncGen diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 5d08cccb4faa..7ca09d9c943e 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -29,6 +29,8 @@ #include "llvm/IR/DataLayout.h" #include "llvm/Support/Error.h" +#include "LLVMDialectBytecode.h" + #include <numeric> #include <optional> @@ -4237,6 +4239,7 @@ void LLVMDialect::initialize() { // Support unknown operations because not all LLVM operations are registered. allowUnknownOperations(); declarePromisedInterface<DialectInlinerInterface, LLVMDialect>(); + detail::addBytecodeInterface(this); } #define GET_OP_CLASSES diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialectBytecode.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialectBytecode.cpp new file mode 100644 index 000000000000..41d1f80580cf --- /dev/null +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialectBytecode.cpp @@ -0,0 +1,154 @@ +//===- LLVMDialectBytecode.cpp - LLVM Bytecode Implementation -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "LLVMDialectBytecode.h" +#include "mlir/Bytecode/BytecodeImplementation.h" +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/Diagnostics.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TypeSwitch.h" +#include <type_traits> + +using namespace mlir; +using namespace mlir::LLVM; + +namespace { + +// Provide some forward declarations of the functions that will be generated by +// the include below. +static void write(DIExpressionElemAttr attribute, + DialectBytecodeWriter &writer); +static LogicalResult writeAttribute(Attribute attribute, + DialectBytecodeWriter &writer); + +//===--------------------------------------------------------------------===// +// Optional ArrayRefs +// +// Note that both the writer and reader functions consider attributes to be +// optional. This is because the attribute may be present or empty. +//===--------------------------------------------------------------------===// + +template <class EntryTy> +static void writeOptionalArrayRef(DialectBytecodeWriter &writer, + ArrayRef<EntryTy> storage) { + if (storage.empty()) { + writer.writeOwnedBool(false); + return; + } + + writer.writeOwnedBool(true); + writer.writeList(storage, [&](EntryTy val) { + if constexpr (std::is_base_of_v<Attribute, EntryTy>) { + (void)writer.writeOptionalAttribute(val); + } else if constexpr (std::is_integral_v<EntryTy>) { + (void)writer.writeVarInt(val); + } else { + static_assert(true, "EntryTy not supported"); + } + }); +} + +template <class EntryTy> +static LogicalResult readOptionalArrayRef(DialectBytecodeReader &reader, + SmallVectorImpl<EntryTy> &storage) { + bool isPresent = false; + if (failed(reader.readBool(isPresent))) + return failure(); + // Nothing to do here, the array is empty. + if (!isPresent) + return success(); + + auto readEntry = [&]() -> FailureOr<EntryTy> { + EntryTy temp; + if constexpr (std::is_base_of_v<Attribute, EntryTy>) { + if (succeeded(reader.readOptionalAttribute(temp))) + return temp; + } else if constexpr (std::is_integral_v<EntryTy>) { + if (succeeded(reader.readVarInt(temp))) + return temp; + } else { + static_assert(true, "EntryTy not supported"); + } + return failure(); + }; + + return reader.readList(storage, readEntry); +} + +//===--------------------------------------------------------------------===// +// Optional integral types +//===--------------------------------------------------------------------===// + +template <class EntryTy> +static void writeOptionalInt(DialectBytecodeWriter &writer, + std::optional<EntryTy> storage) { + static_assert(std::is_integral_v<EntryTy>, + "EntryTy must be an integral type"); + EntryTy val = storage.value_or(0); + writer.writeVarIntWithFlag(val, storage.has_value()); +} + +template <class EntryTy> +static LogicalResult readOptionalInt(DialectBytecodeReader &reader, + std::optional<EntryTy> &storage) { + static_assert(std::is_integral_v<EntryTy>, + "EntryTy must be an integral type"); + uint64_t result = 0; + bool flag = false; + if (failed(reader.readVarIntWithFlag(result, flag))) + return failure(); + if (flag) + storage = static_cast<EntryTy>(result); + else + storage = std::nullopt; + return success(); +} + +//===--------------------------------------------------------------------===// +// Tablegen generated bytecode functions +//===--------------------------------------------------------------------===// + +#include "mlir/Dialect/LLVMIR/LLVMDialectBytecode.cpp.inc" + +//===--------------------------------------------------------------------===// +// LLVMDialectBytecodeInterface +//===--------------------------------------------------------------------===// + +/// This class implements the bytecode interface for the LLVM dialect. +struct LLVMDialectBytecodeInterface : public BytecodeDialectInterface { + LLVMDialectBytecodeInterface(Dialect *dialect) + : BytecodeDialectInterface(dialect) {} + + // Attributes + Attribute readAttribute(DialectBytecodeReader &reader) const override { + return ::readAttribute(getContext(), reader); + } + + LogicalResult writeAttribute(Attribute attr, + DialectBytecodeWriter &writer) const override { + return ::writeAttribute(attr, writer); + } + + // Types + Type readType(DialectBytecodeReader &reader) const override { + return ::readType(getContext(), reader); + } + + LogicalResult writeType(Type type, + DialectBytecodeWriter &writer) const override { + return ::writeType(type, writer); + } +}; +} // namespace + +void LLVM::detail::addBytecodeInterface(LLVMDialect *dialect) { + dialect->addInterfaces<LLVMDialectBytecodeInterface>(); +} diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialectBytecode.h b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialectBytecode.h new file mode 100644 index 000000000000..1a17cb462ccf --- /dev/null +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialectBytecode.h @@ -0,0 +1,27 @@ +//===- LLVMDialectBytecode.h - LLVM Bytecode Implementation -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header defines hooks into the LLVM dialect bytecode +// implementation. +// +//===----------------------------------------------------------------------===// + +#ifndef LIB_MLIR_DIALECT_LLVM_IR_LLVMDIALECTBYTECODE_H +#define LIB_MLIR_DIALECT_LLVM_IR_LLVMDIALECTBYTECODE_H + +namespace mlir::LLVM { +class LLVMDialect; + +namespace detail { +/// Add the interfaces necessary for encoding the LLVM dialect components in +/// bytecode. +void addBytecodeInterface(LLVMDialect *dialect); +} // namespace detail +} // namespace mlir::LLVM + +#endif // LIB_MLIR_DIALECT_LLVM_IR_LLVMDIALECTBYTECODE_H diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 36c498e8b849..f77784abaf0b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -161,11 +161,24 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op)) return getTileShape(op->getOpResult(0)); if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp, - xegpu::LoadGatherOp, xegpu::StoreMatrixOp>(op)) + xegpu::StoreMatrixOp>(op)) return getTileShape(op->getOpOperand(0)); - if (isa<xegpu::StoreNdOp, xegpu::StoreScatterOp>(op)) + if (isa<xegpu::StoreNdOp>(op)) return getTileShape(op->getOpOperand(1)); + // Handle LoadGatherOp and StoreScatterOp (with and without offset) + if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(op)) { + if (loadGatherOp.getOffsets()) + return getTileShape(loadGatherOp->getOpResult(0)); + else + return getTileShape(loadGatherOp->getOpOperand(0)); + } + + if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op)) + return getTileShape(storeScatterOp.getOffsets() + ? storeScatterOp->getOpOperand(0) + : storeScatterOp->getOpOperand(1)); + if (isa<xegpu::DpasOp>(op)) { std::optional<SmallVector<int64_t>> aTile = getTileShape(op->getOpOperand(0)); diff --git a/mlir/test/Dialect/LLVMIR/bytecode.mlir b/mlir/test/Dialect/LLVMIR/bytecode.mlir new file mode 100644 index 000000000000..821b0ac2196a --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/bytecode.mlir @@ -0,0 +1,35 @@ +// RUN: mlir-opt -verify-roundtrip %s + +#access_group = #llvm.access_group<id = distinct[0]<>> +#access_group1 = #llvm.access_group<id = distinct[1]<>> +#di_subprogram = #llvm.di_subprogram<recId = distinct[2]<>> +#loc1 = loc("test.f90":12:14) +#loc2 = loc("test":4:3) +#loc6 = loc(fused<#di_subprogram>[#loc1]) +#loc7 = loc(fused<#di_subprogram>[#loc2]) +#loop_annotation = #llvm.loop_annotation<disableNonforced = false, mustProgress = true, startLoc = #loc6, endLoc = #loc7, parallelAccesses = #access_group, #access_group1> +module { + llvm.func @imp_fn() { + llvm.return loc(#loc2) + } loc(#loc8) + llvm.func @loop_annotation_with_locs() { + llvm.br ^bb1 {loop_annotation = #loop_annotation} loc(#loc4) + ^bb1: // pred: ^bb0 + llvm.return loc(#loc5) + } loc(#loc3) +} loc(#loc) +#di_file = #llvm.di_file<"test.f90" in ""> +#di_subroutine_type = #llvm.di_subroutine_type<callingConvention = DW_CC_program> +#loc = loc("test":0:0) +#loc3 = loc("test-path":36:3) +#loc4 = loc("test-path":37:5) +#loc5 = loc("test-path":39:5) +#di_compile_unit = #llvm.di_compile_unit<id = distinct[3]<>, sourceLanguage = DW_LANG_Fortran95, file = #di_file, isOptimized = false, emissionKind = Full> +#di_compile_unit1 = #llvm.di_compile_unit<id = distinct[4]<>, sourceLanguage = DW_LANG_Fortran95, file = #di_file, isOptimized = false, emissionKind = Full> +#di_compile_unit2 = #llvm.di_compile_unit<id = distinct[5]<>, sourceLanguage = DW_LANG_Fortran95, file = #di_file, isOptimized = false, emissionKind = Full> +#di_module = #llvm.di_module<file = #di_file, scope = #di_compile_unit1, name = "mod1"> +#di_module1 = #llvm.di_module<file = #di_file, scope = #di_compile_unit2, name = "mod2"> +#di_imported_entity = #llvm.di_imported_entity<tag = DW_TAG_imported_module, scope = #di_subprogram, entity = #di_module, file = #di_file, line = 1> +#di_imported_entity1 = #llvm.di_imported_entity<tag = DW_TAG_imported_module, scope = #di_subprogram, entity = #di_module1, file = #di_file, line = 1> +#di_subprogram1 = #llvm.di_subprogram<recId = distinct[2]<>, id = distinct[6]<>, compileUnit = #di_compile_unit, scope = #di_file, name = "imp_fn", file = #di_file, subprogramFlags = Definition, type = #di_subroutine_type, retainedNodes = #di_imported_entity, #di_imported_entity1> +#loc8 = loc(fused<#di_subprogram1>[#loc1]) diff --git a/mlir/test/Dialect/LLVMIR/debuginfo.mlir b/mlir/test/Dialect/LLVMIR/debuginfo.mlir index 1834b0a52470..d7bf99bfaed7 100644 --- a/mlir/test/Dialect/LLVMIR/debuginfo.mlir +++ b/mlir/test/Dialect/LLVMIR/debuginfo.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s | mlir-opt | FileCheck %s +// RUN: mlir-opt -emit-bytecode %s | mlir-opt | FileCheck %s // CHECK-DAG: #[[FILE:.*]] = #llvm.di_file<"debuginfo.mlir" in "/test/"> #file = #llvm.di_file<"debuginfo.mlir" in "/test/"> diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 73447978341d..00e763a8ffc0 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// RUN: mlir-opt -verify-roundtrip %s // CHECK-LABEL: func @baz @@ -757,7 +757,7 @@ llvm.func @stackrestore(%arg0: !llvm.ptr) { // CHECK-LABEL: @experimental_noalias_scope_decl llvm.func @experimental_noalias_scope_decl() { - // CHECK: llvm.intr.experimental.noalias.scope.decl #{{.*}} + // CHECK: llvm.intr.experimental.noalias.scope.decl #alias_scope{{.*}} llvm.intr.experimental.noalias.scope.decl #alias_scope llvm.return } @@ -767,7 +767,7 @@ llvm.func @experimental_noalias_scope_decl() { // CHECK-LABEL: @experimental_noalias_scope_with_string_id llvm.func @experimental_noalias_scope_with_string_id() { - // CHECK: llvm.intr.experimental.noalias.scope.decl #{{.*}} + // CHECK: llvm.intr.experimental.noalias.scope.decl #alias_scope{{.*}} llvm.intr.experimental.noalias.scope.decl #alias_scope2 llvm.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index 9d63c2ddd489..fe4f44c0b02a 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -584,3 +584,101 @@ gpu.module @test_kernel { gpu.return } } + +// ----- +gpu.module @test_kernel { + // CHECK-LABEL: load_with_offsets + // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32> + gpu.func @load_with_offsets(%src: ui64) -> vector<32xf32> { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> + + gpu.return %ld : vector<32xf32> + } +} + +// ----- +gpu.module @test_kernel { + // CHECK-LABEL: store_with_offsets + // CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1> + gpu.func @store_with_offsets(%src: ui64) { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + + %st_vec = arith.constant dense<1023.0>: vector<32xf32> + xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout<inst_data = [16]>, + layout_operand_2 = #xegpu.layout<inst_data = [16]>, + layout_operand_3 = #xegpu.layout<inst_data = [16]>, + l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1> + + gpu.return + } +} + +// ----- +gpu.module @test_kernel { + // CHECK-LABEL: load_with_offsets_chunk + // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32x4xf32> + // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex> + // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex> + // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex> + // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> + // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32> + gpu.func @load_with_offsets_chunk(%src: ui64) -> vector<32x4xf32> { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> + gpu.return %ld : vector<32x4xf32> + } +} + +// ----- +gpu.module @test_kernel { + // CHECK-LABEL: store_with_offsets_chunk + // CHECK: [[cst:%.+]] = arith.constant dense<1.023000e+03> : vector<16x2xf32 + // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex> + // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex> + // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex> + // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> + // CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1> + gpu.func @store_with_offsets_chunk(%src: ui64) { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + + %st_vec = arith.constant dense<1023.>: vector<32x4xf32> + xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout_operand_0 = #xegpu.layout<inst_data = [16, 2]>, + layout_operand_2 = #xegpu.layout<inst_data = [16, 2]>, + layout_operand_3 = #xegpu.layout<inst_data = [16, 2]>, + l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1> + gpu.return + } +} |
