diff options
| author | Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com> | 2025-07-17 17:12:43 +0000 |
|---|---|---|
| committer | Aiden Grossman <aidengrossman@google.com> | 2025-07-17 17:12:43 +0000 |
| commit | 0b56fc832b3c44d5cbfe58575bf10e73432ac971 (patch) | |
| tree | 379c3bdfedde61f1900ee1a89c5be6ebe5ab98f1 | |
| parent | a1179b69528245aaca7afa0c60bf9a8dc1ad3e6c (diff) | |
| parent | ff5784bb9094f6035851dc7abc4a5760fdc21e45 (diff) | |
[𝘀𝗽𝗿] changes introduced through rebase users/boomanaiden154/main.ci-migrate-monolithic-linux-script-to-sccache
Created using spr 1.3.4
[skip ci]
80 files changed, 2340 insertions, 896 deletions
diff --git a/clang-tools-extra/README.txt b/clang-tools-extra/README.txt index 6891e4078997..1195db9b468d 100644 --- a/clang-tools-extra/README.txt +++ b/clang-tools-extra/README.txt @@ -8,12 +8,13 @@ Clang frontend. These tools are kept in a separate "extra" repository to allow lighter weight checkouts of the core Clang codebase. All discussion regarding Clang, Clang-based tools, and code in this repository -should be held using the standard Clang forum: +should be held using the standard Clang forums: https://discourse.llvm.org/c/clang + https://discourse.llvm.org/c/clang/clang-tidy/71 + https://discourse.llvm.org/c/clang/clangd/34 -Code review for this tree should take place on the standard Clang patch and -commit lists: - http://lists.llvm.org/mailman/listinfo/cfe-commits +Code review for this tree should take place on Github: + https://github.com/llvm/llvm-project/pulls?q=label%3Aclang-tools-extra If you find a bug in these tools, please file it in the LLVM bug tracker: https://github.com/llvm/llvm-project/issues/ diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 937cbff4e3ea..0407897359b5 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -633,11 +633,6 @@ public: // received as a result of a standard operator new (-fcheck-new) bool CheckNew = false; - // In OpenACC mode, contains a user provided override for the _OPENACC macro. - // This exists so that we can override the macro value and test our incomplete - // implementation on real-world examples. - std::string OpenACCMacroOverride; - /// The HLSL root signature version for dxil. 
llvm::dxbc::RootSignatureVersion HLSLRootSigVer = llvm::dxbc::RootSignatureVersion::V1_1; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a8c1b5dd8ab3..6c22f06b269f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1422,19 +1422,6 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, HelpText<"Do not override toolchain to compile HIP source to relocatable">; } -// Clang specific/exclusive options for OpenACC. -def openacc_macro_override - : Separate<["-"], "fexperimental-openacc-macro-override">, - Visibility<[ClangOption, CC1Option]>, - Group<f_Group>, - HelpText<"Overrides the _OPENACC macro value for experimental testing " - "during OpenACC support development">; -def openacc_macro_override_EQ - : Joined<["-"], "fexperimental-openacc-macro-override=">, - Alias<openacc_macro_override>; - -// End Clang specific/exclusive options for OpenACC. - def libomptarget_amdgpu_bc_path_EQ : Joined<["--"], "libomptarget-amdgpu-bc-path=">, Group<i_Group>, HelpText<"Path to libomptarget-amdgcn bitcode library">; def libomptarget_amdgcn_bc_path_EQ : Joined<["--"], "libomptarget-amdgcn-bc-path=">, Group<i_Group>, diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 476f99495928..61d1c54ee9ec 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -125,7 +125,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, mlir::Value real = emitScalarExpr(e->getArg(0)); mlir::Value imag = emitScalarExpr(e->getArg(1)); mlir::Value complex = builder.createComplexCreate(loc, real, imag); - return RValue::get(complex); + return RValue::getComplex(complex); } case Builtin::BI__builtin_creal: @@ -150,6 +150,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return RValue::get(imag); } + case Builtin::BI__builtin_conj: + 
case Builtin::BI__builtin_conjf: + case Builtin::BI__builtin_conjl: + case Builtin::BIconj: + case Builtin::BIconjf: + case Builtin::BIconjl: { + mlir::Value complex = emitComplexExpr(e->getArg(0)); + mlir::Value conj = builder.createUnaryOp(getLoc(e->getExprLoc()), + cir::UnaryOpKind::Not, complex); + return RValue::getComplex(conj); + } + case Builtin::BI__builtin_clrsb: case Builtin::BI__builtin_clrsbl: case Builtin::BI__builtin_clrsbll: diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 6663f5ea1e75..9f36be5397ad 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -231,8 +231,7 @@ mlir::Value ComplexExprEmitter::VisitBinComma(const BinaryOperator *e) { mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) { if (e->getCallReturnType(cgf.getContext())->isReferenceType()) return emitLoadOfLValue(e); - - return cgf.emitCallExpr(e).getValue(); + return cgf.emitCallExpr(e).getComplexValue(); } mlir::Value ComplexExprEmitter::VisitCastExpr(CastExpr *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index 0a6dba5e80a6..0832c4141a10 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -58,6 +58,12 @@ public: return value; } + /// Return the value of this complex value. + mlir::Value getComplexValue() const { + assert(isComplex() && "Not a complex!"); + return value; + } + /// Return the value of the address of the aggregate. 
Address getAggregateAddress() const { assert(isAggregate() && "Not an aggregate!"); diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 0b712ac2dabc..abb91486e7ee 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -2470,13 +2470,12 @@ GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset, return llvm::Type::getDoubleTy(getVMContext()); } - /// GetINTEGERTypeAtOffset - The ABI specifies that a value should be passed in -/// an 8-byte GPR. This means that we either have a scalar or we are talking -/// about the high or low part of an up-to-16-byte struct. This routine picks -/// the best LLVM IR type to represent this, which may be i64 or may be anything -/// else that the backend will pass in a GPR that works better (e.g. i8, %foo*, -/// etc). +/// one or more 8-byte GPRs. This means that we either have a scalar or we are +/// talking about the high and/or low part of an up-to-16-byte struct. This +/// routine picks the best LLVM IR type to represent this, which may be i64 or +/// may be anything else that the backend will pass in GPRs that works better +/// (e.g. i8, %foo*, etc). /// /// PrefType is an LLVM IR type that corresponds to (part of) the IR type for /// the source type. IROffset is an offset in bytes into the LLVM IR type that @@ -2534,6 +2533,13 @@ GetINTEGERTypeAtOffset(llvm::Type *IRType, unsigned IROffset, SourceOffset); } + // if we have a 128-bit integer, we can pass it safely using an i128 + // so we return that + if (IRType->isIntegerTy(128)) { + assert(IROffset == 0); + return IRType; + } + // Okay, we don't have any better idea of what to pass, so we pass this in an // integer register that isn't too big to fit the rest of the struct. 
unsigned TySizeInBytes = @@ -2591,8 +2597,7 @@ GetX86_64ByValArgumentPair(llvm::Type *Lo, llvm::Type *Hi, return Result; } -ABIArgInfo X86_64ABIInfo:: -classifyReturnType(QualType RetTy) const { +ABIArgInfo X86_64ABIInfo::classifyReturnType(QualType RetTy) const { // AMD64-ABI 3.2.3p4: Rule 1. Classify the return type with the // classification algorithm. X86_64ABIInfo::Class Lo, Hi; @@ -2638,6 +2643,12 @@ classifyReturnType(QualType RetTy) const { isPromotableIntegerTypeForABI(RetTy)) return ABIArgInfo::getExtend(RetTy); } + + if (ResType->isIntegerTy(128)) { + // i128 are passed directly + assert(Hi == Integer); + return ABIArgInfo::getDirect(ResType); + } break; // AMD64-ABI 3.2.3p4: Rule 4. If the class is SSE, the next @@ -2783,6 +2794,11 @@ X86_64ABIInfo::classifyArgumentType(QualType Ty, unsigned freeIntRegs, return ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty)); } + if (ResType->isIntegerTy(128)) { + assert(Hi == Integer); + ++neededInt; + return ABIArgInfo::getDirect(ResType); + } break; // AMD64-ABI 3.2.3p3: Rule 3. 
If the class is SSE, the next diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 456bfe885f35..8880c9375143 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3846,15 +3846,6 @@ static void RenderOpenACCOptions(const Driver &D, const ArgList &Args, return; CmdArgs.push_back("-fopenacc"); - - if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) { - StringRef Value = A->getValue(); - int Version; - if (!Value.getAsInteger(10, Version)) - A->renderAsInput(Args, CmdArgs); - else - D.Diag(diag::err_drv_clang_unsupported) << Value; - } } static void RenderBuiltinOptions(const ToolChain &TC, const llvm::Triple &T, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 6ab36d867596..3a36250da57a 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3913,12 +3913,8 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, if (Opts.OpenMPCUDAMode) GenerateArg(Consumer, OPT_fopenmp_cuda_mode); - if (Opts.OpenACC) { + if (Opts.OpenACC) GenerateArg(Consumer, OPT_fopenacc); - if (!Opts.OpenACCMacroOverride.empty()) - GenerateArg(Consumer, OPT_openacc_macro_override, - Opts.OpenACCMacroOverride); - } // The arguments used to set Optimize, OptimizeSize and NoInlineDefine are // generated from CodeGenOptions. @@ -4424,13 +4420,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Args.hasArg(options::OPT_fopenmp_cuda_mode); // OpenACC Configuration. 
- if (Args.hasArg(options::OPT_fopenacc)) { + if (Args.hasArg(options::OPT_fopenacc)) Opts.OpenACC = true; - if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) - Opts.OpenACCMacroOverride = A->getValue(); - } - if (Arg *A = Args.getLastArg(OPT_ffp_contract)) { StringRef Val = A->getValue(); if (Val == "fast") diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 38b2e0cf1ca5..382ccd610946 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -639,16 +639,8 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, } } - if (LangOpts.OpenACC) { - // FIXME: When we have full support for OpenACC, we should set this to the - // version we support. Until then, set as '1' by default, but provide a - // temporary mechanism for users to override this so real-world examples can - // be tested against. - if (!LangOpts.OpenACCMacroOverride.empty()) - Builder.defineMacro("_OPENACC", LangOpts.OpenACCMacroOverride); - else - Builder.defineMacro("_OPENACC", "1"); - } + if (LangOpts.OpenACC) + Builder.defineMacro("_OPENACC", "202506"); } /// Initialize the predefined C++ language feature test macros defined in diff --git a/clang/test/CIR/CodeGen/complex-builtins.cpp b/clang/test/CIR/CodeGen/complex-builtins.cpp index f0d12d0ef666..811af47a704f 100644 --- a/clang/test/CIR/CodeGen/complex-builtins.cpp +++ b/clang/test/CIR/CodeGen/complex-builtins.cpp @@ -83,3 +83,39 @@ void foo3() { // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1 // OGCG: %[[A_IMAG:.*]] = load double, ptr %[[A_IMAG_PTR]], align 8 // OGCG: store double %[[A_IMAG]], ptr %[[INIT]], align 8 + +void foo4() { + float _Complex a; + float _Complex b = __builtin_conjf(a); +} + +// CIR: %[[COMPLEX:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR: %[[RESULT:.*]] = cir.alloca !cir.complex<!cir.float>, 
!cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float +// CIR: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex<!cir.float> +// CIR: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[RESULT:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[COMPLEX]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[RESULT]], align 4 + +// OGCG: %[[COMPLEX:.*]] = alloca { float, float }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = 
getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 diff --git a/clang/test/CodeGen/X86/i128-debuginfo.c b/clang/test/CodeGen/X86/i128-debuginfo.c new file mode 100644 index 000000000000..4b865c1bed9f --- /dev/null +++ b/clang/test/CodeGen/X86/i128-debuginfo.c @@ -0,0 +1,10 @@ +// no autogeneration since update_cc_test_checks does not support -g +// RUN: %clang_cc1 -triple x86_64-pc-linux -O1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: define{{.*}} i128 @add(i128 noundef %a) +// CHECK: #dbg_value(i128 %a, ![[DI:.*]], !DIExpression() +__int128_t add(__int128_t a) { + return a + a; +} + +// CHECK: ![[DI]] = !DILocalVariable(name: "a", arg: 1 diff --git a/clang/test/CodeGen/X86/x86_64-arguments.c b/clang/test/CodeGen/X86/x86_64-arguments.c index 82845f0a2b31..580f9487395d 100644 --- a/clang/test/CodeGen/X86/x86_64-arguments.c +++ b/clang/test/CodeGen/X86/x86_64-arguments.c @@ -551,6 +551,45 @@ struct s68 { void f68(struct s68 x) { } +// CHECK-LABEL: define{{.*}} i128 @f69(i128 noundef %a) +__int128_t f69(__int128_t a) { + return a; +} + +// CHECK-LABEL: define{{.*}} i128 @f70(i128 noundef %a) +__uint128_t f70(__uint128_t a) { + return a; +} + +// check that registers are correctly counted for (u)int128_t arguments +struct s71 { + long long a, b; +}; +// CHECK-LABEL: define{{.*}} void @f71(i128 noundef %a, i128 noundef %b, i64 noundef %c, ptr noundef byval(%struct.s71) align 8 %d) +void f71(__int128_t a, __int128_t b, long long c, struct s71 d) { +} +// CHECK-LABEL: define{{.*}} void @f72(i128 noundef %a, i128 noundef %b, i64 %d.coerce0, i64 %d.coerce1) +void f72(__int128_t a, __int128_t b, struct s71 d) { +} + +// check that structs containing (u)int128_t are passed correctly +struct s73 { + struct inner { + __uint128_t a; + }; + struct inner in; +}; +// CHECK-LABEL: 
define{{.*}} i128 @f73(i128 %a.coerce) +struct s73 f73(struct s73 a) { + return a; +} + +// check that _BitInt(128) is still passed correctly on the stack +// CHECK-LABEL: define{{.*}} i128 @f74(i128 noundef %b, i128 noundef %c, i128 noundef %d, i64 noundef %e, ptr noundef byval(i128) align 8 %0) +_BitInt(128) f74(__uint128_t b, __uint128_t c, __uint128_t d, long e, _BitInt(128) a) { + return a; +} + /// The synthesized __va_list_tag does not have file/line fields. // CHECK: = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "__va_list_tag", // CHECK-NOT: file: diff --git a/clang/test/CodeGen/alloc-align-attr.c b/clang/test/CodeGen/alloc-align-attr.c index 76e5d1041b19..c4c4e76eaaa0 100644 --- a/clang/test/CodeGen/alloc-align-attr.c +++ b/clang/test/CodeGen/alloc-align-attr.c @@ -70,66 +70,42 @@ __INT32_TYPE__ test4(__SIZE_TYPE__ a) { struct Empty {}; struct MultiArgs { __INT64_TYPE__ a, b;}; -// Struct parameter doesn't take up an IR parameter, 'i' takes up 2. +// Struct parameter doesn't take up an IR parameter, 'i' takes up 1. // Truncation to i64 is permissible, since alignments of greater than 2^64 are insane. 
__INT32_TYPE__ *m3(struct Empty s, __int128_t i) __attribute__((alloc_align(2))); // CHECK-LABEL: @test5( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[E:%.*]] = alloca [[STRUCT_EMPTY:%.*]], align 1 -// CHECK-NEXT: [[COERCE:%.*]] = alloca i128, align 16 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0 -// CHECK-NEXT: store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1 -// CHECK-NEXT: store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8 -// CHECK-NEXT: [[A1:%.*]] = load i128, ptr [[A]], align 16 -// CHECK-NEXT: store i128 [[A1]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: store i128 [[TMP2]], ptr [[COERCE]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0 -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call ptr @m3(i64 noundef [[TMP4]], i64 noundef [[TMP6]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64 +// CHECK-NEXT: store i128 [[A:%.*]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @m3(i128 noundef [[TMP0]]) +// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64 // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ] -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CALL]], align 4 -// CHECK-NEXT: ret i32 [[TMP7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CALL]], align 4 +// CHECK-NEXT: ret i32 [[TMP1]] // __INT32_TYPE__ test5(__int128_t a) { 
struct Empty e; return *m3(e, a); } -// Struct parameter takes up 2 parameters, 'i' takes up 2. +// Struct parameter takes up 2 parameters, 'i' takes up 1. __INT32_TYPE__ *m4(struct MultiArgs s, __int128_t i) __attribute__((alloc_align(2))); // CHECK-LABEL: @test6( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[E:%.*]] = alloca [[STRUCT_MULTIARGS:%.*]], align 8 -// CHECK-NEXT: [[COERCE:%.*]] = alloca i128, align 16 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0 -// CHECK-NEXT: store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1 -// CHECK-NEXT: store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8 -// CHECK-NEXT: [[A1:%.*]] = load i128, ptr [[A]], align 16 -// CHECK-NEXT: store i128 [[A1]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0 +// CHECK-NEXT: store i128 [[A:%.*]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1 // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 -// CHECK-NEXT: store i128 [[TMP2]], ptr [[COERCE]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0 -// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1 -// CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call ptr @m4(i64 [[TMP4]], i64 [[TMP6]], i64 noundef [[TMP8]], i64 noundef [[TMP10]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @m4(i64 [[TMP2]], i64 [[TMP4]], i128 noundef [[TMP0]]) +// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64 // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ] -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[CALL]], align 4 -// CHECK-NEXT: ret i32 [[TMP11]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CALL]], align 4 +// CHECK-NEXT: ret i32 [[TMP5]] // __INT32_TYPE__ test6(__int128_t a) { struct MultiArgs e; diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c index eda6c67fdad0..aa9965b81598 100644 --- a/clang/test/CodeGen/builtins.c +++ b/clang/test/CodeGen/builtins.c @@ -956,36 +956,24 @@ void test_builtin_os_log_errno(void) { void test_builtin_os_log_long_double(void *buf, long double ld) { // CHECK: %[[BUF_ADDR:.*]] = alloca ptr, align 8 // CHECK: %[[LD_ADDR:.*]] = alloca x86_fp80, align 16 - // CHECK: %[[COERCE:.*]] = alloca i128, align 16 // CHECK: store ptr %[[BUF]], ptr %[[BUF_ADDR]], align 8 // CHECK: store x86_fp80 %[[LD]], ptr %[[LD_ADDR]], align 16 // CHECK: %[[V0:.*]] = load ptr, ptr %[[BUF_ADDR]], align 8 // CHECK: %[[V1:.*]] = load x86_fp80, ptr %[[LD_ADDR]], align 16 // CHECK: %[[V2:.*]] = bitcast x86_fp80 %[[V1]] to i80 // CHECK: %[[V3:.*]] = zext i80 %[[V2]] to i128 - // CHECK: store i128 %[[V3]], ptr %[[COERCE]], align 16 - // CHECK: %[[V5:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 0 - // CHECK: %[[V6:.*]] = load i64, ptr %[[V5]], align 16 - // CHECK: %[[V7:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 1 - // CHECK: %[[V8:.*]] = load i64, ptr %[[V7]], 
align 8 - // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i64 noundef %[[V6]], i64 noundef %[[V8]]) + // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i128 noundef %[[V3]]) __builtin_os_log_format(buf, "%Lf", ld); } // CHECK-LABEL: define linkonce_odr hidden void @__os_log_helper_1_0_1_16_0 -// CHECK: (ptr noundef %[[BUFFER:.*]], i64 noundef %[[ARG0_COERCE0:.*]], i64 noundef %[[ARG0_COERCE1:.*]]) +// CHECK: (ptr noundef %[[BUFFER:.*]], i128 noundef %[[ARG0:.*]]) -// CHECK: %[[ARG0:.*]] = alloca i128, align 16 // CHECK: %[[BUFFER_ADDR:.*]] = alloca ptr, align 8 // CHECK: %[[ARG0_ADDR:.*]] = alloca i128, align 16 -// CHECK: %[[V1:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 0 -// CHECK: store i64 %[[ARG0_COERCE0]], ptr %[[V1]], align 16 -// CHECK: %[[V2:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 1 -// CHECK: store i64 %[[ARG0_COERCE1]], ptr %[[V2]], align 8 -// CHECK: %[[ARG01:.*]] = load i128, ptr %[[ARG0]], align 16 // CHECK: store ptr %[[BUFFER]], ptr %[[BUFFER_ADDR]], align 8 -// CHECK: store i128 %[[ARG01]], ptr %[[ARG0_ADDR]], align 16 +// CHECK: store i128 %[[ARG0]], ptr %[[ARG0_ADDR]], align 16 // CHECK: %[[BUF:.*]] = load ptr, ptr %[[BUFFER_ADDR]], align 8 // CHECK: %[[SUMMARY:.*]] = getelementptr i8, ptr %[[BUF]], i64 0 // CHECK: store i8 0, ptr %[[SUMMARY]], align 1 diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c index f31a4eb240c2..fdca4012ee4a 100644 --- a/clang/test/CodeGen/ext-int-cc.c +++ b/clang/test/CodeGen/ext-int-cc.c @@ -32,7 +32,7 @@ // Make sure 128 and 64 bit versions are passed like integers. 
void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} -// LIN64: define{{.*}} void @ParamPassing(i64 %{{.+}}, i64 %{{.+}}, i64 %{{.+}}) +// LIN64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // WIN64: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) // LIN32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) // WIN32: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) @@ -251,7 +251,7 @@ _BitInt(127) ReturnPassing3(void) { return 0; } // LA32: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret _BitInt(128) ReturnPassing4(void) { return 0; } -// LIN64: define{{.*}} { i64, i64 } @ReturnPassing4( +// LIN64: define{{.*}} i128 @ReturnPassing4( // WIN64: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // LIN32: define{{.*}} void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // WIN32: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret diff --git a/clang/test/CodeGen/extend-arg-64.c b/clang/test/CodeGen/extend-arg-64.c index 2cb56d35af21..8b99c01807ec 100644 --- a/clang/test/CodeGen/extend-arg-64.c +++ b/clang/test/CodeGen/extend-arg-64.c @@ -84,7 +84,7 @@ int test(void) { #ifdef D128 knr(i128); // CHECKEXT: load i128 - // CHECKEXT: call{{.*}} void (i64, i64, ...) @knr + // CHECKEXT: call{{.*}} void (i128, ...) 
@knr #endif knr(u32, s32, u16, s16, u8, s8); diff --git a/clang/test/Driver/openacc.c b/clang/test/Driver/openacc.c index c7f1d2545bd0..f46e2a32bcab 100644 --- a/clang/test/Driver/openacc.c +++ b/clang/test/Driver/openacc.c @@ -1,14 +1,2 @@ // RUN: %clang -S -### -fopenacc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DRIVER // CHECK-DRIVER: "-cc1" {{.*}} "-fopenacc" - -// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override=202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE -// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override 202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE -// CHECK-MACRO-OVERRIDE: "-cc1"{{.*}} "-fexperimental-openacc-macro-override" "202211" - -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// INVALID: error: the clang compiler does not support diff --git a/clang/test/Preprocessor/openacc.c b/clang/test/Preprocessor/openacc.c index be7052f00e0c..283baa6c2fe4 100644 --- a/clang/test/Preprocessor/openacc.c +++ b/clang/test/Preprocessor/openacc.c @@ -1,13 +1,9 @@ // RUN: %clang_cc1 -E -fopenacc %s | FileCheck %s --check-prefix=DEFAULT -// RUN: %clang_cc1 -E -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=OVERRIDE -// DEFAULT: OpenACC:1: -// OVERRIDE: 
OpenACC:202211: +// DEFAULT: OpenACC:202506: OpenACC:_OPENACC: // RUN: %clang_cc1 -E -dM -fopenacc %s | FileCheck %s --check-prefix=MACRO_PRINT_DEF -// RUN: %clang_cc1 -E -dM -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=MACRO_PRINT_OVR -// MACRO_PRINT_DEF: #define _OPENACC 1 -// MACRO_PRINT_OVR: #define _OPENACC 202211 +// MACRO_PRINT_DEF: #define _OPENACC 202506 diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 39e4444cde4e..51eb33dec186 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -708,6 +708,7 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, bool setDeclareAttr = false) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext}; + const bool unwrapBoxAddr = true; for (const auto &accObject : objectList.v) { llvm::SmallVector<mlir::Value> bounds; std::stringstream asFortran; @@ -735,8 +736,25 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, Op op = createDataEntryOp<Op>( builder, operandLocation, baseAddr, asFortran, bounds, structured, implicit, dataClause, baseAddr.getType(), async, asyncDeviceTypes, - asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true, info.isPresent); + asyncOnlyDeviceTypes, unwrapBoxAddr, info.isPresent); dataOperands.push_back(op.getAccVar()); + + // For UseDeviceOp, if operand is one of a pair resulting from a + // declare operation, create a UseDeviceOp for the other operand as well. 
+ if constexpr (std::is_same_v<Op, mlir::acc::UseDeviceOp>) { + if (auto declareOp = + mlir::dyn_cast<hlfir::DeclareOp>(baseAddr.getDefiningOp())) { + mlir::Value otherAddr = declareOp.getResult(1); + if (baseAddr != otherAddr) { + Op op = createDataEntryOp<Op>(builder, operandLocation, otherAddr, + asFortran, bounds, structured, implicit, + dataClause, otherAddr.getType(), async, + asyncDeviceTypes, asyncOnlyDeviceTypes, + unwrapBoxAddr, info.isPresent); + dataOperands.push_back(op.getAccVar()); + } + } + } } } @@ -4396,10 +4414,34 @@ getAttributeValueByDeviceType(llvm::SmallVector<mlir::Attribute> &attributes, return std::nullopt; } +// Helper function to extract string value from bind name variant +static std::optional<llvm::StringRef> getBindNameStringValue( + const std::optional<std::variant<mlir::SymbolRefAttr, mlir::StringAttr>> + &bindNameValue) { + if (!bindNameValue.has_value()) + return std::nullopt; + + return std::visit( + [](const auto &attr) -> std::optional<llvm::StringRef> { + if constexpr (std::is_same_v<std::decay_t<decltype(attr)>, + mlir::StringAttr>) { + return attr.getValue(); + } else if constexpr (std::is_same_v<std::decay_t<decltype(attr)>, + mlir::SymbolRefAttr>) { + return attr.getLeafReference(); + } else { + return std::nullopt; + } + }, + bindNameValue.value()); +} + static bool compareDeviceTypeInfo( mlir::acc::RoutineOp op, - llvm::SmallVector<mlir::Attribute> &bindNameArrayAttr, - llvm::SmallVector<mlir::Attribute> &bindNameDeviceTypeArrayAttr, + llvm::SmallVector<mlir::Attribute> &bindIdNameArrayAttr, + llvm::SmallVector<mlir::Attribute> &bindStrNameArrayAttr, + llvm::SmallVector<mlir::Attribute> &bindIdNameDeviceTypeArrayAttr, + llvm::SmallVector<mlir::Attribute> &bindStrNameDeviceTypeArrayAttr, llvm::SmallVector<mlir::Attribute> &gangArrayAttr, llvm::SmallVector<mlir::Attribute> &gangDimArrayAttr, llvm::SmallVector<mlir::Attribute> &gangDimDeviceTypeArrayAttr, @@ -4409,9 +4451,13 @@ static bool compareDeviceTypeInfo( for 
(uint32_t dtypeInt = 0; dtypeInt != mlir::acc::getMaxEnumValForDeviceType(); ++dtypeInt) { auto dtype = static_cast<mlir::acc::DeviceType>(dtypeInt); - if (op.getBindNameValue(dtype) != - getAttributeValueByDeviceType<llvm::StringRef, mlir::StringAttr>( - bindNameArrayAttr, bindNameDeviceTypeArrayAttr, dtype)) + auto bindNameValue = getBindNameStringValue(op.getBindNameValue(dtype)); + if (bindNameValue != + getAttributeValueByDeviceType<llvm::StringRef, mlir::StringAttr>( + bindIdNameArrayAttr, bindIdNameDeviceTypeArrayAttr, dtype) && + bindNameValue != + getAttributeValueByDeviceType<llvm::StringRef, mlir::StringAttr>( + bindStrNameArrayAttr, bindStrNameDeviceTypeArrayAttr, dtype)) return false; if (op.hasGang(dtype) != hasDeviceType(gangArrayAttr, dtype)) return false; @@ -4458,8 +4504,10 @@ getArrayAttrOrNull(fir::FirOpBuilder &builder, void createOpenACCRoutineConstruct( Fortran::lower::AbstractConverter &converter, mlir::Location loc, mlir::ModuleOp mod, mlir::func::FuncOp funcOp, std::string funcName, - bool hasNohost, llvm::SmallVector<mlir::Attribute> &bindNames, - llvm::SmallVector<mlir::Attribute> &bindNameDeviceTypes, + bool hasNohost, llvm::SmallVector<mlir::Attribute> &bindIdNames, + llvm::SmallVector<mlir::Attribute> &bindStrNames, + llvm::SmallVector<mlir::Attribute> &bindIdNameDeviceTypes, + llvm::SmallVector<mlir::Attribute> &bindStrNameDeviceTypes, llvm::SmallVector<mlir::Attribute> &gangDeviceTypes, llvm::SmallVector<mlir::Attribute> &gangDimValues, llvm::SmallVector<mlir::Attribute> &gangDimDeviceTypes, @@ -4472,7 +4520,8 @@ void createOpenACCRoutineConstruct( 0) { // If the routine is already specified with the same clauses, just skip // the operation creation. 
- if (compareDeviceTypeInfo(routineOp, bindNames, bindNameDeviceTypes, + if (compareDeviceTypeInfo(routineOp, bindIdNames, bindStrNames, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes) && @@ -4489,8 +4538,10 @@ void createOpenACCRoutineConstruct( modBuilder.create<mlir::acc::RoutineOp>( loc, routineOpStr, mlir::SymbolRefAttr::get(builder.getContext(), funcName), - getArrayAttrOrNull(builder, bindNames), - getArrayAttrOrNull(builder, bindNameDeviceTypes), + getArrayAttrOrNull(builder, bindIdNames), + getArrayAttrOrNull(builder, bindStrNames), + getArrayAttrOrNull(builder, bindIdNameDeviceTypes), + getArrayAttrOrNull(builder, bindStrNameDeviceTypes), getArrayAttrOrNull(builder, workerDeviceTypes), getArrayAttrOrNull(builder, vectorDeviceTypes), getArrayAttrOrNull(builder, seqDeviceTypes), hasNohost, @@ -4507,8 +4558,10 @@ static void interpretRoutineDeviceInfo( llvm::SmallVector<mlir::Attribute> &seqDeviceTypes, llvm::SmallVector<mlir::Attribute> &vectorDeviceTypes, llvm::SmallVector<mlir::Attribute> &workerDeviceTypes, - llvm::SmallVector<mlir::Attribute> &bindNameDeviceTypes, - llvm::SmallVector<mlir::Attribute> &bindNames, + llvm::SmallVector<mlir::Attribute> &bindIdNameDeviceTypes, + llvm::SmallVector<mlir::Attribute> &bindStrNameDeviceTypes, + llvm::SmallVector<mlir::Attribute> &bindIdNames, + llvm::SmallVector<mlir::Attribute> &bindStrNames, llvm::SmallVector<mlir::Attribute> &gangDeviceTypes, llvm::SmallVector<mlir::Attribute> &gangDimValues, llvm::SmallVector<mlir::Attribute> &gangDimDeviceTypes) { @@ -4541,16 +4594,18 @@ static void interpretRoutineDeviceInfo( if (dinfo.bindNameOpt().has_value()) { const auto &bindName = dinfo.bindNameOpt().value(); mlir::Attribute bindNameAttr; - if (const auto &bindStr{std::get_if<std::string>(&bindName)}) { + if (const auto &bindSym{ + std::get_if<Fortran::semantics::SymbolRef>(&bindName)}) { + bindNameAttr = 
builder.getSymbolRefAttr(converter.mangleName(*bindSym)); + bindIdNames.push_back(bindNameAttr); + bindIdNameDeviceTypes.push_back(getDeviceTypeAttr()); + } else if (const auto &bindStr{std::get_if<std::string>(&bindName)}) { bindNameAttr = builder.getStringAttr(*bindStr); - } else if (const auto &bindSym{ - std::get_if<Fortran::semantics::SymbolRef>(&bindName)}) { - bindNameAttr = builder.getStringAttr(converter.mangleName(*bindSym)); + bindStrNames.push_back(bindNameAttr); + bindStrNameDeviceTypes.push_back(getDeviceTypeAttr()); } else { llvm_unreachable("Unsupported bind name type"); } - bindNames.push_back(bindNameAttr); - bindNameDeviceTypes.push_back(getDeviceTypeAttr()); } } @@ -4566,8 +4621,9 @@ void Fortran::lower::genOpenACCRoutineConstruct( bool hasNohost{false}; llvm::SmallVector<mlir::Attribute> seqDeviceTypes, vectorDeviceTypes, - workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimDeviceTypes, gangDimValues; + workerDeviceTypes, bindIdNameDeviceTypes, bindStrNameDeviceTypes, + bindIdNames, bindStrNames, gangDeviceTypes, gangDimDeviceTypes, + gangDimValues; for (const Fortran::semantics::OpenACCRoutineInfo &info : routineInfos) { // Device Independent Attributes @@ -4576,24 +4632,26 @@ void Fortran::lower::genOpenACCRoutineConstruct( } // Note: Device Independent Attributes are set to the // none device type in `info`. 
- interpretRoutineDeviceInfo(converter, info, seqDeviceTypes, - vectorDeviceTypes, workerDeviceTypes, - bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimValues, gangDimDeviceTypes); + interpretRoutineDeviceInfo( + converter, info, seqDeviceTypes, vectorDeviceTypes, workerDeviceTypes, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, bindIdNames, + bindStrNames, gangDeviceTypes, gangDimValues, gangDimDeviceTypes); // Device Dependent Attributes for (const Fortran::semantics::OpenACCRoutineDeviceTypeInfo &dinfo : info.deviceTypeInfos()) { - interpretRoutineDeviceInfo( - converter, dinfo, seqDeviceTypes, vectorDeviceTypes, - workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimValues, gangDimDeviceTypes); + interpretRoutineDeviceInfo(converter, dinfo, seqDeviceTypes, + vectorDeviceTypes, workerDeviceTypes, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, + bindIdNames, bindStrNames, gangDeviceTypes, + gangDimValues, gangDimDeviceTypes); } } createOpenACCRoutineConstruct( - converter, loc, mod, funcOp, funcName, hasNohost, bindNames, - bindNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes, - seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes); + converter, loc, mod, funcOp, funcName, hasNohost, bindIdNames, + bindStrNames, bindIdNameDeviceTypes, bindStrNameDeviceTypes, + gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes, + workerDeviceTypes, vectorDeviceTypes); } static void diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 4458f62eea95..fcb20fdf187f 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -372,90 +372,6 @@ extractMappedBaseValues(llvm::ArrayRef<mlir::Value> vars, }); } -/// Get the directive enumeration value corresponding to the given OpenMP -/// construct PFT node. 
-llvm::omp::Directive -extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) { - return common::visit( - common::visitors{ - [](const parser::OpenMPAllocatorsConstruct &c) { - return llvm::omp::OMPD_allocators; - }, - [](const parser::OpenMPAssumeConstruct &c) { - return llvm::omp::OMPD_assume; - }, - [](const parser::OpenMPAtomicConstruct &c) { - return llvm::omp::OMPD_atomic; - }, - [](const parser::OpenMPBlockConstruct &c) { - return std::get<parser::OmpBlockDirective>( - std::get<parser::OmpBeginBlockDirective>(c.t).t) - .v; - }, - [](const parser::OpenMPCriticalConstruct &c) { - return llvm::omp::OMPD_critical; - }, - [](const parser::OpenMPDeclarativeAllocate &c) { - return llvm::omp::OMPD_allocate; - }, - [](const parser::OpenMPDispatchConstruct &c) { - return llvm::omp::OMPD_dispatch; - }, - [](const parser::OpenMPExecutableAllocate &c) { - return llvm::omp::OMPD_allocate; - }, - [](const parser::OpenMPLoopConstruct &c) { - return std::get<parser::OmpLoopDirective>( - std::get<parser::OmpBeginLoopDirective>(c.t).t) - .v; - }, - [](const parser::OpenMPSectionConstruct &c) { - return llvm::omp::OMPD_section; - }, - [](const parser::OpenMPSectionsConstruct &c) { - return std::get<parser::OmpSectionsDirective>( - std::get<parser::OmpBeginSectionsDirective>(c.t).t) - .v; - }, - [](const parser::OpenMPStandaloneConstruct &c) { - return common::visit( - common::visitors{ - [](const parser::OpenMPSimpleStandaloneConstruct &c) { - return c.v.DirId(); - }, - [](const parser::OpenMPFlushConstruct &c) { - return llvm::omp::OMPD_flush; - }, - [](const parser::OpenMPCancelConstruct &c) { - return llvm::omp::OMPD_cancel; - }, - [](const parser::OpenMPCancellationPointConstruct &c) { - return llvm::omp::OMPD_cancellation_point; - }, - [](const parser::OmpMetadirectiveDirective &c) { - return llvm::omp::OMPD_metadirective; - }, - [](const parser::OpenMPDepobjConstruct &c) { - return llvm::omp::OMPD_depobj; - }, - [](const parser::OpenMPInteropConstruct &c) { - 
return llvm::omp::OMPD_interop; - }}, - c.u); - }, - [](const parser::OpenMPUtilityConstruct &c) { - return common::visit( - common::visitors{[](const parser::OmpErrorDirective &c) { - return llvm::omp::OMPD_error; - }, - [](const parser::OmpNothingDirective &c) { - return llvm::omp::OMPD_nothing; - }}, - c.u); - }}, - ompConstruct.u); -} - /// Populate the global \see hostEvalInfo after processing clauses for the given /// \p eval OpenMP target construct, or nested constructs, if these must be /// evaluated outside of the target region per the spec. diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 2e53f01f1da6..b194150c0f7f 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -661,6 +661,90 @@ bool collectLoopRelatedInfo( return found; } + +/// Get the directive enumeration value corresponding to the given OpenMP +/// construct PFT node. +llvm::omp::Directive +extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) { + return common::visit( + common::visitors{ + [](const parser::OpenMPAllocatorsConstruct &c) { + return llvm::omp::OMPD_allocators; + }, + [](const parser::OpenMPAssumeConstruct &c) { + return llvm::omp::OMPD_assume; + }, + [](const parser::OpenMPAtomicConstruct &c) { + return llvm::omp::OMPD_atomic; + }, + [](const parser::OpenMPBlockConstruct &c) { + return std::get<parser::OmpBlockDirective>( + std::get<parser::OmpBeginBlockDirective>(c.t).t) + .v; + }, + [](const parser::OpenMPCriticalConstruct &c) { + return llvm::omp::OMPD_critical; + }, + [](const parser::OpenMPDeclarativeAllocate &c) { + return llvm::omp::OMPD_allocate; + }, + [](const parser::OpenMPDispatchConstruct &c) { + return llvm::omp::OMPD_dispatch; + }, + [](const parser::OpenMPExecutableAllocate &c) { + return llvm::omp::OMPD_allocate; + }, + [](const parser::OpenMPLoopConstruct &c) { + return std::get<parser::OmpLoopDirective>( + std::get<parser::OmpBeginLoopDirective>(c.t).t) + .v; + }, + [](const 
parser::OpenMPSectionConstruct &c) { + return llvm::omp::OMPD_section; + }, + [](const parser::OpenMPSectionsConstruct &c) { + return std::get<parser::OmpSectionsDirective>( + std::get<parser::OmpBeginSectionsDirective>(c.t).t) + .v; + }, + [](const parser::OpenMPStandaloneConstruct &c) { + return common::visit( + common::visitors{ + [](const parser::OpenMPSimpleStandaloneConstruct &c) { + return c.v.DirId(); + }, + [](const parser::OpenMPFlushConstruct &c) { + return llvm::omp::OMPD_flush; + }, + [](const parser::OpenMPCancelConstruct &c) { + return llvm::omp::OMPD_cancel; + }, + [](const parser::OpenMPCancellationPointConstruct &c) { + return llvm::omp::OMPD_cancellation_point; + }, + [](const parser::OmpMetadirectiveDirective &c) { + return llvm::omp::OMPD_metadirective; + }, + [](const parser::OpenMPDepobjConstruct &c) { + return llvm::omp::OMPD_depobj; + }, + [](const parser::OpenMPInteropConstruct &c) { + return llvm::omp::OMPD_interop; + }}, + c.u); + }, + [](const parser::OpenMPUtilityConstruct &c) { + return common::visit( + common::visitors{[](const parser::OmpErrorDirective &c) { + return llvm::omp::OMPD_error; + }, + [](const parser::OmpNothingDirective &c) { + return llvm::omp::OMPD_nothing; + }}, + c.u); + }}, + ompConstruct.u); +} } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index 1526bd4e9023..8e3ad5c3452e 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -166,6 +166,9 @@ bool collectLoopRelatedInfo( lower::pft::Evaluation &eval, const omp::List<omp::Clause> &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl<const semantics::Symbol *> &iv); + +llvm::omp::Directive +extractOmpDirective(const parser::OpenMPConstruct &ompConstruct); } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 
b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 index 164eb32a8f68..2de7cc5761a2 100644 --- a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 @@ -15,15 +15,17 @@ subroutine acc_host_data() !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"} + ! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) !$acc host_data use_device(a) if_present !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) { +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref<!fir.array<10xf32>>{{.*}}) { ! 
CHECK: } attributes {ifPresent} !$acc host_data use_device(a) if(ifCondition) @@ -33,14 +35,14 @@ subroutine acc_host_data() ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"} ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1 -! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) +! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]]{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}}) !$acc host_data use_device(a) if(.true.) !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) +! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}}) !$acc host_data use_device(a) if(.false.) a = 1.0 diff --git a/flang/test/Lower/OpenACC/acc-host-data.f90 b/flang/test/Lower/OpenACC/acc-host-data.f90 index 871eabd256ca..4d09b25b983b 100644 --- a/flang/test/Lower/OpenACC/acc-host-data.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data.f90 @@ -14,34 +14,37 @@ subroutine acc_host_data() !$acc host_data use_device(a) !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! 
CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) !$acc host_data use_device(a) if_present !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) { +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) ! CHECK: } attributes {ifPresent} - !$acc host_data use_device(a) if_present if_present + !$acc host_data use_device(a) if_present !$acc end host_data -! CHECK: acc.host_data dataOperands(%{{.*}} : !fir.ref<!fir.array<10xf32>>) { +! CHECK: acc.host_data dataOperands(%{{.*}}{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}}) { ! CHECK: } attributes {ifPresent} !$acc host_data use_device(a) if(ifCondition) !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1 -! 
CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) +! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA0]]{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}}) !$acc host_data use_device(a) if(.true.) !$acc end host_data ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) +! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}}) !$acc host_data use_device(a) if(.false.) a = 1.0 diff --git a/flang/test/Lower/OpenACC/acc-routine.f90 b/flang/test/Lower/OpenACC/acc-routine.f90 index 789f3a57e1f7..1a63b4120235 100644 --- a/flang/test/Lower/OpenACC/acc-routine.f90 +++ b/flang/test/Lower/OpenACC/acc-routine.f90 @@ -2,13 +2,14 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s -! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind("_QPacc_routine17" [#acc.device_type<host>], "_QPacc_routine17" [#acc.device_type<default>], "_QPacc_routine16" [#acc.device_type<multicore>]) -! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind("_QPacc_routine17" [#acc.device_type<host>], "_QPacc_routine16" [#acc.device_type<multicore>]) +! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind(@_QPacc_routine17 [#acc.device_type<host>], @_QPacc_routine17 +! [#acc.device_type<default>], @_QPacc_routine16 [#acc.device_type<multicore>]) +! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind(@_QPacc_routine17 [#acc.device_type<host>], @_QPacc_routine16 [#acc.device_type<multicore>]) ! CHECK: acc.routine @[[r12:.*]] func(@_QPacc_routine17) worker ([#acc.device_type<host>]) vector ([#acc.device_type<multicore>]) ! CHECK: acc.routine @[[r11:.*]] func(@_QPacc_routine16) gang([#acc.device_type<nvidia>]) seq ([#acc.device_type<host>]) ! CHECK: acc.routine @[[r10:.*]] func(@_QPacc_routine11) seq ! 
CHECK: acc.routine @[[r09:.*]] func(@_QPacc_routine10) seq -! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind("_QPacc_routine9a") +! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind(@_QPacc_routine9a) ! CHECK: acc.routine @[[r07:.*]] func(@_QPacc_routine8) bind("routine8_") ! CHECK: acc.routine @[[r06:.*]] func(@_QPacc_routine7) gang(dim: 1 : i64) ! CHECK: acc.routine @[[r05:.*]] func(@_QPacc_routine6) nohost diff --git a/flang/test/Lower/OpenACC/acc-routine03.f90 b/flang/test/Lower/OpenACC/acc-routine03.f90 index 85e4ef580f98..ddd6bda0367e 100644 --- a/flang/test/Lower/OpenACC/acc-routine03.f90 +++ b/flang/test/Lower/OpenACC/acc-routine03.f90 @@ -30,6 +30,6 @@ end interface end subroutine ! CHECK: acc.routine @acc_routine_1 func(@_QPsub2) worker nohost -! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind("_QPsub2") worker +! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind(@_QPsub2) worker ! CHECK: func.func @_QPsub1(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} ! CHECK: func.func @_QPsub2(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>} diff --git a/flang/test/Lower/OpenACC/acc-use-device.f90 b/flang/test/Lower/OpenACC/acc-use-device.f90 new file mode 100644 index 000000000000..081a6e317bfc --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-use-device.f90 @@ -0,0 +1,61 @@ +! This test checks whether the OpenACC use_device clause is applied on both results of hlfir.declare. + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s + +! Test for automatic variable appearing in use_device clause. +subroutine test() + integer :: N = 100 + real*8 :: b(-1:N) +! CHECK: %[[A0:.*]] = fir.alloca !fir.array<?xf64>, %{{.*}} {bindc_name = "b", uniq_name = "_QFtestEb"} +! CHECK: %[[A1:.*]] = fir.shape_shift {{.*}} : (index, index) -> !fir.shapeshift<1> +! 
CHECK: %[[A:.*]]:2 = hlfir.declare %[[A0]](%[[A1]]) {uniq_name = "_QFtestEb"} : (!fir.ref<!fir.array<?xf64>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>) + + !$acc data copy(b) +! CHECK: %[[B:.*]] = acc.copyin var(%[[A]]#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {dataClause = #acc<data_clause acc_copy>, name = "b"} +! CHECK: acc.data dataOperands(%[[B]] : !fir.box<!fir.array<?xf64>>) { + + !$acc host_data use_device(b) + call vadd(b) + !$acc end host_data +! CHECK: %[[C:.*]] = acc.use_device var(%[[A]]#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {name = "b"} +! CHECK: %[[D:.*]] = acc.use_device varPtr(%[[A]]#1 : !fir.ref<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>> {name = "b"} +! CHECK: acc.host_data dataOperands(%[[C]], %[[D]] : !fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>) { +! CHECK: fir.call @_QPvadd(%[[A]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>) -> () + !$acc end data +! CHECK: acc.copyout accVar(%[[B]] : !fir.box<!fir.array<?xf64>>) to var(%[[A]]#0 : !fir.box<!fir.array<?xf64>>) {dataClause = #acc<data_clause acc_copy>, name = "b"} +end + +! Test for allocatable, pointer and assumed-shape variables appearing in use_device clause. +subroutine test2(a, b, c) + integer :: N = 100 + real*8, allocatable :: a(:) + real*8, target, allocatable :: d(:) + real*8 :: b(:) + real*8, pointer :: c(:) + call allocate(a(N)) + call allocate(d(N)) + c => d +! CHECK: %[[DS:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) +! 
CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {uniq_name = "_QFtest2Eb"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>) +! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) + + !$acc data copy(a,b,c,d) + !$acc host_data use_device(a,b,c) + call vadd2(a,b,c) + !$acc end host_data + +! CHECK: %[[H:.*]] = acc.use_device varPtr(%[[E]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>> {name = "a"} +! CHECK: %[[I:.*]] = acc.use_device varPtr(%[[E]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>> {name = "a"} +! CHECK: %[[J:.*]] = acc.use_device var(%[[F]]#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {name = "b"} +! CHECK: %[[K:.*]] = acc.use_device var(%[[F]]#1 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {name = "b"} +! CHECK: %[[L:.*]] = acc.use_device varPtr(%[[G]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>> {name = "c"} +! CHECK: %[[M:.*]] = acc.use_device varPtr(%[[G]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>> {name = "c"} +! 
CHECK: acc.host_data dataOperands(%[[H]], %[[I]], %[[J]], %[[K]], %[[L]], %[[M]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) { + + + + + !$acc end data + +end diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp index cb88bfcade0d..5c30fb7c8718 100644 --- a/libc/test/src/math/cospif_test.cpp +++ b/libc/test/src/math/cospif_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcCospifTest, SmallValues) { LIBC_NAMESPACE::cospif(x), 0.5); } -// SDCOMP-26094: check sinfpi in the cases for which the range reducer +// SDCOMP-26094: check cospif in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcCospifTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index ad2155f329cd..4aac1fabfbd6 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -164,7 +164,7 @@ TEST_F(LlvmLibcSinCosfTest, SpecialValues) { } } -// SDCOMP-26094: check sinf in the cases for which the range reducer +// SDCOMP-26094: check sincosf in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. 
TEST_F(LlvmLibcSinCosfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp index 986c676761f0..94e3dbc4f07d 100644 --- a/libc/test/src/math/sinpif_test.cpp +++ b/libc/test/src/math/sinpif_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcSinpifTest, SmallValues) { LIBC_NAMESPACE::sinpif(x), 0.5); } -// SDCOMP-26094: check sinfpi in the cases for which the range reducer +// SDCOMP-26094: check sinpif in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcSinpifTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp index ef691b77193c..58ebb7be1199 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp @@ -108,13 +108,21 @@ MinidumpParser::GetThreadContext(const minidump::Thread &td) { llvm::ArrayRef<uint8_t> MinidumpParser::GetThreadContextWow64(const minidump::Thread &td) { + Log *log = GetLog(LLDBLog::Process); // On Windows, a 32-bit process can run on a 64-bit machine under WOW64. If // the minidump was captured with a 64-bit debugger, then the CONTEXT we just // grabbed from the mini_dump_thread is the one for the 64-bit "native" // process rather than the 32-bit "guest" process we care about. In this // case, we can get the 32-bit CONTEXT from the TEB (Thread Environment // Block) of the 64-bit process. 
- auto teb_mem = GetMemory(td.EnvironmentBlock, sizeof(TEB64)); + auto teb_mem_maybe = GetMemory(td.EnvironmentBlock, sizeof(TEB64)); + if (!teb_mem_maybe) { + LLDB_LOG_ERROR(log, teb_mem_maybe.takeError(), + "Failed to read Thread Environment Block: {0}"); + return {}; + } + + auto teb_mem = *teb_mem_maybe; if (teb_mem.empty()) return {}; @@ -126,8 +134,16 @@ MinidumpParser::GetThreadContextWow64(const minidump::Thread &td) { // Slot 1 of the thread-local storage in the 64-bit TEB points to a structure // that includes the 32-bit CONTEXT (after a ULONG). See: // https://msdn.microsoft.com/en-us/library/ms681670.aspx - auto context = + auto context_maybe = GetMemory(wow64teb->tls_slots[1] + 4, sizeof(MinidumpContext_x86_32)); + if (!context_maybe) { + LLDB_LOG_ERROR(log, context_maybe.takeError(), + "Failed to read WOW Thread Context: {0}"); + return {}; + } + + auto context = *context_maybe; + if (context.size() < sizeof(MinidumpContext_x86_32)) return {}; @@ -478,11 +494,13 @@ void MinidumpParser::PopulateMemoryRanges() { m_memory_ranges.Sort(); } -llvm::ArrayRef<uint8_t> MinidumpParser::GetMemory(lldb::addr_t addr, - size_t size) { +llvm::Expected<llvm::ArrayRef<uint8_t>> +MinidumpParser::GetMemory(lldb::addr_t addr, size_t size) { std::optional<minidump::Range> range = FindMemoryRange(addr); if (!range) - return {}; + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "No memory range found for address (0x%" PRIx64 ")", addr); // There's at least some overlap between the beginning of the desired range // (addr) and the current range. 
Figure out where the overlap begins and @@ -491,7 +509,11 @@ llvm::ArrayRef<uint8_t> MinidumpParser::GetMemory(lldb::addr_t addr, const size_t offset = addr - range->start; if (addr < range->start || offset >= range->range_ref.size()) - return {}; + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "Address (0x%" PRIx64 ") is not in range [0x%" PRIx64 " - 0x%" PRIx64 + ")", + addr, range->start, range->start + range->range_ref.size()); const size_t overlap = std::min(size, range->range_ref.size() - offset); return range->range_ref.slice(offset, overlap); diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h index 14599f8d572a..3b7d33daca71 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h @@ -104,7 +104,8 @@ public: std::optional<Range> FindMemoryRange(lldb::addr_t addr); - llvm::ArrayRef<uint8_t> GetMemory(lldb::addr_t addr, size_t size); + llvm::Expected<llvm::ArrayRef<uint8_t>> GetMemory(lldb::addr_t addr, + size_t size); /// Returns a list of memory regions and a flag indicating whether the list is /// complete (includes all regions mapped into the process memory). 
diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index ef3c00e2857d..17a421a72274 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -322,12 +322,15 @@ size_t ProcessMinidump::ReadMemory(lldb::addr_t addr, void *buf, size_t size, size_t ProcessMinidump::DoReadMemory(lldb::addr_t addr, void *buf, size_t size, Status &error) { - llvm::ArrayRef<uint8_t> mem = m_minidump_parser->GetMemory(addr, size); - if (mem.empty()) { - error = Status::FromErrorString("could not parse memory info"); + llvm::Expected<llvm::ArrayRef<uint8_t>> mem_maybe = + m_minidump_parser->GetMemory(addr, size); + if (!mem_maybe) { + error = Status::FromError(mem_maybe.takeError()); return 0; } + llvm::ArrayRef<uint8_t> mem = *mem_maybe; + std::memcpy(buf, mem.data(), mem.size()); return mem.size(); } diff --git a/lldb/test/Shell/Minidump/missing-memory-region.yaml b/lldb/test/Shell/Minidump/missing-memory-region.yaml new file mode 100644 index 000000000000..1784cacfaf1b --- /dev/null +++ b/lldb/test/Shell/Minidump/missing-memory-region.yaml @@ -0,0 +1,42 @@ +# Check that looking up a memory region not present in the Minidump fails +# even if it's in the /proc/<pid>/maps file. 
+ +# RUN: yaml2obj %s -o %t +# RUN: %lldb -c %t -o "memory read 0x5000" 2>&1 | FileCheck %s + +# CHECK-LABEL: (lldb) memory read 0x5000 +# CHECK-NEXT: error: No memory range found for address (0x5000) + +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: AMD64 + Processor Level: 6 + Processor Revision: 15876 + Number of Processors: 40 + Platform ID: Linux + CSD Version: 'Linux 3.13.0-91-generic #138-Ubuntu SMP Fri Jun 24 17:00:34 UTC 2016 x86_64' + CPU: + Vendor ID: GenuineIntel + Version Info: 0x00000000 + Feature Info: 0x00000000 + - Type: LinuxProcStatus + Text: | + Name: test-yaml + Umask: 0002 + State: t (tracing stop) + Pid: 8567 + - Type: LinuxMaps + Text: | + 0x1000-0x1100 r-xp 00000000 00:00 0 + 0x2000-0x2200 rw-p 00000000 00:00 0 + 0x4000-0x6000 rw-- 00000000 00:00 0 + - Type: Memory64List + Memory Ranges: + - Start of Memory Range: 0x1000 + Data Size: 0x100 + Content : '' + - Start of Memory Range: 0x2000 + Data Size: 0x200 + Content : '' +... diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test index a96a0d8310ee..da6436cb5ca2 100644 --- a/lldb/test/Shell/Settings/TestChildCountTruncation.test +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -2,7 +2,7 @@ # when target.max-children-count wasn't explicitly set. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clang_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # diff --git a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp index ee31c8e63644..44f653c6fa13 100644 --- a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp +++ b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp @@ -308,16 +308,19 @@ Streams: )"), llvm::Succeeded()); - EXPECT_EQ((llvm::ArrayRef<uint8_t>{0x54}), parser->GetMemory(0x401d46, 1)); - EXPECT_EQ((llvm::ArrayRef<uint8_t>{0x54, 0x21}), - parser->GetMemory(0x401d46, 4)); - - EXPECT_EQ((llvm::ArrayRef<uint8_t>{0xc8, 0x4d, 0x04, 0xbc, 0xe9}), - parser->GetMemory(0x7ffceb34a000, 5)); - EXPECT_EQ((llvm::ArrayRef<uint8_t>{0xc8, 0x4d, 0x04}), - parser->GetMemory(0x7ffceb34a000, 3)); - - EXPECT_EQ(llvm::ArrayRef<uint8_t>(), parser->GetMemory(0x500000, 512)); + EXPECT_THAT_EXPECTED(parser->GetMemory(0x401d46, 1), + llvm::HasValue(llvm::ArrayRef<uint8_t>{0x54})); + EXPECT_THAT_EXPECTED(parser->GetMemory(0x401d46, 4), + llvm::HasValue(llvm::ArrayRef<uint8_t>{0x54, 0x21})); + EXPECT_THAT_EXPECTED( + parser->GetMemory(0x7ffceb34a000, 5), + llvm::HasValue(llvm::ArrayRef<uint8_t>{0xc8, 0x4d, 0x04, 0xbc, 0xe9})); + EXPECT_THAT_EXPECTED( + parser->GetMemory(0x7ffceb34a000, 3), + llvm::HasValue(llvm::ArrayRef<uint8_t>{0xc8, 0x4d, 0x04})); + EXPECT_THAT_EXPECTED( + parser->GetMemory(0x500000, 512), + llvm::FailedWithMessage("No memory range found for address (0x500000)")); } TEST_F(MinidumpParserTest, FindMemoryRangeWithFullMemoryMinidump) { diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index a101151eed7c..39fef921a959 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ 
b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -530,6 +530,7 @@ private: bool isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L); + Value *tryToReuseLCSSAPhi(const SCEVAddRecExpr *S); Value *expandAddRecExprLiterally(const SCEVAddRecExpr *); PHINode *getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, const Loop *L, Type *&TruncTy, diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0e8e4c9618bb..40464e91f9ef 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -609,6 +609,8 @@ namespace { SDValue foldABSToABD(SDNode *N, const SDLoc &DL); SDValue foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode CC, const SDLoc &DL); + SDValue foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True, + SDValue False, ISD::CondCode CC, const SDLoc &DL); SDValue unfoldMaskedMerge(SDNode *N); SDValue unfoldExtremeBitClearingToShifts(SDNode *N); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, @@ -859,7 +861,7 @@ namespace { auto LK = TLI.getTypeConversion(*DAG.getContext(), VT); return (LK.first == TargetLoweringBase::TypeLegal || LK.first == TargetLoweringBase::TypePromoteInteger) && - TLI.isOperationLegal(ISD::UMIN, LK.second); + TLI.isOperationLegalOrCustom(ISD::UMIN, LK.second); } public: @@ -4093,6 +4095,26 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return N0; } + // (sub x, ([v]select (ult x, y), 0, y)) -> (umin x, (sub x, y)) + // (sub x, ([v]select (uge x, y), y, 0)) -> (umin x, (sub x, y)) + if (N1.hasOneUse() && hasUMin(VT)) { + SDValue Y; + if (sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETULT)), + m_Zero(), m_Deferred(Y))) || + sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETUGE)), + m_Deferred(Y), m_Zero())) || + sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), 
m_Value(Y), + m_SpecificCondCode(ISD::SETULT)), + m_Zero(), m_Deferred(Y))) || + sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETUGE)), + m_Deferred(Y), m_Zero()))) + return DAG.getNode(ISD::UMIN, DL, VT, N0, + DAG.getNode(ISD::SUB, DL, VT, N0, Y)); + } + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -4442,20 +4464,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { sd_match(N1, m_UMaxLike(m_Specific(A), m_Specific(B)))) return DAG.getNegative(DAG.getNode(ISD::ABDU, DL, VT, A, B), DL, VT); - // (sub x, (select (ult x, y), 0, y)) -> (umin x, (sub x, y)) - // (sub x, (select (uge x, y), y, 0)) -> (umin x, (sub x, y)) - if (hasUMin(VT)) { - SDValue Y; - if (sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETULT)), - m_Zero(), m_Deferred(Y)))) || - sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETUGE)), - m_Deferred(Y), m_Zero())))) - return DAG.getNode(ISD::UMIN, DL, VT, N0, - DAG.getNode(ISD::SUB, DL, VT, N0, Y)); - } - return SDValue(); } @@ -12173,6 +12181,30 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, return SDValue(); } +// ([v]select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) +// ([v]select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) +SDValue DAGCombiner::foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True, + SDValue False, ISD::CondCode CC, + const SDLoc &DL) { + APInt C; + EVT VT = True.getValueType(); + if (sd_match(RHS, m_ConstInt(C)) && hasUMin(VT)) { + if (CC == ISD::SETUGT && LHS == False && + sd_match(True, m_Add(m_Specific(False), m_SpecificInt(~C)))) { + SDValue AddC = DAG.getConstant(~C, DL, VT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, False, AddC); + return DAG.getNode(ISD::UMIN, DL, VT, Add, False); + } + if (CC == ISD::SETULT && LHS == True && + sd_match(False, m_Add(m_Specific(True), m_SpecificInt(-C)))) { + SDValue AddC = 
DAG.getConstant(-C, DL, VT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, True, AddC); + return DAG.getNode(ISD::UMIN, DL, VT, True, Add); + } + } + return SDValue(); +} + SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12358,24 +12390,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // (select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) // (select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) - APInt C; - if (sd_match(Cond1, m_ConstInt(C)) && hasUMin(VT)) { - if (CC == ISD::SETUGT && Cond0 == N2 && - sd_match(N1, m_Add(m_Specific(N2), m_SpecificInt(~C)))) { - // The resulting code relies on an unsigned wrap in ADD. - // Recreating ADD to drop possible nuw/nsw flags. - SDValue AddC = DAG.getConstant(~C, DL, VT); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N2, AddC); - return DAG.getNode(ISD::UMIN, DL, VT, Add, N2); - } - if (CC == ISD::SETULT && Cond0 == N1 && - sd_match(N2, m_Add(m_Specific(N1), m_SpecificInt(-C)))) { - // Ditto. 
- SDValue AddC = DAG.getConstant(-C, DL, VT); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, AddC); - return DAG.getNode(ISD::UMIN, DL, VT, N1, Add); - } - } + if (SDValue UMin = foldSelectToUMin(Cond0, Cond1, N1, N2, CC, DL)) + return UMin; } if (!VT.isVector()) @@ -13412,6 +13428,11 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { } } } + + // (vselect (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) + // (vselect (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) + if (SDValue UMin = foldSelectToUMin(LHS, RHS, N1, N2, CC, DL)) + return UMin; } if (SimplifySelectOps(N, N1, N2)) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 996b0edd2420..bc57537ad5df 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,6 +20,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,6 +36,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -7351,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return true; } // end switch (Pattern) return false; @@ -7391,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root, return false; 
} +static bool getGatherPattern(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns, + unsigned LoadLaneOpCode, unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. + if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have load into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single offset register. + auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq<unsigned>(1, NumLanes - 1); + SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end()); + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. + auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non debug use. 
+ if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns where we use LD1 instructions to load into +/// separate lanes of an 128 bit Neon register. We can increase Memory Level +/// Parallelism by loading into 2 Neon registers instead. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns) { + + // The pattern searches for loads into single lanes. + switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +static void +generateGatherPattern(MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<Register, unsigned> &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern + SmallVector<MachineInstr *, 16> LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane. 
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(true)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on opcode + auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); // immediate offset + }; + + // Load the remaining lanes into register 0. 
+ auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + auto PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg0 = PrevReg; + + // First load into register 1. Perform a LDRSui to zero out the upper lanes in + // a single instruction. + auto Lane0Load = *LoadToLaneInstrsAscending.begin(); + auto OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + auto DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitLoad->getOperand(3).getReg()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. + auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. 
+ auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. + MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7425,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8680,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case 
AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da333e4..02734866e712 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 8fb6ccaac2c9..0d4f24172b57 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -69,6 +69,39 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, Intrinsic::riscv_vlseg8_mask}; +static const Intrinsic::ID FixedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + +static const Intrinsic::ID ScalableVssegIntrIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. 
+ uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_NUWMul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } + + return false; +} + /// Lower an interleaved load into a vlsegN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -134,18 +167,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( return true; } -static const Intrinsic::ID FixedVssegIntrIds[] = { - Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, - Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, - Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, - Intrinsic::riscv_seg8_store_mask}; - -static const Intrinsic::ID ScalableVssegIntrIds[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; - /// Lower an interleaved store into a vssegN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): @@ -235,27 +256,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, return true; } -static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { - assert(N); - if (N == 1) - return true; - - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. 
- uint64_t C; - if (match(V, m_CombineOr(m_ConstantInt(C), - m_NUWMul(m_Value(), m_ConstantInt(C)))) && - C && C % N == 0) - return true; - - if (isPowerOf2_32(N)) { - KnownBits KB = llvm::computeKnownBits(V, DL); - return KB.countMinTrailingZeros() >= Log2_32(N); - } - - return false; -} - bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Instruction *Load, Value *Mask, IntrinsicInst *DI) const { const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 739ac00ba47c..ed08c0bfa2e7 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1223,6 +1223,24 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { return Result; } +Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { + const Loop *L = S->getLoop(); + BasicBlock *EB = L->getExitBlock(); + if (!EB || !EB->getSinglePredecessor() || + !SE.DT.dominates(EB, Builder.GetInsertBlock())) + return nullptr; + + for (auto &PN : EB->phis()) { + if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType()) + continue; + auto *ExitV = SE.getSCEV(&PN); + if (S == ExitV) + return &PN; + } + + return nullptr; +} + Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { // In canonical mode we compute the addrec as an expression of a canonical IV // using evaluateAtIteration and expand the resulting SCEV expression. This @@ -1262,6 +1280,11 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { return V; } + // If S is expanded outside the defining loop, check if there is a + // matching LCSSA phi node for it. 
+ if (Value *V = tryToReuseLCSSAPhi(S)) + return V; + // {X,+,F} --> X + {0,+,F} if (!S->getStart()->isZero()) { if (isa<PointerType>(S->getType())) { diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir new file mode 100644 index 000000000000..09eb18b0e357 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir @@ -0,0 +1,364 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: split_loads_to_fpr128 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = 
LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSui %0, 0 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %1 + %8:fpr128 = LD1i32 %7, 2, killed %2 + %9:fpr128 = LD1i32 %8, 3, killed %3 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: 
[[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHroX %0, killed %1, 0, 1 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %2 + %12:fpr128 = LD1i16 %11, 2, killed %3 + %13:fpr128 = LD1i16 %12, 3, killed %4 + %14:fpr128 = LD1i16 %13, 4, killed %5 + %15:fpr128 = LD1i16 %14, 5, killed %6 + %16:fpr128 = LD1i16 %15, 6, killed %7 + %17:fpr128 = LD1i16 %16, 7, killed %8 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16_ui +body: | + bb.0.entry: + 
liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHui %0, 0 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %1 + %12:fpr128 = LD1i16 %11, 2, killed %2 + 
%13:fpr128 = LD1i16 %12, 3, killed %3 + %14:fpr128 = LD1i16 %13, 4, killed %4 + %15:fpr128 = LD1i16 %14, 5, killed %5 + %16:fpr128 = LD1i16 %15, 6, killed %6 + %17:fpr128 = LD1i16 %16, 7, killed %7 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i8 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i8 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16 + ; CHECK-NEXT: [[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]] + ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, 
killed [[COPY6]] + ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]] + ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]] + ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]] + ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]] + ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]] + ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed [[COPY16]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:gpr64common = COPY $x9 + %10:gpr64common = COPY $x10 + %11:gpr64common = COPY $x11 + %12:gpr64common = COPY $x12 + %13:gpr64common = COPY $x13 + %14:gpr64common = COPY $x14 + %15:gpr64common = COPY $x15 + %16:gpr64common = COPY $x16 + %17:fpr8 = LDRBroX %0, killed %1, 0, 0 + %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub + %19:fpr128 = LD1i8 %18, 1, killed %2 + %20:fpr128 = LD1i8 %19, 2, killed %3 + %21:fpr128 = LD1i8 %20, 3, killed %4 + %22:fpr128 = LD1i8 %21, 4, killed %5 + %23:fpr128 = LD1i8 %22, 5, killed %6 + %24:fpr128 = LD1i8 %23, 6, killed %7 + %25:fpr128 = LD1i8 %24, 7, killed %8 + %26:fpr128 = LD1i8 %25, 8, killed %9 + %27:fpr128 = LD1i8 %26, 9, killed %10 + 
%28:fpr128 = LD1i8 %27, 10, killed %11 + %29:fpr128 = LD1i8 %28, 11, killed %12 + %30:fpr128 = LD1i8 %29, 12, killed %13 + %31:fpr128 = LD1i8 %30, 13, killed %14 + %32:fpr128 = LD1i8 %31, 14, killed %15 + %33:fpr128 = LD1i8 %32, 15, killed %16 + $q0 = COPY %33 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_missing_lanes +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: negative_pattern_missing_lanes + ; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0 + ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]] + + %0:gpr64common = COPY $x0 + %1:fpr128 = LDRQui $x1, 0 + %2:fpr128 = LD1i32 %1, 3, %0 + $q0 = COPY %2 + RET_ReallyLR implicit $q0 + +--- +name: out_of_order_lanes +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: out_of_order_lanes + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 2, 
killed %2 + %8:fpr128 = LD1i32 %7, 1, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_no_subreg_to_reg +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3 + + ; CHECK-LABEL: name: negative_pattern_no_subreg_to_reg + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[INITIAL_VEC:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[INITIAL_VEC]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:fpr128 = LDRQui %0, 0 + %5:fpr128 = LD1i32 %4, 1, killed %1 + %6:fpr128 = LD1i32 %5, 2, killed %2 + %7:fpr128 = LD1i32 %6, 3, killed %3 + $q0 = COPY %7 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_multiple_users +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: negative_pattern_multiple_users + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 
[[LD_LANE_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: $q1 = COPY [[LD_LANE_2]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + $q1 = COPY %8 + RET_ReallyLR implicit $q0, implicit $q1 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 7686740aec30..13434fabefa7 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -203,89 +203,93 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: ldr s17, [sp, #40] -; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: ldr s17, [sp, #32] +; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 ; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: add x10, sp, #64 ; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: ldr s3, [sp, #32] -; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: ld1 { v17.s }[1], [x10] -; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: ldr s16, [sp, #8] ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ld1 { v3.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #72 -; 
CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: add x11, sp, #72 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ldr s18, [x10] +; CHECK-NEXT: add x9, sp, #80 +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: ldr s16, [sp, #8] +; CHECK-NEXT: ldr s3, [sp, #96] +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: ldr s2, [sp] -; CHECK-NEXT: ld1 { v16.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: ldr s20, [sp, #136] ; CHECK-NEXT: mov v1.s[2], v5.s[0] -; CHECK-NEXT: ld1 { v17.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ldr s5, [sp, #96] -; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: ldr s5, [sp, #40] ; CHECK-NEXT: mov v0.s[2], v4.s[0] -; CHECK-NEXT: add x9, sp, #88 -; CHECK-NEXT: ldr s4, [sp, #104] -; CHECK-NEXT: ldr s19, [sp, #192] ; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #80 -; CHECK-NEXT: ld1 { v17.s }[3], [x9] -; CHECK-NEXT: mov v1.s[3], v7.s[0] -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v3.s }[3], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: ldr s7, [sp, #128] +; CHECK-NEXT: ldr s19, [x11] ; CHECK-NEXT: add x10, sp, #144 +; CHECK-NEXT: zip1 v4.2d, v17.2d, v18.2d +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: ldr s18, [sp, #136] +; CHECK-NEXT: ld1 { v19.s }[1], [x9] ; CHECK-NEXT: mov v0.s[3], v6.s[0] -; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ldr s6, [sp, #128] +; CHECK-NEXT: mov v1.s[3], v7.s[0] +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ldr s7, [sp, #104] +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v6.s }[1], [x10] +; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d +; CHECK-NEXT: add x10, sp, #120 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: fmul v6.4s, v17.4s, v1.4s -; CHECK-NEXT: 
fmul v18.4s, v4.4s, v16.4s -; CHECK-NEXT: fmul v16.4s, v5.4s, v16.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add x10, sp, #208 -; CHECK-NEXT: ld1 { v7.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v19.s }[1], [x10] -; CHECK-NEXT: ld1 { v20.s }[1], [x9] +; CHECK-NEXT: ldr s17, [x11] ; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: fneg v6.4s, v6.4s -; CHECK-NEXT: fneg v18.4s, v18.4s -; CHECK-NEXT: fmla v16.4s, v2.4s, v4.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v17.4s -; CHECK-NEXT: ld1 { v7.s }[3], [x9] -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: ld1 { v20.s }[2], [x9] -; CHECK-NEXT: ldr s4, [sp, #200] +; CHECK-NEXT: add x10, sp, #16 +; CHECK-NEXT: add x11, sp, #168 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v2.s }[1], [x10] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s +; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s +; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: fmul v1.4s, v4.4s, v1.4s +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: ldr s21, [x11] +; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d +; CHECK-NEXT: ldr s17, [sp, #192] +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #208 +; CHECK-NEXT: ld1 { v21.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #216 -; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v18.4s, v2.4s, v5.4s -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s -; CHECK-NEXT: fsub v1.4s, v19.4s, v16.4s -; CHECK-NEXT: ld1 { v20.s }[3], [x10] -; CHECK-NEXT: fadd v2.4s, v4.4s, v18.4s -; CHECK-NEXT: fadd v3.4s, v20.4s, v6.4s +; CHECK-NEXT: fneg v19.4s, v19.4s +; CHECK-NEXT: fneg v20.4s, v20.4s +; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s +; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: ldr s5, [sp, #200] +; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fmla v19.4s, v0.4s, v4.4s +; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s +; 
CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s +; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s +; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s +; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #12 -; CHECK-NEXT: trn2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v5.16b, #8 +; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 ; CHECK-NEXT: rev64 v4.4s, v4.4s -; CHECK-NEXT: trn2 v2.4s, v4.4s, v5.4s -; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8 -; CHECK-NEXT: mov v4.d[1], v2.d[0] +; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s +; CHECK-NEXT: zip2 v4.4s, v0.4s, v2.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v1.16b, v3.16b, v1.16b, #8 +; CHECK-NEXT: mov v4.d[1], v3.d[0] ; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: stp q4, q1, [x8, #16] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index acf15f1bd117..e6f27b95d92c 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -186,8 +186,9 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ld1 { v0.s }[1], [x1] -; CHECK-NEXT: ld1 { v0.s }[2], [x2] -; CHECK-NEXT: ld1 { v0.s }[3], [x3] +; CHECK-NEXT: ldr s1, [x2] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index c6b8e41f9bdf..4906e2e15e51 100644 --- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll 
@@ -1431,6 +1431,7 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -1439,30 +1440,30 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; 
FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret @@ -2012,6 +2013,7 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -2020,30 +2022,30 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, 
v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 4c28c9082402..ae2ef2649102 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -2509,87 +2509,88 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; ; CHECK-GI-LABEL: fshl_v7i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr s3, [sp, #48] -; CHECK-GI-NEXT: ldr s20, [sp, #56] -; CHECK-GI-NEXT: add x9, sp, #56 +; CHECK-GI-NEXT: ldr s17, [sp, #48] +; CHECK-GI-NEXT: add x8, sp, #56 +; CHECK-GI-NEXT: add x9, sp, #64 ; CHECK-GI-NEXT: ldr s4, [sp, #48] -; CHECK-GI-NEXT: ldr s7, [sp, #80] -; CHECK-GI-NEXT: mov w12, #-1 // =0xffffffff -; CHECK-GI-NEXT: ldr s21, [sp, #88] -; CHECK-GI-NEXT: mov v3.s[1], v20.s[0] -; CHECK-GI-NEXT: fmov s20, w12 -; CHECK-GI-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-GI-NEXT: ldr s17, [sp] -; CHECK-GI-NEXT: add x13, sp, #64 -; CHECK-GI-NEXT: mov v7.s[1], v21.s[0] +; CHECK-GI-NEXT: ldr s21, [sp, #56] +; CHECK-GI-NEXT: mov w10, #-1 // =0xffffffff +; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-GI-NEXT: ldr s20, [x9] +; CHECK-GI-NEXT: add x8, sp, #72 +; CHECK-GI-NEXT: mov v4.s[1], v21.s[0] ; CHECK-GI-NEXT: fmov s21, w7 +; CHECK-GI-NEXT: ldr s6, [sp] +; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8] ; CHECK-GI-NEXT: ldr s19, [sp, #64] -; CHECK-GI-NEXT: mov w11, #31 // =0x1f -; CHECK-GI-NEXT: mov v20.s[1], w12 +; CHECK-GI-NEXT: ldr s7, [sp, #80] +; CHECK-GI-NEXT: ldr s22, [sp, #88] +; CHECK-GI-NEXT: mov w9, #31 // =0x1f +; CHECK-GI-NEXT: mov w11, #1 // =0x1 +; CHECK-GI-NEXT: mov v21.s[1], v6.s[0] +; CHECK-GI-NEXT: fmov s6, w9 ; CHECK-GI-NEXT: ldr s18, [sp, #96] -; CHECK-GI-NEXT: ld1 { v4.s }[2], [x13] -; CHECK-GI-NEXT: mov w13, #1 // =0x1 -; CHECK-GI-NEXT: mov v3.s[2], v19.s[0] -; CHECK-GI-NEXT: mov v21.s[1], v17.s[0] -; 
CHECK-GI-NEXT: fmov s17, w11 -; CHECK-GI-NEXT: fmov s19, w13 +; CHECK-GI-NEXT: zip1 v17.2d, v17.2d, v20.2d +; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v7.s[1], v22.s[0] +; CHECK-GI-NEXT: mov v4.s[2], v19.s[0] +; CHECK-GI-NEXT: fmov s19, w11 ; CHECK-GI-NEXT: fmov s23, w0 -; CHECK-GI-NEXT: fmov s24, w11 -; CHECK-GI-NEXT: ldr s6, [sp, #8] +; CHECK-GI-NEXT: mov v6.s[1], w9 +; CHECK-GI-NEXT: fmov s24, w9 +; CHECK-GI-NEXT: ldr s2, [sp, #8] +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: ldr s0, [sp, #24] ; CHECK-GI-NEXT: ldr s5, [sp, #32] +; CHECK-GI-NEXT: mov v19.s[1], w11 ; CHECK-GI-NEXT: mov v7.s[2], v18.s[0] -; CHECK-GI-NEXT: mov v17.s[1], w11 -; CHECK-GI-NEXT: mov v19.s[1], w13 -; CHECK-GI-NEXT: mov v20.s[2], w12 ; CHECK-GI-NEXT: ldr s16, [sp, #72] ; CHECK-GI-NEXT: mov v23.s[1], w1 ; CHECK-GI-NEXT: ldr s18, [sp, #80] -; CHECK-GI-NEXT: mov v21.s[2], v6.s[0] -; CHECK-GI-NEXT: mov v24.s[1], w11 +; CHECK-GI-NEXT: mov v21.s[2], v2.s[0] +; CHECK-GI-NEXT: mov v24.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] -; CHECK-GI-NEXT: fmov s6, w4 -; CHECK-GI-NEXT: add x10, sp, #88 +; CHECK-GI-NEXT: fmov s5, w4 +; CHECK-GI-NEXT: mov v20.s[2], w10 +; CHECK-GI-NEXT: add x8, sp, #88 ; CHECK-GI-NEXT: movi v22.4s, #31 -; CHECK-GI-NEXT: mov v3.s[3], v16.s[0] -; CHECK-GI-NEXT: mov v17.s[2], w11 -; CHECK-GI-NEXT: mov v19.s[2], w13 -; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: ldr s1, [sp, #40] -; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10] -; CHECK-GI-NEXT: eor v5.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v4.s[3], v16.s[0] +; CHECK-GI-NEXT: mov v6.s[2], w9 +; CHECK-GI-NEXT: mov v19.s[2], w11 +; CHECK-GI-NEXT: ldr s1, [sp, #16] +; CHECK-GI-NEXT: ldr s3, [sp, #40] +; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8] ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: mov v6.s[1], w5 -; CHECK-GI-NEXT: add x8, sp, #72 -; CHECK-GI-NEXT: add x9, sp, #96 -; CHECK-GI-NEXT: mov v21.s[3], v2.s[0] -; CHECK-GI-NEXT: mov v24.s[2], w11 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: ld1 
{ v4.s }[3], [x8] -; CHECK-GI-NEXT: bic v2.16b, v22.16b, v3.16b -; CHECK-GI-NEXT: ld1 { v18.s }[2], [x9] -; CHECK-GI-NEXT: and v1.16b, v5.16b, v17.16b +; CHECK-GI-NEXT: mov v5.s[1], w5 +; CHECK-GI-NEXT: add x8, sp, #96 +; CHECK-GI-NEXT: eor v2.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v21.s[3], v1.s[0] +; CHECK-GI-NEXT: mov v24.s[2], w9 +; CHECK-GI-NEXT: mov v0.s[2], v3.s[0] +; CHECK-GI-NEXT: bic v1.16b, v22.16b, v4.16b +; CHECK-GI-NEXT: ld1 { v18.s }[2], [x8] ; CHECK-GI-NEXT: neg v3.4s, v19.4s +; CHECK-GI-NEXT: and v4.16b, v17.16b, v22.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b ; CHECK-GI-NEXT: mov v23.s[3], w3 -; CHECK-GI-NEXT: mov v6.s[2], w6 -; CHECK-GI-NEXT: and v4.16b, v4.16b, v22.16b -; CHECK-GI-NEXT: ushr v5.4s, v21.4s, #1 -; CHECK-GI-NEXT: neg v2.4s, v2.4s -; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b +; CHECK-GI-NEXT: mov v5.s[2], w6 +; CHECK-GI-NEXT: ushr v6.4s, v21.4s, #1 ; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: neg v2.4s, v2.4s ; CHECK-GI-NEXT: ushl v3.4s, v23.4s, v4.4s -; CHECK-GI-NEXT: ushl v2.4s, v5.4s, v2.4s -; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b +; CHECK-GI-NEXT: ushl v1.4s, v6.4s, v1.4s +; CHECK-GI-NEXT: ushl v4.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b ; CHECK-GI-NEXT: mov s2, v1.s[1] ; CHECK-GI-NEXT: mov s3, v1.s[2] ; CHECK-GI-NEXT: mov s4, v1.s[3] +; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: mov s5, v0.s[1] ; CHECK-GI-NEXT: mov s6, v0.s[2] -; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: fmov w4, s0 ; CHECK-GI-NEXT: fmov w1, s2 ; CHECK-GI-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll index 2213aa1429db..4e1876db772e 100644 --- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll +++ 
b/llvm/test/CodeGen/AArch64/llvm.frexp.ll @@ -700,13 +700,14 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; CHECK-NEXT: ldr s1, [sp, #44] ; CHECK-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ld1 { v1.s }[1], [x19] ; CHECK-NEXT: mov v2.s[3], v0.s[0] -; CHECK-NEXT: ld1 { v1.s }[2], [x20] +; CHECK-NEXT: ld1 { v1.s }[1], [x19] +; CHECK-NEXT: ldr s0, [x20] +; CHECK-NEXT: ld1 { v0.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: ld1 { v1.s }[3], [x21] ; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: zip1 v1.2d, v1.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret ; @@ -872,10 +873,11 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; CHECK-NEXT: bl frexpf ; CHECK-NEXT: ldr s0, [sp, #28] ; CHECK-NEXT: ld1 { v0.s }[1], [x19] -; CHECK-NEXT: ld1 { v0.s }[2], [x20] +; CHECK-NEXT: ldr s1, [x20] +; CHECK-NEXT: ld1 { v1.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s }[3], [x21] ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 4f0c4080aa0c..9443004ea434 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -6810,195 +6810,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-SD-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: .cfi_offset w29, -16 -; CHECK-SD-NEXT: ldr b5, [sp, #208] +; CHECK-SD-NEXT: ldr b0, [sp, #208] ; CHECK-SD-NEXT: add x8, sp, #216 -; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: add x9, sp, #272 +; CHECK-SD-NEXT: ldr b2, [sp, #80] ; CHECK-SD-NEXT: ldr b4, [sp, #976] -; CHECK-SD-NEXT: add x9, sp, #984 -; CHECK-SD-NEXT: add x12, sp, #328 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #224 -; CHECK-SD-NEXT: movi v1.16b, #1 -; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] -; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 -; CHECK-SD-NEXT: add x11, sp, #992 ; CHECK-SD-NEXT: ldr b6, [sp, #720] -; CHECK-SD-NEXT: ldr b7, [sp, #80] -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #224 +; CHECK-SD-NEXT: fmov s16, w0 +; CHECK-SD-NEXT: ldr b17, [sp, #848] +; CHECK-SD-NEXT: add x10, sp, #24 +; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #232 -; CHECK-SD-NEXT: add x13, sp, #88 -; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11] -; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #856 -; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: add x14, sp, #1008 -; CHECK-SD-NEXT: add x15, sp, #872 -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: mov v16.b[1], w1 +; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #240 -; CHECK-SD-NEXT: add x16, sp, #888 -; CHECK-SD-NEXT: add x10, sp, #16 -; CHECK-SD-NEXT: add x9, sp, #24 -; CHECK-SD-NEXT: add x11, sp, #40 -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: mov v16.b[2], w2 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #248 -; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: mov v16.b[3], w3 +; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8] ; 
CHECK-SD-NEXT: add x8, sp, #256 -; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #264 -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] -; CHECK-SD-NEXT: add x8, sp, #272 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8] +; CHECK-SD-NEXT: mov v16.b[4], w4 +; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-SD-NEXT: ldr b1, [x9] ; CHECK-SD-NEXT: add x8, sp, #280 -; CHECK-SD-NEXT: mov v0.b[6], w6 -; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: mov v16.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #288 -; CHECK-SD-NEXT: mov v0.b[7], w7 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #296 -; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10] -; CHECK-SD-NEXT: add x10, sp, #128 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] +; CHECK-SD-NEXT: mov v16.b[6], w6 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #304 -; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9] -; CHECK-SD-NEXT: add x9, sp, #136 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8] +; CHECK-SD-NEXT: mov v16.b[7], w7 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #312 -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #320 -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #32 -; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8] -; CHECK-SD-NEXT: add x8, sp, #144 -; CHECK-SD-NEXT: ld1 { v5.b }[15], [x12] -; CHECK-SD-NEXT: add x12, sp, #728 -; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12] -; CHECK-SD-NEXT: add x12, sp, #1000 -; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11] -; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12] -; CHECK-SD-NEXT: add x12, sp, #736 -; CHECK-SD-NEXT: add x11, sp, #920 -; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: ldr b5, [sp, #848] -; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12] -; 
CHECK-SD-NEXT: add x12, sp, #48 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #744 -; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14] -; CHECK-SD-NEXT: add x14, sp, #96 -; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12] -; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13] -; CHECK-SD-NEXT: add x13, sp, #864 -; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14] -; CHECK-SD-NEXT: add x14, sp, #1016 -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13] -; CHECK-SD-NEXT: add x13, sp, #752 -; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14] -; CHECK-SD-NEXT: add x14, sp, #104 -; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13] -; CHECK-SD-NEXT: add x13, sp, #1024 -; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15] -; CHECK-SD-NEXT: add x15, sp, #760 -; CHECK-SD-NEXT: add x14, sp, #112 -; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13] -; CHECK-SD-NEXT: add x13, sp, #880 -; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15] -; CHECK-SD-NEXT: add x15, sp, #1032 -; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13] -; CHECK-SD-NEXT: add x14, sp, #768 -; CHECK-SD-NEXT: add x13, sp, #120 -; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15] -; CHECK-SD-NEXT: add x15, sp, #1040 -; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14] -; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13] -; CHECK-SD-NEXT: add x13, sp, #776 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16] -; CHECK-SD-NEXT: add x14, sp, #1048 -; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15] -; CHECK-SD-NEXT: add x15, sp, #896 -; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13] -; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-SD-NEXT: add x10, sp, #784 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-SD-NEXT: add x13, sp, #1056 -; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14] -; CHECK-SD-NEXT: add x14, sp, #904 -; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10] -; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-SD-NEXT: add x9, sp, #792 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-SD-NEXT: add x10, sp, #1064 -; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13] -; CHECK-SD-NEXT: add x13, sp, #912 -; CHECK-SD-NEXT: ld1 { 
v6.b }[9], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8] -; CHECK-SD-NEXT: add x9, sp, #800 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13] +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #328 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #96 +; CHECK-SD-NEXT: add x9, sp, #144 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #104 +; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: movi v1.16b, #1 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #128 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #136 +; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-SD-NEXT: ldr b3, [x9] ; CHECK-SD-NEXT: add x8, sp, #152 -; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10] -; CHECK-SD-NEXT: add x10, sp, #1072 -; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8] -; CHECK-SD-NEXT: add x9, sp, #808 -; CHECK-SD-NEXT: ld1 { v5.b }[9], [x11] -; CHECK-SD-NEXT: add x8, sp, #56 -; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #160 -; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #928 -; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10] -; CHECK-SD-NEXT: add x10, sp, #1080 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9] +; CHECK-SD-NEXT: add x9, sp, #984 +; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #160 +; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #168 +; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #176 +; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #184 +; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #192 +; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #200 +; 
CHECK-SD-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #992 +; CHECK-SD-NEXT: add x9, sp, #1040 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1000 +; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1008 +; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1016 +; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1024 +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1032 +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-SD-NEXT: ldr b5, [x9] +; CHECK-SD-NEXT: add x8, sp, #1048 +; CHECK-SD-NEXT: add x9, sp, #728 +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #1056 +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1064 +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1072 +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1080 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1088 +; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1096 +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #736 +; CHECK-SD-NEXT: add x9, sp, #784 +; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #744 +; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #752 +; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b +; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #760 +; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #768 +; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #776 +; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8] +; CHECK-SD-NEXT: ldr b7, [x9] +; CHECK-SD-NEXT: add x8, sp, #792 +; CHECK-SD-NEXT: 
add x9, sp, #856 +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #800 +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #808 +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #816 -; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10] -; CHECK-SD-NEXT: add x9, sp, #168 -; CHECK-SD-NEXT: add x10, sp, #176 -; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8] -; CHECK-SD-NEXT: add x8, sp, #936 -; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #1088 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] -; CHECK-SD-NEXT: add x8, sp, #64 -; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9] -; CHECK-SD-NEXT: add x9, sp, #824 -; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[13], [x9] -; CHECK-SD-NEXT: add x9, sp, #944 -; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #1096 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9] +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #824 +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #832 -; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10] -; CHECK-SD-NEXT: add x9, sp, #184 -; CHECK-SD-NEXT: add x10, sp, #72 -; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #952 -; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #840 -; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10] -; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b -; CHECK-SD-NEXT: add x9, sp, #192 -; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8] +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #864 +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9] +; CHECK-SD-NEXT: add x9, sp, #912 +; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #872 +; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d +; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10] +; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8] +; CHECK-SD-NEXT: 
add x8, sp, #880 +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #888 +; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #896 +; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #904 +; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-SD-NEXT: ldr b18, [x9] +; CHECK-SD-NEXT: add x8, sp, #920 +; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 +; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8] +; CHECK-SD-NEXT: add x8, sp, #928 +; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8] +; CHECK-SD-NEXT: add x8, sp, #936 +; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8] +; CHECK-SD-NEXT: add x8, sp, #944 +; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8] +; CHECK-SD-NEXT: add x8, sp, #952 +; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #64 +; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8] ; CHECK-SD-NEXT: add x8, sp, #960 -; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: add x8, sp, #200 -; CHECK-SD-NEXT: add x9, sp, #968 -; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8] -; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9] -; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b -; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s +; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #72 +; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8] +; CHECK-SD-NEXT: add x8, sp, #968 +; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b +; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d +; CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; 
CHECK-SD-NEXT: add v0.4s, v5.4s, v19.4s ; CHECK-SD-NEXT: addv s0, v0.4s ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll index f8ba150a0405..f7a87ae340a7 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal.ll @@ -683,41 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) { ; ; CHECK-BE-LABEL: test_stnp_v17f32: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 +; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-BE-NEXT: ldr s16, [sp, #36] +; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BE-NEXT: ldr s17, [sp, #4] -; CHECK-BE-NEXT: add x8, sp, #44 -; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x8, sp, #12 +; CHECK-BE-NEXT: add x9, sp, #20 +; CHECK-BE-NEXT: ldr s16, [sp, #36] ; CHECK-BE-NEXT: mov v0.s[1], v1.s[0] +; CHECK-BE-NEXT: ldr s1, [sp, #4] +; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x10, sp, #52 ; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 ; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-BE-NEXT: ldr s1, [sp, #68] -; CHECK-BE-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #12 -; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #52 -; CHECK-BE-NEXT: str s1, [x0, #64] -; CHECK-BE-NEXT: ld1 { v16.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #20 +; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] +; CHECK-BE-NEXT: ldr s5, [x9] +; CHECK-BE-NEXT: add x8, sp, #28 +; CHECK-BE-NEXT: add x9, sp, #44 +; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8] +; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9] +; 
CHECK-BE-NEXT: ldr s17, [x10] +; CHECK-BE-NEXT: add x8, sp, #60 ; CHECK-BE-NEXT: mov v4.s[2], v6.s[0] ; CHECK-BE-NEXT: mov v0.s[2], v2.s[0] -; CHECK-BE-NEXT: ld1 { v17.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #60 -; CHECK-BE-NEXT: ld1 { v16.s }[3], [x8] -; CHECK-BE-NEXT: add x8, sp, #28 -; CHECK-BE-NEXT: ld1 { v17.s }[3], [x8] +; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-BE-NEXT: ldr s2, [sp, #68] +; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: str s2, [x0, #64] +; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d ; CHECK-BE-NEXT: mov v4.s[3], v7.s[0] -; CHECK-BE-NEXT: add x8, x0, #48 ; CHECK-BE-NEXT: mov v0.s[3], v3.s[0] -; CHECK-BE-NEXT: st1 { v16.4s }, [x8] -; CHECK-BE-NEXT: add x8, x0, #32 -; CHECK-BE-NEXT: st1 { v17.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: st1 { v5.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x8] ; CHECK-BE-NEXT: st1 { v0.4s }, [x0] ; CHECK-BE-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 35b9457fbc1f..9df71cfc96cc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -5712,9 +5712,8 @@ define <8 x i8> @vsub_if_uge_v8i8(<8 x i8> %va, <8 x i8> %vb) { ; CHECK-LABEL: vsub_if_uge_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <8 x i8> %va, %vb %select = select <8 x i1> %cmp, <8 x i8> zeroinitializer, <8 x i8> %vb @@ -5725,9 +5724,9 @@ define <8 x i8> @vsub_if_uge_v8i8(<8 x i8> %va, <8 x i8> %vb) { define <8 x i8> @vsub_if_uge_swapped_v8i8(<8 x i8> %va, <8 x i8> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, 
mf2, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <8 x i8> %va, %vb %select = select <8 x i1> %cmp, <8 x i8> %vb, <8 x i8> zeroinitializer @@ -5739,9 +5738,8 @@ define <8 x i16> @vsub_if_uge_v8i16(<8 x i16> %va, <8 x i16> %vb) { ; CHECK-LABEL: vsub_if_uge_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <8 x i16> %va, %vb %select = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vb @@ -5752,9 +5750,9 @@ define <8 x i16> @vsub_if_uge_v8i16(<8 x i16> %va, <8 x i16> %vb) { define <8 x i16> @vsub_if_uge_swapped_v8i16(<8 x i16> %va, <8 x i16> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <8 x i16> %va, %vb %select = select <8 x i1> %cmp, <8 x i16> %vb, <8 x i16> zeroinitializer @@ -5766,9 +5764,8 @@ define <4 x i32> @vsub_if_uge_v4i32(<4 x i32> %va, <4 x i32> %vb) { ; CHECK-LABEL: vsub_if_uge_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <4 x i32> %va, %vb %select = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vb @@ -5779,9 +5776,9 @@ define <4 x i32> @vsub_if_uge_v4i32(<4 x i32> %va, <4 x i32> %vb) { define <4 x i32> @vsub_if_uge_swapped_v4i32(<4 x i32> %va, <4 x 
i32> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <4 x i32> %va, %vb %select = select <4 x i1> %cmp, <4 x i32> %vb, <4 x i32> zeroinitializer @@ -5793,9 +5790,8 @@ define <2 x i64> @vsub_if_uge_v2i64(<2 x i64> %va, <2 x i64> %vb) { ; CHECK-LABEL: vsub_if_uge_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <2 x i64> %va, %vb %select = select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vb @@ -5806,9 +5802,9 @@ define <2 x i64> @vsub_if_uge_v2i64(<2 x i64> %va, <2 x i64> %vb) { define <2 x i64> @vsub_if_uge_swapped_v2i64(<2 x i64> %va, <2 x i64> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <2 x i64> %va, %vb %select = select <2 x i1> %cmp, <2 x i64> %vb, <2 x i64> zeroinitializer @@ -5819,9 +5815,9 @@ define <2 x i64> @vsub_if_uge_swapped_v2i64(<2 x i64> %va, <2 x i64> %vb) { define <8 x i8> @sub_if_uge_C_v8i8(<8 x i8> %x) { ; CHECK-LABEL: sub_if_uge_C_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmsgtu.vi v0, v8, 12 -; CHECK-NEXT: vadd.vi v8, v8, -13, v0.t +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vi v9, v8, -13 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <8 x i8> %x, splat (i8 
12) %sub = add <8 x i8> %x, splat (i8 -13) @@ -5832,11 +5828,10 @@ define <8 x i8> @sub_if_uge_C_v8i8(<8 x i8> %x) { define <8 x i16> @sub_if_uge_C_v8i16(<8 x i16> %x) { ; CHECK-LABEL: sub_if_uge_C_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 2000 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vmsgtu.vx v0, v8, a0 ; CHECK-NEXT: li a0, -2001 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <8 x i16> %x, splat (i16 2000) %sub = add <8 x i16> %x, splat (i16 -2001) @@ -5847,13 +5842,11 @@ define <8 x i16> @sub_if_uge_C_v8i16(<8 x i16> %x) { define <4 x i32> @sub_if_uge_C_v4i32(<4 x i32> %x) { ; CHECK-LABEL: sub_if_uge_C_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -16 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmsgtu.vx v0, v8, a0 ; CHECK-NEXT: lui a0, 1048560 ; CHECK-NEXT: addi a0, a0, 15 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <4 x i32> %x, splat (i32 65520) %sub = add <4 x i32> %x, splat (i32 -65521) @@ -5864,14 +5857,11 @@ define <4 x i32> @sub_if_uge_C_v4i32(<4 x i32> %x) { define <4 x i32> @sub_if_uge_C_swapped_v4i32(<4 x i32> %x) { ; CHECK-LABEL: sub_if_uge_C_swapped_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -15 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: lui a0, 1048560 ; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vadd.vx v9, v8, a0 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <4 x i32> %x, splat (i32 65521) %sub = add <4 x i32> %x, splat (i32 -65521) @@ -5883,38 +5873,28 @@ define <2 x i64> 
@sub_if_uge_C_v2i64(<2 x i64> %x) nounwind { ; RV32-LABEL: sub_if_uge_C_v2i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: li a0, 1 -; RV32-NEXT: lui a1, 172127 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: addi a1, a1, 512 -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: sw a0, 4(sp) ; RV32-NEXT: li a0, -2 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a2), zero ; RV32-NEXT: lui a1, 876449 ; RV32-NEXT: addi a1, a1, -513 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vmsltu.vv v0, v9, v8 -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vadd.vv v9, v8, v9 +; RV32-NEXT: vminu.vv v8, v9, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: sub_if_uge_C_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, 2384 -; RV64-NEXT: addi a0, a0, 761 -; RV64-NEXT: slli a0, a0, 9 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV64-NEXT: vmsgtu.vx v0, v8, a0 ; RV64-NEXT: lui a0, 1048278 ; RV64-NEXT: addi a0, a0, -95 ; RV64-NEXT: slli a0, a0, 12 ; RV64-NEXT: addi a0, a0, -513 -; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vadd.vx v9, v8, a0 +; RV64-NEXT: vminu.vv v8, v9, v8 ; RV64-NEXT: ret %cmp = icmp ugt <2 x i64> %x, splat (i64 5000000000) %sub = add <2 x i64> %x, splat (i64 -5000000001) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 041aae229288..019bbe2908a2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -1718,6 +1718,28 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) { ret void } +define <4 x i32> @vp_load_factor3_one_active(ptr %ptr) { +; CHECK-LABEL: vp_load_factor3_one_active: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> + ret <4 x i32> %v0 +} + +define <4 x i32> @vp_load_factor5_one_active(ptr %ptr) { +; CHECK-LABEL: vp_load_factor5_one_active: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20) + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15> + ret <4 x i32> %v0 +} + define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) { ; CHECK-LABEL: store_factor4_one_active: ; CHECK: # %bb.0: @@ -1804,8 +1826,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI51_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI51_0) +; RV32-NEXT: lui a1, %hi(.LCPI53_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI53_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1880,8 +1902,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI52_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI52_0) +; RV32-NEXT: lui a0, %hi(.LCPI54_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI54_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll index a21a526e00ec..9b58cb3d5c89 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll @@ -898,9 +898,8 @@ define <vscale x 2 x i8> @vsub_if_uge_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 ; CHECK-LABEL: vsub_if_uge_nxv2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <vscale x 2 x i8> %va, %vb %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> %vb @@ -911,9 +910,9 @@ define <vscale x 2 x i8> @vsub_if_uge_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 define <vscale x 2 x i8> @vsub_if_uge_swapped_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <vscale x 2 x i8> %va, %vb %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i8> %vb, <vscale x 2 x i8> zeroinitializer @@ -925,9 +924,8 @@ define <vscale x 2 x i16> @vsub_if_uge_nxv2i16(<vscale x 2 x i16> %va, <vscale x ; CHECK-LABEL: vsub_if_uge_nxv2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <vscale x 2 x i16> %va, %vb %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> %vb @@ -938,9 +936,9 @@ define <vscale x 2 x i16> @vsub_if_uge_nxv2i16(<vscale x 2 x i16> %va, <vscale x define <vscale x 2 x i16> @vsub_if_uge_swapped_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb) { ; CHECK-LABEL: 
vsub_if_uge_swapped_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <vscale x 2 x i16> %va, %vb %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i16> %vb, <vscale x 2 x i16> zeroinitializer @@ -952,9 +950,8 @@ define <vscale x 2 x i32> @vsub_if_uge_nxv2i32(<vscale x 2 x i32> %va, <vscale x ; CHECK-LABEL: vsub_if_uge_nxv2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <vscale x 2 x i32> %va, %vb %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %vb @@ -965,9 +962,9 @@ define <vscale x 2 x i32> @vsub_if_uge_nxv2i32(<vscale x 2 x i32> %va, <vscale x define <vscale x 2 x i32> @vsub_if_uge_swapped_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <vscale x 2 x i32> %va, %vb %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i32> %vb, <vscale x 2 x i32> zeroinitializer @@ -979,9 +976,8 @@ define <vscale x 2 x i64> @vsub_if_uge_nxv2i64(<vscale x 2 x i64> %va, <vscale x ; CHECK-LABEL: vsub_if_uge_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v10 ; CHECK-NEXT: vsub.vv v10, v8, v10 -; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v10 ; 
CHECK-NEXT: ret %cmp = icmp ult <vscale x 2 x i64> %va, %vb %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> %vb @@ -992,9 +988,9 @@ define <vscale x 2 x i64> @vsub_if_uge_nxv2i64(<vscale x 2 x i64> %va, <vscale x define <vscale x 2 x i64> @vsub_if_uge_swapped_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v10, v8 -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsub.vv v10, v8, v10 +; CHECK-NEXT: vminu.vv v8, v8, v10 ; CHECK-NEXT: ret %cmp = icmp uge <vscale x 2 x i64> %va, %vb %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i64> %vb, <vscale x 2 x i64> zeroinitializer @@ -1005,9 +1001,9 @@ define <vscale x 2 x i64> @vsub_if_uge_swapped_nxv2i64(<vscale x 2 x i64> %va, < define <vscale x 2 x i8> @sub_if_uge_C_nxv2i8(<vscale x 2 x i8> %x) { ; CHECK-LABEL: sub_if_uge_C_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vmsgtu.vi v0, v8, 12 -; CHECK-NEXT: vadd.vi v8, v8, -13, v0.t +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: vadd.vi v9, v8, -13 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <vscale x 2 x i8> %x, splat (i8 12) %sub = add <vscale x 2 x i8> %x, splat (i8 -13) @@ -1018,11 +1014,10 @@ define <vscale x 2 x i8> @sub_if_uge_C_nxv2i8(<vscale x 2 x i8> %x) { define <vscale x 2 x i16> @sub_if_uge_C_nxv2i16(<vscale x 2 x i16> %x) { ; CHECK-LABEL: sub_if_uge_C_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 2000 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmsgtu.vx v0, v8, a0 ; CHECK-NEXT: li a0, -2001 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <vscale x 2 x 
i16> %x, splat (i16 2000) %sub = add <vscale x 2 x i16> %x, splat (i16 -2001) @@ -1033,13 +1028,11 @@ define <vscale x 2 x i16> @sub_if_uge_C_nxv2i16(<vscale x 2 x i16> %x) { define <vscale x 2 x i32> @sub_if_uge_C_nxv2i32(<vscale x 2 x i32> %x) { ; CHECK-LABEL: sub_if_uge_C_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -16 -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu -; CHECK-NEXT: vmsgtu.vx v0, v8, a0 ; CHECK-NEXT: lui a0, 1048560 ; CHECK-NEXT: addi a0, a0, 15 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <vscale x 2 x i32> %x, splat (i32 65520) %sub = add <vscale x 2 x i32> %x, splat (i32 -65521) @@ -1050,14 +1043,11 @@ define <vscale x 2 x i32> @sub_if_uge_C_nxv2i32(<vscale x 2 x i32> %x) { define <vscale x 2 x i32> @sub_if_uge_C_swapped_nxv2i32(<vscale x 2 x i32> %x) { ; CHECK-LABEL: sub_if_uge_C_swapped_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -15 -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: lui a0, 1048560 ; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vadd.vx v9, v8, a0 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <vscale x 2 x i32> %x, splat (i32 65521) %sub = add <vscale x 2 x i32> %x, splat (i32 -65521) @@ -1069,38 +1059,28 @@ define <vscale x 2 x i64> @sub_if_uge_C_nxv2i64(<vscale x 2 x i64> %x) nounwind ; RV32-LABEL: sub_if_uge_C_nxv2i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: li a0, 1 -; RV32-NEXT: lui a1, 172127 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: addi a1, a1, 512 -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: sw a0, 4(sp) ; RV32-NEXT: li a0, -2 -; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, mu -; RV32-NEXT: vlse64.v v10, (a2), zero ; RV32-NEXT: lui 
a1, 876449 ; RV32-NEXT: addi a1, a1, -513 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmsltu.vv v0, v10, v8 -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vadd.vv v10, v8, v10 +; RV32-NEXT: vminu.vv v8, v10, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: sub_if_uge_C_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, 2384 -; RV64-NEXT: addi a0, a0, 761 -; RV64-NEXT: slli a0, a0, 9 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu -; RV64-NEXT: vmsgtu.vx v0, v8, a0 ; RV64-NEXT: lui a0, 1048278 ; RV64-NEXT: addi a0, a0, -95 ; RV64-NEXT: slli a0, a0, 12 ; RV64-NEXT: addi a0, a0, -513 -; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vadd.vx v10, v8, a0 +; RV64-NEXT: vminu.vv v8, v10, v8 ; RV64-NEXT: ret %cmp = icmp ugt <vscale x 2 x i64> %x, splat (i64 5000000000) %sub = add <vscale x 2 x i64> %x, splat (i64 -5000000001) diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 8cfa237858ac..23c0c826e85e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -648,6 +648,51 @@ define void @masked_store_factor4_v2(<vscale x 1 x i1> %mask, <vscale x 1 x i32> ret void } +define <vscale x 2 x i32> @load_factor2_oneactive(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor2_oneactive: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v7, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor2_oneactive: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v7, (a0) +; 
RV64-NEXT: ret + %rvl = mul nuw i32 %evl, 4 + %wide.masked.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %rvl) + %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.masked.load) + %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1 + ret <vscale x 2 x i32> %t0 +} + +define <vscale x 2 x i32> @load_factor5_oneactive(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor5_oneactive: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg5e32.v v5, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor5_oneactive: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg5e32.v v5, (a0) +; RV64-NEXT: ret + %rvl = mul nuw i32 %evl, 5 + %wide.masked.load = call <vscale x 10 x i32> @llvm.vp.load(ptr %ptr, <vscale x 10 x i1> splat (i1 true), i32 %rvl) + %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave5(<vscale x 10 x i32> %wide.masked.load) + %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 3 + ret <vscale x 2 x i32> %t3 +} + + ; Negative tests define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1> %mask0, <vscale x 2 x i1> %mask1, ptr %ptr, i32 %evl) { diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td index 1d2bd51204e4..3eda077eeabf 100644 --- a/llvm/test/TableGen/directive1.td +++ b/llvm/test/TableGen/directive1.td @@ -53,6 +53,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: #include "llvm/ADT/ArrayRef.h" // CHECK-NEXT: #include "llvm/ADT/BitmaskEnum.h" +// CHECK-NEXT: #include 
"llvm/ADT/Sequence.h" // CHECK-NEXT: #include "llvm/ADT/StringRef.h" // CHECK-NEXT: #include "llvm/Frontend/Directive/Spelling.h" // CHECK-NEXT: #include "llvm/Support/Compiler.h" @@ -66,22 +67,26 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Association { // CHECK-NEXT: Block, +// CHECK-NEXT: First_ = Block, // CHECK-NEXT: Declaration, // CHECK-NEXT: Delimited, // CHECK-NEXT: Loop, // CHECK-NEXT: None, // CHECK-NEXT: Separating, +// CHECK-NEXT: Last_ = Separating, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Association_enumSize = 6; // CHECK-EMPTY: // CHECK-NEXT: enum class Category { // CHECK-NEXT: Declarative, +// CHECK-NEXT: First_ = Declarative, // CHECK-NEXT: Executable, // CHECK-NEXT: Informational, // CHECK-NEXT: Meta, // CHECK-NEXT: Subsidiary, // CHECK-NEXT: Utility, +// CHECK-NEXT: Last_ = Utility, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6; @@ -96,6 +101,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Directive { // CHECK-NEXT: TDLD_dira, +// CHECK-NEXT: First_ = TDLD_dira, +// CHECK-NEXT: Last_ = TDLD_dira, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Directive_enumSize = 1; @@ -104,8 +111,10 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Clause { // CHECK-NEXT: TDLC_clausea, +// CHECK-NEXT: First_ = TDLC_clausea, // CHECK-NEXT: TDLC_clauseb, // CHECK-NEXT: TDLC_clausec, +// CHECK-NEXT: Last_ = TDLC_clausec, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Clause_enumSize = 3; @@ -151,6 +160,22 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: LLVM_ABI StringRef getTdlAKindName(AKind x); // CHECK-EMPTY: // CHECK-NEXT: } // namespace tdl +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> { +// CHECK-NEXT: static 
constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Category> { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Directive> { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; // CHECK-NEXT: } // namespace llvm // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td index 3a64bb3900a3..a25197c3efd9 100644 --- a/llvm/test/TableGen/directive2.td +++ b/llvm/test/TableGen/directive2.td @@ -46,6 +46,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #define LLVM_Tdl_INC // CHECK-EMPTY: // CHECK-NEXT: #include "llvm/ADT/ArrayRef.h" +// CHECK-NEXT: #include "llvm/ADT/Sequence.h" // CHECK-NEXT: #include "llvm/ADT/StringRef.h" // CHECK-NEXT: #include "llvm/Frontend/Directive/Spelling.h" // CHECK-NEXT: #include "llvm/Support/Compiler.h" @@ -57,22 +58,26 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Association { // CHECK-NEXT: Block, +// CHECK-NEXT: First_ = Block, // CHECK-NEXT: Declaration, // CHECK-NEXT: Delimited, // CHECK-NEXT: Loop, // CHECK-NEXT: None, // CHECK-NEXT: Separating, +// CHECK-NEXT: Last_ = Separating, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Association_enumSize = 6; // CHECK-EMPTY: // CHECK-NEXT: enum class Category { // CHECK-NEXT: Declarative, +// CHECK-NEXT: First_ = Declarative, // CHECK-NEXT: Executable, // CHECK-NEXT: Informational, // CHECK-NEXT: Meta, // CHECK-NEXT: Subsidiary, // CHECK-NEXT: Utility, +// CHECK-NEXT: Last_ = Utility, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t 
Category_enumSize = 6; @@ -87,15 +92,19 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Directive { // CHECK-NEXT: TDLD_dira, +// CHECK-NEXT: First_ = TDLD_dira, +// CHECK-NEXT: Last_ = TDLD_dira, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Directive_enumSize = 1; // CHECK-EMPTY: // CHECK-NEXT: enum class Clause { // CHECK-NEXT: TDLC_clausea, +// CHECK-NEXT: First_ = TDLC_clausea, // CHECK-NEXT: TDLC_clauseb, // CHECK-NEXT: TDLC_clausec, // CHECK-NEXT: TDLC_claused, +// CHECK-NEXT: Last_ = TDLC_claused, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Clause_enumSize = 4; @@ -124,6 +133,22 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D); // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D); // CHECK-NEXT: } // namespace tdl +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Category> { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Directive> { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; // CHECK-NEXT: } // namespace llvm // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index 2747895f06a7..ce4270dc4b7f 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ 
b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -18,11 +18,9 @@ define void @reuse_lcssa_phi_for_add_rec1(ptr %head) { ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: br i1 [[EC_1]], label %[[PH:.*]], label %[[LOOP_1]] ; CHECK: [[PH]]: -; CHECK-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP_1]] ] ; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP_1]] ] -; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ] ; CHECK-NEXT: [[SRC_2:%.*]] = tail call noalias noundef dereferenceable_or_null(8) ptr @calloc(i64 1, i64 8) -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV_2_LCSSA]], 1 ; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP0]], i32 1) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 diff --git a/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp b/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp index 0363a08cc0f0..10329820bef7 100644 --- a/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp +++ b/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp @@ -48,12 +48,6 @@ static std::string &prepareParamName(std::string &Name) { return Name; } -namespace llvm { -template <> struct enum_iteration_traits<omp::Directive> { - static constexpr bool is_iterable = true; -}; -} // namespace llvm - // Test tokenizing. 
class Tokenize : public testing::TestWithParam<omp::Directive> {}; @@ -87,12 +81,10 @@ getParamName1(const testing::TestParamInfo<Tokenize::ParamType> &Info) { return prepareParamName(Name); } -INSTANTIATE_TEST_SUITE_P( - DirectiveNameParserTest, Tokenize, - testing::ValuesIn( - llvm::enum_seq(static_cast<omp::Directive>(0), - static_cast<omp::Directive>(omp::Directive_enumSize))), - getParamName1); +INSTANTIATE_TEST_SUITE_P(DirectiveNameParserTest, Tokenize, + testing::ValuesIn(llvm::enum_seq_inclusive( + omp::Directive::First_, omp::Directive::Last_)), + getParamName1); // Test parsing of valid names. @@ -131,9 +123,8 @@ getParamName2(const testing::TestParamInfo<ParseValid::ParamType> &Info) { INSTANTIATE_TEST_SUITE_P( DirectiveNameParserTest, ParseValid, - testing::Combine(testing::ValuesIn(llvm::enum_seq( - static_cast<omp::Directive>(0), - static_cast<omp::Directive>(omp::Directive_enumSize))), + testing::Combine(testing::ValuesIn(llvm::enum_seq_inclusive( + omp::Directive::First_, omp::Directive::Last_)), testing::ValuesIn(omp::getOpenMPVersions())), getParamName2); diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp index 177eecebce9a..f0e23690367d 100644 --- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp @@ -106,8 +106,16 @@ static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, bool ExportEnums) { OS << "\n"; OS << "enum class " << Enum << " {\n"; - for (const Record *R : Records) { - OS << " " << getIdentifierName(R, Prefix) << ",\n"; + if (!Records.empty()) { + std::string N; + for (auto [I, R] : llvm::enumerate(Records)) { + N = getIdentifierName(R, Prefix); + OS << " " << N << ",\n"; + // Make the sentinel names less likely to conflict with actual names... 
+ if (I == 0) + OS << " First_ = " << N << ",\n"; + } + OS << " Last_ = " << N << ",\n"; } OS << "};\n"; OS << "\n"; @@ -282,6 +290,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { if (DirLang.hasEnableBitmaskEnumInNamespace()) OS << "#include \"llvm/ADT/BitmaskEnum.h\"\n"; + OS << "#include \"llvm/ADT/Sequence.h\"\n"; OS << "#include \"llvm/ADT/StringRef.h\"\n"; OS << "#include \"llvm/Frontend/Directive/Spelling.h\"\n"; OS << "#include \"llvm/Support/Compiler.h\"\n"; @@ -375,6 +384,15 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { for (auto Ns : reverse(Namespaces)) OS << "} // namespace " << Ns << "\n"; + // These specializations need to be in ::llvm. + for (StringRef Enum : {"Association", "Category", "Directive", "Clause"}) { + OS << "\n"; + OS << "template <> struct enum_iteration_traits<" + << DirLang.getCppNamespace() << "::" << Enum << "> {\n"; + OS << " static constexpr bool is_iterable = true;\n"; + OS << "};\n"; + } + OS << "} // namespace llvm\n"; OS << "#endif // LLVM_" << Lang << "_INC\n"; diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h index b68262f09f48..ee401cca8f55 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h @@ -707,6 +707,19 @@ public: /// this for uniformity with `applyDomain`. void applyRange(const IntegerRelation &rel); + /// Let the relation `this` be R1, and the relation `rel` be R2. Requires + /// R1 and R2 to have the same domain. + /// + /// Let R3 be the rangeProduct of R1 and R2. Then x R3 (y, z) iff + /// (x R1 y and x R2 z). 
+ /// + /// Example: + /// + /// R1: (i, j) -> k : f(i, j, k) = 0 + /// R2: (i, j) -> l : g(i, j, l) = 0 + /// R1.rangeProduct(R2): (i, j) -> (k, l) : f(i, j, k) = 0 and g(i, j, l) = 0 + IntegerRelation rangeProduct(const IntegerRelation &rel); + /// Given a relation `other: (A -> B)`, this operation merges the symbol and /// local variables and then takes the composition of `other` on `this: (B -> /// C)`. The resulting relation represents tuples of the form: `A -> C`. diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h index 4eb666239d4e..8f87235fcd23 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -29,6 +29,7 @@ #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include <variant> #define GET_TYPEDEF_CLASSES #include "mlir/Dialect/OpenACC/OpenACCOpsTypes.h.inc" diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 66378f116784..96b9adcc53b3 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2772,8 +2772,10 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> { }]; let arguments = (ins SymbolNameAttr:$sym_name, SymbolRefAttr:$func_name, - OptionalAttr<StrArrayAttr>:$bindName, - OptionalAttr<DeviceTypeArrayAttr>:$bindNameDeviceType, + OptionalAttr<SymbolRefArrayAttr>:$bindIdName, + OptionalAttr<StrArrayAttr>:$bindStrName, + OptionalAttr<DeviceTypeArrayAttr>:$bindIdNameDeviceType, + OptionalAttr<DeviceTypeArrayAttr>:$bindStrNameDeviceType, OptionalAttr<DeviceTypeArrayAttr>:$worker, OptionalAttr<DeviceTypeArrayAttr>:$vector, OptionalAttr<DeviceTypeArrayAttr>:$seq, UnitAttr:$nohost, @@ -2815,14 +2817,14 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> { std::optional<int64_t> getGangDimValue(); 
std::optional<int64_t> getGangDimValue(mlir::acc::DeviceType deviceType); - std::optional<llvm::StringRef> getBindNameValue(); - std::optional<llvm::StringRef> getBindNameValue(mlir::acc::DeviceType deviceType); + std::optional<::std::variant<mlir::SymbolRefAttr, mlir::StringAttr>> getBindNameValue(); + std::optional<::std::variant<mlir::SymbolRefAttr, mlir::StringAttr>> getBindNameValue(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ $sym_name `func` `(` $func_name `)` oilist ( - `bind` `(` custom<BindName>($bindName, $bindNameDeviceType) `)` + `bind` `(` custom<BindName>($bindIdName, $bindStrName ,$bindIdNameDeviceType, $bindStrNameDeviceType) `)` | `gang` `` custom<RoutineGangClause>($gang, $gangDim, $gangDimDeviceType) | `worker` custom<DeviceTypeArrayAttr>($worker) | `vector` custom<DeviceTypeArrayAttr>($vector) diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index afeb784b85a1..3a2dbd136b43 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -475,6 +475,25 @@ public: RewriterBase::Listener *rewriteListener; }; + /// A listener that logs notification events to llvm::dbgs() before + /// forwarding to the base listener. 
+ struct PatternLoggingListener : public RewriterBase::ForwardingListener { + PatternLoggingListener(OpBuilder::Listener *listener, StringRef patternName) + : RewriterBase::ForwardingListener(listener), patternName(patternName) { + } + + void notifyOperationInserted(Operation *op, InsertPoint previous) override; + void notifyOperationModified(Operation *op) override; + void notifyOperationReplaced(Operation *op, Operation *newOp) override; + void notifyOperationReplaced(Operation *op, + ValueRange replacement) override; + void notifyOperationErased(Operation *op) override; + void notifyPatternBegin(const Pattern &pattern, Operation *op) override; + + private: + StringRef patternName; + }; + /// Move the blocks that belong to "region" before the given position in /// another region "parent". The two regions must be different. The caller /// is responsible for creating or updating the operation transferring flow diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index 17e48e0d069b..5c4d4d13580a 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -2481,6 +2481,44 @@ void IntegerRelation::applyDomain(const IntegerRelation &rel) { void IntegerRelation::applyRange(const IntegerRelation &rel) { compose(rel); } +IntegerRelation IntegerRelation::rangeProduct(const IntegerRelation &rel) { + /// R1: (i, j) -> k : f(i, j, k) = 0 + /// R2: (i, j) -> l : g(i, j, l) = 0 + /// R1.rangeProduct(R2): (i, j) -> (k, l) : f(i, j, k) = 0 and g(i, j, l) = 0 + assert(getNumDomainVars() == rel.getNumDomainVars() && + "Range product is only defined for relations with equal domains"); + + // explicit copy of `this` + IntegerRelation result = *this; + unsigned relRangeVarStart = rel.getVarKindOffset(VarKind::Range); + unsigned numThisRangeVars = getNumRangeVars(); + unsigned numNewSymbolVars = result.getNumSymbolVars() - getNumSymbolVars(); + + 
result.appendVar(VarKind::Range, rel.getNumRangeVars()); + + // Copy each equality from `rel` and update the copy to account for range + // variables from `this`. The `rel` equality is a list of coefficients of the + // variables from `rel`, and so the range variables need to be shifted right + // by the number of `this` range variables and symbols. + for (unsigned i = 0; i < rel.getNumEqualities(); ++i) { + SmallVector<DynamicAPInt> copy = + SmallVector<DynamicAPInt>(rel.getEquality(i)); + copy.insert(copy.begin() + relRangeVarStart, + numThisRangeVars + numNewSymbolVars, DynamicAPInt(0)); + result.addEquality(copy); + } + + for (unsigned i = 0; i < rel.getNumInequalities(); ++i) { + SmallVector<DynamicAPInt> copy = + SmallVector<DynamicAPInt>(rel.getInequality(i)); + copy.insert(copy.begin() + relRangeVarStart, + numThisRangeVars + numNewSymbolVars, DynamicAPInt(0)); + result.addInequality(copy); + } + + return result; +} + void IntegerRelation::printSpace(raw_ostream &os) const { space.print(os); os << getNumConstraints() << " constraints\n"; diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index d96148288530..7b790e90e0d8 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -97,6 +97,10 @@ Args: binary: Whether to write bytes (True) or str (False). Defaults to False. large_elements_limit: Whether to elide elements attributes above this number of elements. Defaults to None (no limit). + large_resource_limit: Whether to elide resource attributes above this + number of characters. Defaults to None (no limit). If large_elements_limit + is set and this is None, the behavior will be to use large_elements_limit + as large_resource_limit. enable_debug_info: Whether to print debug/location information. Defaults to False. 
pretty_debug_info: Whether to format debug information for easier reading @@ -1303,6 +1307,7 @@ void PyOperation::checkValid() const { } void PyOperationBase::print(std::optional<int64_t> largeElementsLimit, + std::optional<int64_t> largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, @@ -1314,10 +1319,10 @@ void PyOperationBase::print(std::optional<int64_t> largeElementsLimit, fileObject = nb::module_::import_("sys").attr("stdout"); MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate(); - if (largeElementsLimit) { + if (largeElementsLimit) mlirOpPrintingFlagsElideLargeElementsAttrs(flags, *largeElementsLimit); - mlirOpPrintingFlagsElideLargeResourceString(flags, *largeElementsLimit); - } + if (largeResourceLimit) + mlirOpPrintingFlagsElideLargeResourceString(flags, *largeResourceLimit); if (enableDebugInfo) mlirOpPrintingFlagsEnableDebugInfo(flags, /*enable=*/true, /*prettyForm=*/prettyDebugInfo); @@ -1405,6 +1410,7 @@ void PyOperationBase::walk( nb::object PyOperationBase::getAsm(bool binary, std::optional<int64_t> largeElementsLimit, + std::optional<int64_t> largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, @@ -1416,6 +1422,7 @@ nb::object PyOperationBase::getAsm(bool binary, fileObject = nb::module_::import_("io").attr("StringIO")(); } print(/*largeElementsLimit=*/largeElementsLimit, + /*largeResourceLimit=*/largeResourceLimit, /*enableDebugInfo=*/enableDebugInfo, /*prettyDebugInfo=*/prettyDebugInfo, /*printGenericOpForm=*/printGenericOpForm, @@ -3348,6 +3355,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { [](PyOperationBase &self) { return self.getAsm(/*binary=*/false, /*largeElementsLimit=*/std::nullopt, + /*largeResourceLimit=*/std::nullopt, /*enableDebugInfo=*/false, /*prettyDebugInfo=*/false, /*printGenericOpForm=*/false, @@ -3363,11 +3371,12 
@@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("state"), nb::arg("file").none() = nb::none(), nb::arg("binary") = false, kOperationPrintStateDocstring) .def("print", - nb::overload_cast<std::optional<int64_t>, bool, bool, bool, bool, - bool, bool, nb::object, bool, bool>( - &PyOperationBase::print), + nb::overload_cast<std::optional<int64_t>, std::optional<int64_t>, + bool, bool, bool, bool, bool, bool, nb::object, + bool, bool>(&PyOperationBase::print), // Careful: Lots of arguments must match up with print method. nb::arg("large_elements_limit").none() = nb::none(), + nb::arg("large_resource_limit").none() = nb::none(), nb::arg("enable_debug_info") = false, nb::arg("pretty_debug_info") = false, nb::arg("print_generic_op_form") = false, @@ -3383,6 +3392,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { // Careful: Lots of arguments must match up with get_asm method. nb::arg("binary") = false, nb::arg("large_elements_limit").none() = nb::none(), + nb::arg("large_resource_limit").none() = nb::none(), nb::arg("enable_debug_info") = false, nb::arg("pretty_debug_info") = false, nb::arg("print_generic_op_form") = false, diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 9befcce725bb..0fdd2d1a7eff 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -599,18 +599,18 @@ class PyOperationBase { public: virtual ~PyOperationBase() = default; /// Implements the bound 'print' method and helps with others. 
- void print(std::optional<int64_t> largeElementsLimit, bool enableDebugInfo, + void print(std::optional<int64_t> largeElementsLimit, + std::optional<int64_t> largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, nanobind::object fileObject, bool binary, bool skipRegions); void print(PyAsmState &state, nanobind::object fileObject, bool binary); - nanobind::object getAsm(bool binary, - std::optional<int64_t> largeElementsLimit, - bool enableDebugInfo, bool prettyDebugInfo, - bool printGenericOpForm, bool useLocalScope, - bool useNameLocAsPrefix, bool assumeVerified, - bool skipRegions); + nanobind::object + getAsm(bool binary, std::optional<int64_t> largeElementsLimit, + std::optional<int64_t> largeResourceLimit, bool enableDebugInfo, + bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, + bool useNameLocAsPrefix, bool assumeVerified, bool skipRegions); // Implement the bound 'writeBytecode' method. 
void writeBytecode(const nanobind::object &fileObject, diff --git a/mlir/lib/Bindings/Python/Pass.cpp b/mlir/lib/Bindings/Python/Pass.cpp index 8d84864b9db4..20017e25b69b 100644 --- a/mlir/lib/Bindings/Python/Pass.cpp +++ b/mlir/lib/Bindings/Python/Pass.cpp @@ -78,12 +78,19 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) { [](PyPassManager &passManager, bool printBeforeAll, bool printAfterAll, bool printModuleScope, bool printAfterChange, bool printAfterFailure, std::optional<int64_t> largeElementsLimit, - bool enableDebugInfo, bool printGenericOpForm, + std::optional<int64_t> largeResourceLimit, bool enableDebugInfo, + bool printGenericOpForm, std::optional<std::string> optionalTreePrintingPath) { MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate(); - if (largeElementsLimit) + if (largeElementsLimit) { mlirOpPrintingFlagsElideLargeElementsAttrs(flags, *largeElementsLimit); + mlirOpPrintingFlagsElideLargeResourceString(flags, + *largeElementsLimit); + } + if (largeResourceLimit) + mlirOpPrintingFlagsElideLargeResourceString(flags, + *largeResourceLimit); if (enableDebugInfo) mlirOpPrintingFlagsEnableDebugInfo(flags, /*enable=*/true, /*prettyForm=*/false); @@ -103,6 +110,7 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) { "print_module_scope"_a = false, "print_after_change"_a = false, "print_after_failure"_a = false, "large_elements_limit"_a.none() = nb::none(), + "large_resource_limit"_a.none() = nb::none(), "enable_debug_info"_a = false, "print_generic_op_form"_a = false, "tree_printing_dir_path"_a.none() = nb::none(), "Enable IR printing, default as mlir-print-ir-after-all.") diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index f2eab62b286a..fbc1f003ab64 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/LogicalResult.h" 
+#include <variant> using namespace mlir; using namespace acc; @@ -3461,40 +3462,88 @@ LogicalResult acc::RoutineOp::verify() { return success(); } -static ParseResult parseBindName(OpAsmParser &parser, mlir::ArrayAttr &bindName, - mlir::ArrayAttr &deviceTypes) { - llvm::SmallVector<mlir::Attribute> bindNameAttrs; - llvm::SmallVector<mlir::Attribute> deviceTypeAttrs; +static ParseResult parseBindName(OpAsmParser &parser, + mlir::ArrayAttr &bindIdName, + mlir::ArrayAttr &bindStrName, + mlir::ArrayAttr &deviceIdTypes, + mlir::ArrayAttr &deviceStrTypes) { + llvm::SmallVector<mlir::Attribute> bindIdNameAttrs; + llvm::SmallVector<mlir::Attribute> bindStrNameAttrs; + llvm::SmallVector<mlir::Attribute> deviceIdTypeAttrs; + llvm::SmallVector<mlir::Attribute> deviceStrTypeAttrs; if (failed(parser.parseCommaSeparatedList([&]() { - if (parser.parseAttribute(bindNameAttrs.emplace_back())) + mlir::Attribute newAttr; + bool isSymbolRefAttr; + auto parseResult = parser.parseAttribute(newAttr); + if (auto symbolRefAttr = dyn_cast<mlir::SymbolRefAttr>(newAttr)) { + bindIdNameAttrs.push_back(symbolRefAttr); + isSymbolRefAttr = true; + } else if (auto stringAttr = dyn_cast<mlir::StringAttr>(newAttr)) { + bindStrNameAttrs.push_back(stringAttr); + isSymbolRefAttr = false; + } + if (parseResult) return failure(); if (failed(parser.parseOptionalLSquare())) { - deviceTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( - parser.getContext(), mlir::acc::DeviceType::None)); + if (isSymbolRefAttr) { + deviceIdTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } else { + deviceStrTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } } else { - if (parser.parseAttribute(deviceTypeAttrs.emplace_back()) || - parser.parseRSquare()) - return failure(); + if (isSymbolRefAttr) { + if (parser.parseAttribute(deviceIdTypeAttrs.emplace_back()) || + parser.parseRSquare()) + return failure(); + 
} else { + if (parser.parseAttribute(deviceStrTypeAttrs.emplace_back()) || + parser.parseRSquare()) + return failure(); + } } return success(); }))) return failure(); - bindName = ArrayAttr::get(parser.getContext(), bindNameAttrs); - deviceTypes = ArrayAttr::get(parser.getContext(), deviceTypeAttrs); + bindIdName = ArrayAttr::get(parser.getContext(), bindIdNameAttrs); + bindStrName = ArrayAttr::get(parser.getContext(), bindStrNameAttrs); + deviceIdTypes = ArrayAttr::get(parser.getContext(), deviceIdTypeAttrs); + deviceStrTypes = ArrayAttr::get(parser.getContext(), deviceStrTypeAttrs); return success(); } static void printBindName(mlir::OpAsmPrinter &p, mlir::Operation *op, - std::optional<mlir::ArrayAttr> bindName, - std::optional<mlir::ArrayAttr> deviceTypes) { - llvm::interleaveComma(llvm::zip(*bindName, *deviceTypes), p, - [&](const auto &pair) { - p << std::get<0>(pair); - printSingleDeviceType(p, std::get<1>(pair)); - }); + std::optional<mlir::ArrayAttr> bindIdName, + std::optional<mlir::ArrayAttr> bindStrName, + std::optional<mlir::ArrayAttr> deviceIdTypes, + std::optional<mlir::ArrayAttr> deviceStrTypes) { + // Create combined vectors for all bind names and device types + llvm::SmallVector<mlir::Attribute> allBindNames; + llvm::SmallVector<mlir::Attribute> allDeviceTypes; + + // Append bindIdName and deviceIdTypes + if (hasDeviceTypeValues(deviceIdTypes)) { + allBindNames.append(bindIdName->begin(), bindIdName->end()); + allDeviceTypes.append(deviceIdTypes->begin(), deviceIdTypes->end()); + } + + // Append bindStrName and deviceStrTypes + if (hasDeviceTypeValues(deviceStrTypes)) { + allBindNames.append(bindStrName->begin(), bindStrName->end()); + allDeviceTypes.append(deviceStrTypes->begin(), deviceStrTypes->end()); + } + + // Print the combined sequence + if (!allBindNames.empty()) + llvm::interleaveComma(llvm::zip(allBindNames, allDeviceTypes), p, + [&](const auto &pair) { + p << std::get<0>(pair); + printSingleDeviceType(p, std::get<1>(pair)); + }); } 
static ParseResult parseRoutineGangClause(OpAsmParser &parser, @@ -3654,19 +3703,32 @@ bool RoutineOp::hasSeq(mlir::acc::DeviceType deviceType) { return hasDeviceType(getSeq(), deviceType); } -std::optional<llvm::StringRef> RoutineOp::getBindNameValue() { +std::optional<std::variant<mlir::SymbolRefAttr, mlir::StringAttr>> +RoutineOp::getBindNameValue() { return getBindNameValue(mlir::acc::DeviceType::None); } -std::optional<llvm::StringRef> +std::optional<std::variant<mlir::SymbolRefAttr, mlir::StringAttr>> RoutineOp::getBindNameValue(mlir::acc::DeviceType deviceType) { - if (!hasDeviceTypeValues(getBindNameDeviceType())) + if (!hasDeviceTypeValues(getBindIdNameDeviceType()) && + !hasDeviceTypeValues(getBindStrNameDeviceType())) { return std::nullopt; - if (auto pos = findSegment(*getBindNameDeviceType(), deviceType)) { - auto attr = (*getBindName())[*pos]; + } + + if (auto pos = findSegment(*getBindIdNameDeviceType(), deviceType)) { + auto attr = (*getBindIdName())[*pos]; + auto symbolRefAttr = dyn_cast<mlir::SymbolRefAttr>(attr); + assert(symbolRefAttr && "expected SymbolRef"); + return symbolRefAttr; + } + + if (auto pos = findSegment(*getBindStrNameDeviceType(), deviceType)) { + auto attr = (*getBindStrName())[*pos]; auto stringAttr = dyn_cast<mlir::StringAttr>(attr); - return stringAttr.getValue(); + assert(stringAttr && "expected String"); + return stringAttr; } + return std::nullopt; } diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt index 4cabac185171..3ef69cea18f0 100644 --- a/mlir/lib/IR/CMakeLists.txt +++ b/mlir/lib/IR/CMakeLists.txt @@ -29,6 +29,7 @@ add_mlir_library(MLIRIR ODSSupport.cpp Operation.cpp OperationSupport.cpp + PatternLoggingListener.cpp PatternMatch.cpp Region.cpp RegionKindInterface.cpp diff --git a/mlir/lib/IR/PatternLoggingListener.cpp b/mlir/lib/IR/PatternLoggingListener.cpp new file mode 100644 index 000000000000..ce2123ae1a19 --- /dev/null +++ b/mlir/lib/IR/PatternLoggingListener.cpp @@ -0,0 +1,50 @@ +#include 
"mlir/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "pattern-logging-listener" +#define DBGS() (llvm::dbgs() << "[" << DEBUG_TYPE << "] ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +using namespace mlir; + +void RewriterBase::PatternLoggingListener::notifyOperationInserted( + Operation *op, InsertPoint previous) { + LDBG(patternName << " | notifyOperationInserted" + << " | " << op->getName()); + ForwardingListener::notifyOperationInserted(op, previous); +} + +void RewriterBase::PatternLoggingListener::notifyOperationModified( + Operation *op) { + LDBG(patternName << " | notifyOperationModified" + << " | " << op->getName()); + ForwardingListener::notifyOperationModified(op); +} + +void RewriterBase::PatternLoggingListener::notifyOperationReplaced( + Operation *op, Operation *newOp) { + LDBG(patternName << " | notifyOperationReplaced (with op)" + << " | " << op->getName() << " | " << newOp->getName()); + ForwardingListener::notifyOperationReplaced(op, newOp); +} + +void RewriterBase::PatternLoggingListener::notifyOperationReplaced( + Operation *op, ValueRange replacement) { + LDBG(patternName << " | notifyOperationReplaced (with values)" + << " | " << op->getName()); + ForwardingListener::notifyOperationReplaced(op, replacement); +} + +void RewriterBase::PatternLoggingListener::notifyOperationErased( + Operation *op) { + LDBG(patternName << " | notifyOperationErased" + << " | " << op->getName()); + ForwardingListener::notifyOperationErased(op); +} + +void RewriterBase::PatternLoggingListener::notifyPatternBegin( + const Pattern &pattern, Operation *op) { + LDBG(patternName << " | notifyPatternBegin" + << " | " << op->getName()); + ForwardingListener::notifyPatternBegin(pattern, op); +} diff --git a/mlir/lib/Rewrite/PatternApplicator.cpp b/mlir/lib/Rewrite/PatternApplicator.cpp index 4a12183492fd..b2b372b7b124 100644 --- a/mlir/lib/Rewrite/PatternApplicator.cpp +++ b/mlir/lib/Rewrite/PatternApplicator.cpp @@ -15,6 +15,10 @@ 
#include "ByteCode.h" #include "llvm/Support/Debug.h" +#ifndef NDEBUG +#include "llvm/ADT/ScopeExit.h" +#endif + #define DEBUG_TYPE "pattern-application" using namespace mlir; @@ -206,11 +210,19 @@ LogicalResult PatternApplicator::matchAndRewrite( } else { LLVM_DEBUG(llvm::dbgs() << "Trying to match \"" << bestPattern->getDebugName() << "\"\n"); - const auto *pattern = static_cast<const RewritePattern *>(bestPattern); - result = pattern->matchAndRewrite(op, rewriter); +#ifndef NDEBUG + OpBuilder::Listener *oldListener = rewriter.getListener(); + auto loggingListener = + std::make_unique<RewriterBase::PatternLoggingListener>( + oldListener, pattern->getDebugName()); + rewriter.setListener(loggingListener.get()); + auto resetListenerCallback = llvm::make_scope_exit( + [&] { rewriter.setListener(oldListener); }); +#endif + result = pattern->matchAndRewrite(op, rewriter); LLVM_DEBUG(llvm::dbgs() << "\"" << bestPattern->getDebugName() << "\" result " << succeeded(result) << "\n"); diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index ed476da28d6b..be71737e4b5b 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -200,6 +200,7 @@ class _OperationBase: def get_asm( binary: Literal[True], large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -212,6 +213,7 @@ class _OperationBase: self, binary: bool = False, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -253,6 +255,7 @@ class _OperationBase: def print( self, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -270,6 
+273,10 @@ class _OperationBase: binary: Whether to write bytes (True) or str (False). Defaults to False. large_elements_limit: Whether to elide elements attributes above this number of elements. Defaults to None (no limit). + large_resource_limit: Whether to elide resource strings above this + number of characters. Defaults to None (no limit). If large_elements_limit + is set and this is None, the behavior will be to use large_elements_limit + as large_resource_limit. enable_debug_info: Whether to print debug/location information. Defaults to False. pretty_debug_info: Whether to format debug information for easier reading diff --git a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi index 0d2eaffe16d3..1010daddae2a 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi @@ -23,6 +23,7 @@ class PassManager: print_after_change: bool = False, print_after_failure: bool = False, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, print_generic_op_form: bool = False, tree_printing_dir_path: str | None = None, diff --git a/mlir/test/IR/test-pattern-logging-listener.mlir b/mlir/test/IR/test-pattern-logging-listener.mlir new file mode 100644 index 000000000000..a1d27741a072 --- /dev/null +++ b/mlir/test/IR/test-pattern-logging-listener.mlir @@ -0,0 +1,17 @@ +// RUN: mlir-opt %s --test-walk-pattern-rewrite-driver \ +// RUN: --allow-unregistered-dialect --debug-only=pattern-logging-listener 2>&1 | FileCheck %s + +// Check that when replacing an op with a new op, we get appropriate +// pattern-logging lines. The regex is because the anonymous namespace is +// printed differently on different platforms. 
+ +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationInserted | test.new_op +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationReplaced (with values) | test.replace_with_new_op +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationModified | arith.addi +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationModified | arith.addi +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationErased | test.replace_with_new_op +func.func @replace_with_new_op() -> i32 { + %a = "test.replace_with_new_op"() : () -> (i32) + %res = arith.addi %a, %a : i32 + return %res : i32 +} diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 9b5cadd62bef..233fef8ec429 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -301,6 +301,17 @@ if "MLIR_OPT_CHECK_IR_ROUNDTRIP" in os.environ: ToolSubst("mlir-opt", "mlir-opt --verify-roundtrip", unresolved="fatal"), ] ) +elif "MLIR_GENERATE_PATTERN_CATALOG" in os.environ: + tools.extend( + [ + ToolSubst( + "mlir-opt", + "mlir-opt --debug-only=pattern-logging-listener --mlir-disable-threading", + unresolved="fatal", + ), + ToolSubst("FileCheck", "FileCheck --dump-input=always", unresolved="fatal"), + ] + ) else: tools.extend(["mlir-opt"]) diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index b08fe98397fb..ede1571f940f 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -686,6 +686,15 @@ def testOperationPrint(): skip_regions=True, ) + # Test print with large_resource_limit. 
+ # CHECK: func.func @f1(%arg0: i32) -> i32 + # CHECK-NOT: resource1: "0x08 + module.operation.print(large_resource_limit=2) + + # Test large_elements_limit has no effect on resource string + # CHECK: func.func @f1(%arg0: i32) -> i32 + # CHECK: resource1: "0x08 + module.operation.print(large_elements_limit=2) # CHECK-LABEL: TEST: testKnownOpView @run diff --git a/mlir/test/python/pass_manager.py b/mlir/test/python/pass_manager.py index 85d2eb304882..e26d42bb3291 100644 --- a/mlir/test/python/pass_manager.py +++ b/mlir/test/python/pass_manager.py @@ -363,6 +363,63 @@ def testPrintIrLargeLimitElements(): pm.run(module) +# CHECK-LABEL: TEST: testPrintIrLargeResourceLimit +@run +def testPrintIrLargeResourceLimit(): + with Context() as ctx: + module = ModuleOp.parse( + """ + module { + func.func @main() -> tensor<3xi64> { + %0 = arith.constant dense_resource<blob1> : tensor<3xi64> + return %0 : tensor<3xi64> + } + } + {-# + dialect_resources: { + builtin: { + blob1: "0x010000000000000002000000000000000300000000000000" + } + } + #-} + """ + ) + pm = PassManager.parse("builtin.module(canonicalize)") + ctx.enable_multithreading(False) + pm.enable_ir_printing(large_resource_limit=4) + # CHECK-NOT: blob1: "0x01 + pm.run(module) + + +# CHECK-LABEL: TEST: testPrintIrLargeResourceLimitVsElementsLimit +@run +def testPrintIrLargeResourceLimitVsElementsLimit(): + """Test that large_elements_limit does not affect the printing of resources.""" + with Context() as ctx: + module = ModuleOp.parse( + """ + module { + func.func @main() -> tensor<3xi64> { + %0 = arith.constant dense_resource<blob1> : tensor<3xi64> + return %0 : tensor<3xi64> + } + } + {-# + dialect_resources: { + builtin: { + blob1: "0x010000000000000002000000000000000300000000000000" + } + } + #-} + """ + ) + pm = PassManager.parse("builtin.module(canonicalize)") + ctx.enable_multithreading(False) + pm.enable_ir_printing(large_elements_limit=1) + # CHECK-NOT: blob1: "0x01 + pm.run(module) + + # CHECK-LABEL: TEST: 
testPrintIrTree @run def testPrintIrTree(): diff --git a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp index 7df500bc9568..dd0b09f7f05d 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp @@ -608,3 +608,97 @@ TEST(IntegerRelationTest, convertVarKindToLocal) { EXPECT_EQ(space.getId(VarKind::Symbol, 0), Identifier(&identifiers[3])); EXPECT_EQ(space.getId(VarKind::Symbol, 1), Identifier(&identifiers[4])); } + +TEST(IntegerRelationTest, rangeProduct) { + IntegerRelation r1 = parseRelationFromSet( + "(i, j, k) : (2*i + 3*k == 0, i >= 0, j >= 0, k >= 0)", 2); + IntegerRelation r2 = parseRelationFromSet( + "(i, j, l) : (4*i + 6*j + 9*l == 0, i >= 0, j >= 0, l >= 0)", 2); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j, k, l) : (2*i + 3*k == 0, 4*i + 6*j + 9*l == " + "0, i >= 0, j >= 0, k >= 0, l >= 0)", + 2); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductMultdimRange) { + IntegerRelation r1 = + parseRelationFromSet("(i, k) : (2*i + 3*k == 0, i >= 0, k >= 0)", 1); + IntegerRelation r2 = parseRelationFromSet( + "(i, l, m) : (4*i + 6*m + 9*l == 0, i >= 0, l >= 0, m >= 0)", 1); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, k, l, m) : (2*i + 3*k == 0, 4*i + 6*m + 9*l == " + "0, i >= 0, k >= 0, l >= 0, m >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductMultdimRangeSwapped) { + IntegerRelation r1 = parseRelationFromSet( + "(i, l, m) : (4*i + 6*m + 9*l == 0, i >= 0, l >= 0, m >= 0)", 1); + IntegerRelation r2 = + parseRelationFromSet("(i, k) : (2*i + 3*k == 0, i >= 0, k >= 0)", 1); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, l, m, k) : 
(2*i + 3*k == 0, 4*i + 6*m + 9*l == " + "0, i >= 0, k >= 0, l >= 0, m >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyDomain) { + IntegerRelation r1 = + parseRelationFromSet("(i, j) : (4*i + 9*j == 0, i >= 0, j >= 0)", 0); + IntegerRelation r2 = + parseRelationFromSet("(k, l) : (2*k + 3*l == 0, k >= 0, l >= 0)", 0); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j, k, l) : (2*k + 3*l == 0, 4*i + 9*j == " + "0, i >= 0, j >= 0, k >= 0, l >= 0)", + 0); + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyRange) { + IntegerRelation r1 = + parseRelationFromSet("(i, j) : (4*i + 9*j == 0, i >= 0, j >= 0)", 2); + IntegerRelation r2 = + parseRelationFromSet("(i, j) : (2*i + 3*j == 0, i >= 0, j >= 0)", 2); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j) : (2*i + 3*j == 0, 4*i + 9*j == " + "0, i >= 0, j >= 0)", + 2); + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyDomainAndRange) { + IntegerRelation r1 = parseRelationFromSet("() : ()", 0); + IntegerRelation r2 = parseRelationFromSet("() : ()", 0); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = parseRelationFromSet("() : ()", 0); + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductSymbols) { + IntegerRelation r1 = parseRelationFromSet( + "(i, j)[s] : (2*i + 3*j + s == 0, i >= 0, j >= 0)", 1); + IntegerRelation r2 = parseRelationFromSet( + "(i, l)[s] : (3*i + 4*l + s == 0, i >= 0, l >= 0)", 1); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = parseRelationFromSet( + "(i, j, l)[s] : (2*i + 3*j + s == 0, 3*i + 4*l + s == " + "0, i >= 0, j >= 0, l >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} diff --git 
a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp index aa16421cbec5..836efdb307f9 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp @@ -519,14 +519,44 @@ TEST_F(OpenACCOpsTest, routineOpTest) { op->removeGangDimDeviceTypeAttr(); op->removeGangDimAttr(); - op->setBindNameDeviceTypeAttr(b.getArrayAttr({dtypeNone})); - op->setBindNameAttr(b.getArrayAttr({b.getStringAttr("fname")})); + op->setBindIdNameDeviceTypeAttr( + b.getArrayAttr({DeviceTypeAttr::get(&context, DeviceType::Host)})); + op->setBindStrNameDeviceTypeAttr(b.getArrayAttr({dtypeNone})); + op->setBindIdNameAttr( + b.getArrayAttr({SymbolRefAttr::get(&context, "test_symbol")})); + op->setBindStrNameAttr(b.getArrayAttr({b.getStringAttr("fname")})); EXPECT_TRUE(op->getBindNameValue().has_value()); - EXPECT_EQ(op->getBindNameValue().value(), "fname"); - for (auto d : dtypesWithoutNone) - EXPECT_FALSE(op->getBindNameValue(d).has_value()); - op->removeBindNameDeviceTypeAttr(); - op->removeBindNameAttr(); + EXPECT_TRUE(op->getBindNameValue(DeviceType::Host).has_value()); + EXPECT_EQ(std::visit( + [](const auto &attr) -> std::string { + if constexpr (std::is_same_v<std::decay_t<decltype(attr)>, + mlir::StringAttr>) { + return attr.str(); + } else { + return attr.getLeafReference().str(); + } + }, + op->getBindNameValue().value()), + "fname"); + EXPECT_EQ(std::visit( + [](const auto &attr) -> std::string { + if constexpr (std::is_same_v<std::decay_t<decltype(attr)>, + mlir::StringAttr>) { + return attr.str(); + } else { + return attr.getLeafReference().str(); + } + }, + op->getBindNameValue(DeviceType::Host).value()), + "test_symbol"); + for (auto d : dtypesWithoutNone) { + if (d != DeviceType::Host) + EXPECT_FALSE(op->getBindNameValue(d).has_value()); + } + op->removeBindIdNameDeviceTypeAttr(); + op->removeBindStrNameDeviceTypeAttr(); + op->removeBindIdNameAttr(); + 
op->removeBindStrNameAttr(); } template <typename Op> |
