diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
36 files changed, 2892 insertions, 3623 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir new file mode 100644 index 000000000000..23da26d96b62 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir @@ -0,0 +1,374 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=-real-true16 -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s + +--- +name: fcmp_false_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_false_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]] + ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(false), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_oeq_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_oeq_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_EQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(oeq), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_ogt_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ogt_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_GT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ogt), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_oge_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_oge_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(oge), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_olt_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_olt_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_LT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(olt), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_ole_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ole_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_LE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ole), %2, %3 + S_ENDPGM 0, implicit %4 +... +--- +name: fcmp_one_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_one_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(one), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_ord_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ord_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_O_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_O_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ord), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_uno_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_uno_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_U_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(uno), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_ueq_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ueq_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_NLG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ueq), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_ugt_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ugt_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_NLE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ugt), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_uge_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_uge_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_NLT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(uge), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_ult_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ult_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ult), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_ule_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ule_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_NGT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ule), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_une_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_une_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_NEQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_fake16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(une), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_true_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_true_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]] + ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(true), %2, %3 + S_ENDPGM 0, implicit %4 +... + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir new file mode 100644 index 000000000000..a7140e6a74fd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir @@ -0,0 +1,402 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=+real-true16 -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s + +--- +name: fcmp_false_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_false_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]] + ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) + 
%0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(false), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_oeq_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_oeq_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(oeq), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_ogt_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ogt_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_GT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ogt), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_oge_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_oge_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_GE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(oge), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_olt_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_olt_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(olt), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_ole_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ole_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_LE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ole), %2, %3 + S_ENDPGM 0, implicit %4 +... +--- +name: fcmp_one_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_one_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_LG_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(one), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_ord_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ord_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_O_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_O_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ord), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_uno_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_uno_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_U_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(uno), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_ueq_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ueq_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_NLG_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ueq), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_ugt_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ugt_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_NLE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ugt), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_uge_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_uge_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_NLT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(uge), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_ult_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ult_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_NGE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ult), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_ule_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_ule_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_NGT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(ule), %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fcmp_une_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_une_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16 + ; GFX11-NEXT: [[V_CMP_NEQ_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_t16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(une), %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: fcmp_true_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: fcmp_true_s16_vv + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]] + ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vcc(s1) = G_FCMP floatpred(true), %2, %3 + S_ENDPGM 0, implicit %4 +... + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir index 5c387baf4675..85b1d402146c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir @@ -1,7 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE32 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s --- name: fcmp_false_s16_vv @@ -31,15 +30,6 @@ body: | ; WAVE32-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) ; - ; GFX11-LABEL: name: fcmp_false_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: 
[[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]] - ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -72,13 +62,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_oeq_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_EQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -111,13 +94,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_ogt_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_GT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ 
-150,13 +126,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_oge_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -189,13 +158,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_olt_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_LT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -228,13 +190,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_ole_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: 
[[V_CMP_LE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -266,13 +221,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_one_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -305,13 +253,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_ord_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -344,13 +285,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, 
implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_uno_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_U_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -383,13 +317,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_ueq_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_NLG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -422,13 +349,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_ugt_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_NLE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; 
GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -461,13 +381,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_uge_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_NLT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -500,13 +413,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_ult_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -539,13 +445,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_ule_s16_vv - ; GFX11: liveins: $vgpr0, 
$vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_NGT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -578,13 +477,6 @@ body: | ; WAVE32-NEXT: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_e64_]] ; - ; GFX11-LABEL: name: fcmp_une_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[V_CMP_NEQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -621,15 +513,6 @@ body: | ; WAVE32-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) ; - ; GFX11-LABEL: name: fcmp_true_s16_vv - ; GFX11: liveins: $vgpr0, $vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]] - ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll new file mode 100644 index 000000000000..090aa067a526 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=irtranslator -o - %s | FileCheck %s + +declare ptr @llvm.invariant.start.p5(i64 immarg, ptr addrspace(5) nocapture) +declare void @llvm.invariant.end.p5(ptr, i64 immarg, ptr addrspace(5) nocapture) + +define void @use_invariant_promotable_lds(ptr addrspace(5) %arg, i32 %i) { + ; CHECK-LABEL: name: use_invariant_promotable_lds + ; CHECK: bb.1.bb: + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + ; CHECK-NEXT: G_STORE [[C]](s32), [[DEF]](p0) :: (store (s32) into %ir.tmp) + ; CHECK-NEXT: SI_RETURN +bb: + %tmp = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %arg) + call void @llvm.invariant.end.p5(ptr %tmp, i64 4, ptr addrspace(5) %arg) + store i32 0, ptr %tmp, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll index 0818f607da0a..96775f4763e3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll @@ -8,6 +8,7 @@ ; GCN-NEXT: amdpal.pipelines: ; GCN-NEXT: - .hardware_stages: ; GCN-NEXT: .cs: +; GCN-NEXT: .entry_point: _amdgpu_cs ; GCN-NEXT: .entry_point_symbol: cs_amdpal ; GCN-NEXT: .scratch_memory_size: 0 ; GCN: .registers: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll index 
e37d22c7df37..1379246c3257 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll @@ -7,6 +7,7 @@ ; GCN-NEXT: amdpal.pipelines: ; GCN-NEXT: - .hardware_stages: ; GCN-NEXT: .es: +; GCN-NEXT: .entry_point: _amdgpu_es ; GCN-NEXT: .entry_point_symbol: es_amdpal ; GCN-NEXT: .scratch_memory_size: 0 ; GCN: .registers: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll index d847f75a5c09..1fba34a50094 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll @@ -8,6 +8,7 @@ ; GCN-NEXT: amdpal.pipelines: ; GCN-NEXT: - .hardware_stages: ; GCN-NEXT: .gs: +; GCN-NEXT: .entry_point: _amdgpu_gs ; GCN-NEXT: .entry_point_symbol: gs_amdpal ; GCN-NEXT: .scratch_memory_size: 0 ; GCN: .registers: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll index 74f5f440c99d..53c6b95f0735 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll @@ -8,6 +8,7 @@ ; GCN-NEXT: amdpal.pipelines: ; GCN-NEXT: - .hardware_stages: ; GCN-NEXT: .hs: +; GCN-NEXT: .entry_point: _amdgpu_hs ; GCN-NEXT: .entry_point_symbol: hs_amdpal ; GCN-NEXT: .scratch_memory_size: 0 ; GCN: .registers: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll index 287cc1201a3c..ebe753134a42 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll @@ -7,6 +7,7 @@ ; GCN-NEXT: amdpal.pipelines: ; GCN-NEXT: - .hardware_stages: ; GCN-NEXT: .ls: +; GCN-NEXT: .entry_point: _amdgpu_ls ; GCN-NEXT: .entry_point_symbol: ls_amdpal ; GCN-NEXT: .scratch_memory_size: 0 ; GCN: .registers: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll index e1767182c359..32f19e2af32e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll @@ -11,6 +11,7 @@ ; GCN-NEXT: amdpal.pipelines: ; 
GCN-NEXT: - .hardware_stages: ; GCN-NEXT: .ps: +; GCN-NEXT: .entry_point: _amdgpu_ps ; GCN-NEXT: .entry_point_symbol: amdpal_psenable ; GCN-NEXT: .scratch_memory_size: 0 ; GCN: .registers: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll index b225d978601a..853d221ee3aa 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll @@ -8,6 +8,7 @@ ; GCN-NEXT: amdpal.pipelines: ; GCN-NEXT: - .hardware_stages: ; GCN-NEXT: .vs: +; GCN-NEXT: .entry_point: _amdgpu_vs ; GCN-NEXT: .entry_point_symbol: vs_amdpal ; GCN-NEXT: .scratch_memory_size: 0 ; GCN: .registers: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll index 97fcf0606b5b..171df029615e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal.ll @@ -86,6 +86,7 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, ; PAL-NEXT: amdpal.pipelines: ; PAL-NEXT: - .hardware_stages: ; PAL-NEXT: .cs: +; PAL-NEXT: .entry_point: _amdgpu_cs ; PAL-NEXT: .entry_point_symbol: scratch2_cs ; PAL-NEXT: .scratch_memory_size: 0x10 ; PAL-NEXT: .sgpr_count: 0x diff --git a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll new file mode 100644 index 000000000000..2c6aabec7633 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +; Make sure stack use isn't introduced for these bitcasts. 
+ +define i160 @bitcast_v5i32_to_i160(<5 x i32> %vec) { +; GFX9-LABEL: bitcast_v5i32_to_i160: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v5i32_to_i160: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <5 x i32> %vec to i160 + ret i160 %bitcast +} + +define i192 @bitcast_v6i32_to_i192(<6 x i32> %vec) { +; GFX9-LABEL: bitcast_v6i32_to_i192: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v6i32_to_i192: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <6 x i32> %vec to i192 + ret i192 %bitcast +} + +define i224 @bitcast_v7i32_to_i224(<7 x i32> %vec) { +; GFX9-LABEL: bitcast_v7i32_to_i224: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v7i32_to_i224: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <7 x i32> %vec to i224 + ret i224 %bitcast +} + +define i256 @bitcast_v8i32_to_i256(<8 x i32> %vec) { +; GFX9-LABEL: bitcast_v8i32_to_i256: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v8i32_to_i256: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <8 x i32> %vec to i256 + ret i256 %bitcast +} + +define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) { +; GFX9-LABEL: bitcast_i160_to_v5i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_i160_to_v5i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i160 %int to <5 x i32> + ret <5 x i32> %bitcast +} + +define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) { +; GFX9-LABEL: bitcast_i192_to_v6i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_i192_to_v6i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i192 %int to <6 x i32> + ret <6 x i32> %bitcast +} + +define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) { +; GFX9-LABEL: bitcast_i224_to_v7i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_i224_to_v7i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i224 %int to <7 x i32> + ret <7 x i32> %bitcast +} + +define <8 x i32> @bitcast_i256_to_v8i32(i256 %int) { +; GFX9-LABEL: bitcast_i256_to_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX12-LABEL: bitcast_i256_to_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i256 %int to <8 x i32> + ret <8 x i32> %bitcast +} + +define i192 @bitcast_v3i64_to_i192(<3 x i64> %vec) { +; GFX9-LABEL: bitcast_v3i64_to_i192: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v3i64_to_i192: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <3 x i64> %vec to i192 + ret i192 %bitcast +} + +define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) { +; GFX9-LABEL: bitcast_i192_to_v3i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_i192_to_v3i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i192 %int to <3 x i64> + ret <3 x i64> %bitcast +} + +define <10 x i16> @bitcast_i160_to_v10i16(i160 %int) { +; GFX9-LABEL: bitcast_i160_to_v10i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0 +; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_i160_to_v10i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
v_bfi_b32 v0, 0xffff, v0, v0 +; GFX12-NEXT: v_bfi_b32 v2, 0xffff, v2, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i160 %int to <10 x i16> + ret <10 x i16> %bitcast +} + +define i160 @bitcast_v10i16_to_i160(<10 x i16> %vec) { +; GFX9-LABEL: bitcast_v10i16_to_i160: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v10i16_to_i160: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <10 x i16> %vec to i160 + ret i160 %bitcast +} + +define i12 @bitcast_v2i6_to_i12(<2 x i6> %vec) { +; GFX9-LABEL: bitcast_v2i6_to_i12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 6, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v2i6_to_i12: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b16 v1, 6, v1 +; GFX12-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <2 x i6> %vec to i12 + ret i12 %bitcast +} + +define <2 x i6> @bitcast_i12_to_v2i6(i12 %int) { +; GFX9-LABEL: bitcast_i12_to_v2i6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 6, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_i12_to_v2i6: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b16 v1, 6, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i12 %int to <2 x i6> + ret <2 x i6> %bitcast +} + +define i160 @bitcast_v5f32_to_i160(<5 x float> %vec) { +; GFX9-LABEL: bitcast_v5f32_to_i160: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v5f32_to_i160: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <5 x float> %vec to i160 + ret i160 %bitcast +} + +define <5 x float> @bitcast_i160_to_v5f32(i160 %int) { +; GFX9-LABEL: bitcast_i160_to_v5f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_i160_to_v5f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i160 %int to <5 x float> + ret <5 x float> %bitcast +} + +define <6 x float> @bitcast_i192_to_v6f32(i192 %int) { +; GFX9-LABEL: bitcast_i192_to_v6f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_i192_to_v6f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast i192 %int to <6 x float> + ret <6 x float> %bitcast +} + +define i192 @bitcast_v6f32_to_i192(<6 x float> %vec) { +; GFX9-LABEL: bitcast_v6f32_to_i192: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: bitcast_v6f32_to_i192: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %bitcast = bitcast <6 x float> %vec to i192 + ret i192 %bitcast +} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 7eaa52d89b9b..405058b24dcc 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -3091,15 +3091,6 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; SDAG-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 -; SDAG-NEXT: s_mov_b32 s4, s33 -; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0 -; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800 -; SDAG-NEXT: s_mov_b32 s5, s34 -; SDAG-NEXT: s_mov_b32 s34, s32 -; SDAG-NEXT: s_addk_i32 s32, 0x1800 -; SDAG-NEXT: s_mov_b32 s32, s34 -; SDAG-NEXT: s_mov_b32 s34, s5 -; SDAG-NEXT: s_mov_b32 s33, s4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3119,17 +3110,8 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: store_i160: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, s33 -; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0 -; SDAG-NEXT: s_and_b32 s33, s33, 
0xfffff800 -; SDAG-NEXT: s_mov_b32 s5, s34 -; SDAG-NEXT: s_mov_b32 s34, s32 -; SDAG-NEXT: s_addk_i32 s32, 0x1000 ; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 -; SDAG-NEXT: s_mov_b32 s32, s34 -; SDAG-NEXT: s_mov_b32 s34, s5 -; SDAG-NEXT: s_mov_b32 s33, s4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 17ab8fc780fb..6bf126af5ade 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -457,58 +457,27 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; EG-LABEL: v_ctpop_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1 +; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T6.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: AND_INT * T0.W, T8.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR * T0.W, T8.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; 
EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.X, T5.X, -; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR * T0.W, T8.Y, literal.x, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHR * T0.W, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: BCNT_INT T0.Y, PV.W, +; EG-NEXT: AND_INT * T0.W, T0.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T8.Y, T1.W, PV.W, +; EG-NEXT: BCNT_INT T0.X, PV.W, +; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.Y, -; EG-NEXT: MOV * T8.X, T4.X, %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid %val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16 @@ -601,94 +570,33 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; EG-LABEL: v_ctpop_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1 +; EG-NEXT: ALU 13, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 
T12.XYZW, T0.X, 0, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: LSHR * T0.W, T12.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT * T0.W, PV.W, -; EG-NEXT: LSHL T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T0.W, T12.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.X, T5.X, -; EG-NEXT: LSHR * T0.W, T12.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.Y, PS, PV.W, -; EG-NEXT: MOV T5.X, PV.Y, -; EG-NEXT: MOV * T0.X, T8.X, -; EG-NEXT: LSHR * T0.W, T12.Z, literal.x, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHR * T0.W, T0.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, -; EG-NEXT: 
65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV * T0.X, T9.X, -; EG-NEXT: LSHR * T0.W, T12.W, literal.x, +; EG-NEXT: BCNT_INT T0.Z, PS, +; EG-NEXT: LSHR * T1.W, T0.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: BCNT_INT T0.Y, PV.W, ; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T0.W, T12.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: BCNT_INT T0.X, PV.W, +; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T9.X, PV.W, -; EG-NEXT: MOV * T0.X, T4.X, -; EG-NEXT: MOV * T0.Z, T8.X, %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid %val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32 @@ -837,174 +745,46 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; EG-LABEL: v_ctpop_v16i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @8 -; EG-NEXT: ALU 114, @16, 
KC0[], KC1[] -; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1 +; EG-NEXT: ALU 2, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 25, @13, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T13.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1 -; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: LSHR * T0.W, T20.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT * T0.W, PV.W, -; EG-NEXT: LSHL T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T0.W, T20.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.X, T5.X, -; EG-NEXT: LSHR * T0.W, T20.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 
0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T0.W, T20.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.Y, PS, PV.W, -; EG-NEXT: MOV T5.X, PV.Y, -; EG-NEXT: MOV * T0.X, T8.X, -; EG-NEXT: LSHR * T0.W, T20.Z, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV * T0.X, T9.X, -; EG-NEXT: LSHR * T0.W, T20.W, literal.x, +; EG-NEXT: ALU clause starting at 13: +; EG-NEXT: LSHR * T0.W, T12.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: BCNT_INT T12.W, PV.W, +; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: BCNT_INT T12.Z, PS, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: LSHR * T1.W, T12.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T0.W, T20.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T12.Y, PS, +; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x, ; EG-NEXT: BCNT_INT 
T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T9.X, PV.W, -; EG-NEXT: MOV * T0.X, T12.X, -; EG-NEXT: LSHR * T1.W, T21.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T1.W, PV.W, -; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T1.W, T21.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T1.W, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV * T0.X, T13.X, -; EG-NEXT: LSHR * T1.W, T21.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T1.W, PV.W, -; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T1.W, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T20.Y, PS, PV.W, -; EG-NEXT: MOV T13.X, PV.Y, -; EG-NEXT: MOV * T0.X, T16.X, -; EG-NEXT: LSHR * T1.W, T21.Z, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T1.W, PV.W, -; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, -; EG-NEXT: ALU clause starting at 
131: -; EG-NEXT: MOV * T16.X, T1.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T1.W, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV * T0.X, T17.X, -; EG-NEXT: LSHR * T1.W, T21.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T1.W, PV.W, -; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T12.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: BCNT_INT T12.X, PS, +; EG-NEXT: BCNT_INT T0.Z, PV.Z, +; EG-NEXT: LSHR T1.W, T0.X, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: AND_INT T1.W, T21.W, literal.x, -; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) -; EG-NEXT: AND_INT T0.Z, PV.X, literal.x, -; EG-NEXT: BCNT_INT T1.W, PV.W, -; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, -; EG-NEXT: -65536(nan), 16(2.242078e-44) -; EG-NEXT: LSHR T22.X, PS, literal.x, -; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W, +; EG-NEXT: LSHR T13.X, PS, literal.x, +; EG-NEXT: BCNT_INT T0.Y, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.y, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: BCNT_INT T0.X, PV.W, +; EG-NEXT: LSHR * T14.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T17.X, PV.W, -; EG-NEXT: MOV * T0.X, T4.X, -; EG-NEXT: MOV * T0.Z, T8.X, -; EG-NEXT: MOV T20.X, T12.X, -; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212 %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid %val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32 diff 
--git a/llvm/test/CodeGen/AMDGPU/dead_copy.mir b/llvm/test/CodeGen/AMDGPU/dead_copy.mir index 2b54c61056a9..5bc42e9c4719 100644 --- a/llvm/test/CodeGen/AMDGPU/dead_copy.mir +++ b/llvm/test/CodeGen/AMDGPU/dead_copy.mir @@ -1,4 +1,5 @@ # RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -run-pass=machine-cp -verify-machineinstrs | FileCheck -check-prefix=GCN %s +# RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -passes=machine-cp | FileCheck -check-prefix=GCN %s # GCN-LABEL: dead_copy # GCN: bb.0 diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll index e91bed464136..b205678bd908 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll @@ -66,6 +66,7 @@ ; OSABI-PAL-ELF: amdpal.pipelines: ; OSABI-PAL-ELF: - .hardware_stages: ; OSABI-PAL-ELF: .cs: +; OSABI-PAL-ELF: .entry_point: _amdgpu_cs ; OSABI-PAL-ELF: .entry_point_symbol: elf_notes ; OSABI-PAL-ELF: .scratch_memory_size: 0 ; OSABI-PAL-ELF: .sgpr_count: 96 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 8704f4e78044..121891adef18 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -1025,74 +1025,67 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; EG-LABEL: v3i16_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @12, KC0[], KC1[] -; EG-NEXT: TEX 2 @6 -; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 -; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X +; EG-NEXT: ALU 0, @10, KC0[], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 +; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 -; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 -; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV 
* T5.X, 0.0, -; EG-NEXT: ALU clause starting at 13: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 44, #3 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 48, #3 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 11: ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) ; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T5.X, T2.W, PV.W, -; EG-NEXT: LSHL * T5.W, literal.x, PV.W, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: MOV * T5.Z, 0.0, -; EG-NEXT: LSHR T8.X, T0.W, literal.x, -; EG-NEXT: LSHL T0.W, T7.X, literal.y, -; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: LSHL T2.X, T2.W, PV.W, +; EG-NEXT: LSHL * T2.W, literal.x, PV.W, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT T6.X, PV.W, PS, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV T2.Y, 0.0, +; EG-NEXT: MOV * T2.Z, 0.0, +; EG-NEXT: LSHR T0.X, T0.W, literal.x, +; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: v3i16_arg: ; CM: ; %bb.0: ; %entry ; CM-NEXT: ALU 0, @12, KC0[], KC1[] -; CM-NEXT: TEX 2 @6 -; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X +; CM-NEXT: TEX 0 @8 +; CM-NEXT: ALU 13, @13, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT MSKOR T1.XW, T2.X +; CM-NEXT: ALU 1, @27, KC0[CB0:0-32], KC1[] +; CM-NEXT: TEX 0 @10 +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END -; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 -; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 -; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, 
#3 +; CM-NEXT: Fetch clause starting at 8: +; CM-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3 +; CM-NEXT: Fetch clause starting at 10: +; CM-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 ; CM-NEXT: ALU clause starting at 12: -; CM-NEXT: MOV * T5.X, 0.0, +; CM-NEXT: MOV * T0.X, 0.0, ; CM-NEXT: ALU clause starting at 13: ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, +; CM-NEXT: AND_INT T0.Z, T1.X, literal.x, ; CM-NEXT: LSHL * T1.W, PV.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) -; CM-NEXT: LSHL T5.X, PV.Z, PV.W, -; CM-NEXT: LSHL * T5.W, literal.x, PV.W, +; CM-NEXT: LSHL T1.X, PV.Z, PV.W, +; CM-NEXT: LSHL * T1.W, literal.x, PV.W, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: MOV T5.Y, 0.0, -; CM-NEXT: MOV * T5.Z, 0.0, -; CM-NEXT: LSHL T0.Z, T7.X, literal.x, -; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212 -; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, -; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; CM-NEXT: MOV T1.Y, 0.0, +; CM-NEXT: MOV * T1.Z, 0.0, +; CM-NEXT: LSHR * T2.X, T0.W, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LSHR * T8.X, T0.W, literal.x, +; CM-NEXT: ALU clause starting at 27: +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: store <3 x i16> %in, ptr addrspace(1) %out, align 4 @@ -2676,205 +2669,47 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; EG-LABEL: v8i16_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 1, @36, KC0[], KC1[] -; EG-NEXT: TEX 0 @20 -; EG-NEXT: ALU 5, @38, KC0[], KC1[] -; EG-NEXT: TEX 0 @22 -; EG-NEXT: ALU 5, @44, KC0[], KC1[] -; EG-NEXT: TEX 0 @24 -; EG-NEXT: ALU 5, @50, KC0[], KC1[] -; EG-NEXT: TEX 0 @26 -; EG-NEXT: ALU 5, @56, KC0[], KC1[] -; EG-NEXT: TEX 0 @28 -; EG-NEXT: ALU 5, @62, 
KC0[], KC1[] -; EG-NEXT: TEX 0 @30 -; EG-NEXT: ALU 5, @68, KC0[], KC1[] -; EG-NEXT: TEX 0 @32 -; EG-NEXT: ALU 5, @74, KC0[], KC1[] -; EG-NEXT: TEX 0 @34 -; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 +; EG-NEXT: ALU 0, @14, KC0[], KC1[] +; EG-NEXT: TEX 3 @6 +; EG-NEXT: ALU 4, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 20: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 -; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 -; EG-NEXT: Fetch clause starting at 24: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 -; EG-NEXT: Fetch clause starting at 26: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 -; EG-NEXT: Fetch clause starting at 28: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 -; EG-NEXT: Fetch clause starting at 30: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 -; EG-NEXT: Fetch clause starting at 32: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 -; EG-NEXT: Fetch clause starting at 34: -; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 -; EG-NEXT: ALU clause starting at 36: -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: MOV * T7.X, 0.0, -; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: LSHL T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 44: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T8.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 50: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T3.X, PV.W, -; 
EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 56: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 62: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T8.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 68: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T8.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 74: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T7.Z, PV.W, PS, -; EG-NEXT: MOV T2.X, PV.Z, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 80: -; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, -; EG-NEXT: AND_INT * T1.W, T7.X, literal.z, -; EG-NEXT: 2(2.802597e-45), -65536(nan) -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T7.X, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.X, -; EG-NEXT: MOV * T7.W, T3.X, -; EG-NEXT: MOV * T7.Y, T5.X, +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 52, #3 +; EG-NEXT: VTX_READ_16 T2.X, T0.X, 54, #3 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 62, #3 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 15: +; EG-NEXT: MOV T1.Y, T2.X, +; EG-NEXT: MOV * T1.Z, T0.X, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, T3.X, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
; ; CM-LABEL: v8i16_arg: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 1, @36, KC0[], KC1[] -; CM-NEXT: TEX 0 @20 -; CM-NEXT: ALU 5, @38, KC0[], KC1[] -; CM-NEXT: TEX 0 @22 -; CM-NEXT: ALU 5, @44, KC0[], KC1[] -; CM-NEXT: TEX 0 @24 -; CM-NEXT: ALU 5, @50, KC0[], KC1[] -; CM-NEXT: TEX 0 @26 -; CM-NEXT: ALU 5, @56, KC0[], KC1[] -; CM-NEXT: TEX 0 @28 -; CM-NEXT: ALU 5, @62, KC0[], KC1[] -; CM-NEXT: TEX 0 @30 -; CM-NEXT: ALU 5, @68, KC0[], KC1[] -; CM-NEXT: TEX 0 @32 -; CM-NEXT: ALU 5, @74, KC0[], KC1[] -; CM-NEXT: TEX 0 @34 -; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X +; CM-NEXT: ALU 0, @14, KC0[], KC1[] +; CM-NEXT: TEX 3 @6 +; CM-NEXT: ALU 4, @15, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X ; CM-NEXT: CF_END ; CM-NEXT: PAD -; CM-NEXT: Fetch clause starting at 20: -; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 -; CM-NEXT: Fetch clause starting at 22: -; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 -; CM-NEXT: Fetch clause starting at 24: -; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 -; CM-NEXT: Fetch clause starting at 26: -; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 -; CM-NEXT: Fetch clause starting at 28: -; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 -; CM-NEXT: Fetch clause starting at 30: -; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 -; CM-NEXT: Fetch clause starting at 32: -; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 -; CM-NEXT: Fetch clause starting at 34: -; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 -; CM-NEXT: ALU clause starting at 36: -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: MOV * T7.X, 0.0, -; CM-NEXT: ALU clause starting at 38: -; CM-NEXT: LSHL T0.Z, T8.X, literal.x, -; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, -; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T5.X, -; CM-NEXT: ALU clause starting at 44: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T8.X, literal.y, -; CM-NEXT: 
65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T5.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 50: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T5.X, -; CM-NEXT: ALU clause starting at 56: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T5.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 62: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T8.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T4.X, -; CM-NEXT: ALU clause starting at 68: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T8.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T4.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 74: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.Z, -; CM-NEXT: MOV * T0.Y, T4.X, -; CM-NEXT: ALU clause starting at 80: -; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x, -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, -; CM-NEXT: AND_INT * T0.W, T7.X, literal.z, -; CM-NEXT: 2(2.802597e-45), -65536(nan) -; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, -; CM-NEXT: MOV T4.X, PV.X, -; CM-NEXT: MOV * T7.W, T3.X, -; CM-NEXT: MOV * T7.Y, T5.X, +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_16 T1.X, T0.X, 52, #3 +; CM-NEXT: VTX_READ_16 T2.X, T0.X, 54, #3 
+; CM-NEXT: VTX_READ_16 T3.X, T0.X, 62, #3 +; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 +; CM-NEXT: ALU clause starting at 14: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 15: +; CM-NEXT: MOV T1.Y, T2.X, +; CM-NEXT: MOV * T1.Z, T0.X, BS:VEC_120/SCL_212 +; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; CM-NEXT: MOV * T1.W, T3.X, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: store <8 x i16> %in, ptr addrspace(1) %out ret void @@ -3618,392 +3453,68 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; EG-LABEL: v16i16_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 1, @68, KC0[], KC1[] -; EG-NEXT: TEX 0 @36 -; EG-NEXT: ALU 5, @70, KC0[], KC1[] -; EG-NEXT: TEX 0 @38 -; EG-NEXT: ALU 5, @76, KC0[], KC1[] -; EG-NEXT: TEX 0 @40 -; EG-NEXT: ALU 5, @82, KC0[], KC1[] -; EG-NEXT: TEX 0 @42 -; EG-NEXT: ALU 5, @88, KC0[], KC1[] -; EG-NEXT: TEX 0 @44 -; EG-NEXT: ALU 5, @94, KC0[], KC1[] -; EG-NEXT: TEX 0 @46 -; EG-NEXT: ALU 5, @100, KC0[], KC1[] -; EG-NEXT: TEX 0 @48 -; EG-NEXT: ALU 5, @106, KC0[], KC1[] -; EG-NEXT: TEX 0 @50 -; EG-NEXT: ALU 5, @112, KC0[], KC1[] -; EG-NEXT: TEX 0 @52 -; EG-NEXT: ALU 5, @118, KC0[], KC1[] -; EG-NEXT: TEX 0 @54 -; EG-NEXT: ALU 5, @124, KC0[], KC1[] -; EG-NEXT: TEX 0 @56 -; EG-NEXT: ALU 5, @130, KC0[], KC1[] -; EG-NEXT: TEX 0 @58 -; EG-NEXT: ALU 5, @136, KC0[], KC1[] -; EG-NEXT: TEX 0 @60 -; EG-NEXT: ALU 5, @142, KC0[], KC1[] -; EG-NEXT: TEX 0 @62 -; EG-NEXT: ALU 5, @148, KC0[], KC1[] -; EG-NEXT: TEX 0 @64 -; EG-NEXT: ALU 5, @154, KC0[], KC1[] -; EG-NEXT: TEX 0 @66 -; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1 +; EG-NEXT: ALU 0, @22, KC0[], KC1[] +; EG-NEXT: TEX 7 @6 +; EG-NEXT: ALU 10, @23, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 36: 
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 -; EG-NEXT: Fetch clause starting at 38: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 -; EG-NEXT: Fetch clause starting at 40: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 -; EG-NEXT: Fetch clause starting at 42: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 -; EG-NEXT: Fetch clause starting at 44: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 -; EG-NEXT: Fetch clause starting at 46: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 -; EG-NEXT: Fetch clause starting at 48: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 -; EG-NEXT: Fetch clause starting at 50: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 -; EG-NEXT: Fetch clause starting at 52: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 -; EG-NEXT: Fetch clause starting at 54: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 -; EG-NEXT: Fetch clause starting at 56: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 -; EG-NEXT: Fetch clause starting at 58: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 -; EG-NEXT: Fetch clause starting at 60: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 -; EG-NEXT: Fetch clause starting at 62: -; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 -; EG-NEXT: Fetch clause starting at 64: -; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 -; EG-NEXT: Fetch clause starting at 66: -; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 -; EG-NEXT: ALU clause starting at 68: -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: MOV * T11.X, 0.0, -; EG-NEXT: ALU clause starting at 70: -; EG-NEXT: LSHL T0.W, T12.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 76: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T12.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T7.X, -; EG-NEXT: ALU clause 
starting at 82: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T12.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T7.X, PV.W, -; EG-NEXT: MOV * T0.Y, T9.X, -; EG-NEXT: ALU clause starting at 88: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T12.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.W, -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: ALU clause starting at 94: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 100: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T7.X, -; EG-NEXT: ALU clause starting at 106: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T7.X, PV.W, -; EG-NEXT: MOV * T0.Y, T9.X, -; EG-NEXT: ALU clause starting at 112: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 118: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T12.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 124: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T12.X, literal.y, -; 
EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV * T0.Y, T6.X, -; EG-NEXT: ALU clause starting at 130: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T12.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T6.X, PV.W, -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: ALU clause starting at 136: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T1.W, T12.X, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 142: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T12.Z, PV.W, PS, -; EG-NEXT: MOV T2.X, PV.Z, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 148: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T12.X, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.X, -; EG-NEXT: MOV * T0.Y, T6.X, -; EG-NEXT: ALU clause starting at 154: -; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T13.X, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T11.Z, PV.W, PS, -; EG-NEXT: MOV T6.X, PV.Z, -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: ALU clause starting at 160: -; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 84, #3 +; EG-NEXT: VTX_READ_16 T2.X, T0.X, 86, #3 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 94, #3 +; EG-NEXT: VTX_READ_16 T4.X, T0.X, 78, #3 +; EG-NEXT: VTX_READ_16 T5.X, T0.X, 76, #3 +; EG-NEXT: VTX_READ_16 T6.X, T0.X, 92, #3 +; EG-NEXT: VTX_READ_16 T7.X, T0.X, 68, #3 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 70, #3 +; EG-NEXT: ALU clause starting at 
22: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 23: +; EG-NEXT: MOV T1.Y, T2.X, +; EG-NEXT: MOV * T7.Y, T0.X, +; EG-NEXT: MOV * T1.Z, T6.X, +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV T7.Z, T5.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T14.X, PV.W, literal.x, -; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, -; EG-NEXT: AND_INT * T1.W, T11.X, literal.z, -; EG-NEXT: 2(2.802597e-45), -65536(nan) -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T11.X, PV.W, PS, -; EG-NEXT: MOV T8.X, PV.X, -; EG-NEXT: MOV * T12.W, T3.X, -; EG-NEXT: MOV T12.Y, T5.X, -; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212 -; EG-NEXT: MOV * T11.Y, T9.X, +; EG-NEXT: LSHR T2.X, PV.W, literal.x, +; EG-NEXT: MOV T7.W, T4.X, +; EG-NEXT: MOV * T1.W, T3.X, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: v16i16_arg: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 1, @68, KC0[], KC1[] -; CM-NEXT: TEX 0 @36 -; CM-NEXT: ALU 5, @70, KC0[], KC1[] -; CM-NEXT: TEX 0 @38 -; CM-NEXT: ALU 5, @76, KC0[], KC1[] -; CM-NEXT: TEX 0 @40 -; CM-NEXT: ALU 5, @82, KC0[], KC1[] -; CM-NEXT: TEX 0 @42 -; CM-NEXT: ALU 5, @88, KC0[], KC1[] -; CM-NEXT: TEX 0 @44 -; CM-NEXT: ALU 5, @94, KC0[], KC1[] -; CM-NEXT: TEX 0 @46 -; CM-NEXT: ALU 5, @100, KC0[], KC1[] -; CM-NEXT: TEX 0 @48 -; CM-NEXT: ALU 5, @106, KC0[], KC1[] -; CM-NEXT: TEX 0 @50 -; CM-NEXT: ALU 5, @112, KC0[], KC1[] -; CM-NEXT: TEX 0 @52 -; CM-NEXT: ALU 5, @118, KC0[], KC1[] -; CM-NEXT: TEX 0 @54 -; CM-NEXT: ALU 5, @124, KC0[], KC1[] -; CM-NEXT: TEX 0 @56 -; CM-NEXT: ALU 5, @130, KC0[], KC1[] -; CM-NEXT: TEX 0 @58 -; CM-NEXT: ALU 5, @136, KC0[], KC1[] -; CM-NEXT: TEX 0 @60 -; CM-NEXT: ALU 5, @142, KC0[], KC1[] -; CM-NEXT: TEX 0 @62 -; CM-NEXT: ALU 5, @148, KC0[], KC1[] -; CM-NEXT: TEX 0 @64 -; CM-NEXT: ALU 5, @154, KC0[], KC1[] -; CM-NEXT: TEX 0 @66 -; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X -; 
CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X +; CM-NEXT: ALU 0, @22, KC0[], KC1[] +; CM-NEXT: TEX 7 @6 +; CM-NEXT: ALU 11, @23, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T2.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X ; CM-NEXT: CF_END -; CM-NEXT: Fetch clause starting at 36: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 -; CM-NEXT: Fetch clause starting at 38: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 -; CM-NEXT: Fetch clause starting at 40: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 -; CM-NEXT: Fetch clause starting at 42: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 -; CM-NEXT: Fetch clause starting at 44: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 -; CM-NEXT: Fetch clause starting at 46: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 -; CM-NEXT: Fetch clause starting at 48: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 -; CM-NEXT: Fetch clause starting at 50: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 -; CM-NEXT: Fetch clause starting at 52: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 -; CM-NEXT: Fetch clause starting at 54: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 -; CM-NEXT: Fetch clause starting at 56: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 -; CM-NEXT: Fetch clause starting at 58: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 -; CM-NEXT: Fetch clause starting at 60: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 -; CM-NEXT: Fetch clause starting at 62: -; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 -; CM-NEXT: Fetch clause starting at 64: -; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 -; CM-NEXT: Fetch clause starting at 66: -; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 -; CM-NEXT: ALU clause starting at 68: -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: MOV * T11.X, 0.0, -; CM-NEXT: ALU clause starting at 70: -; CM-NEXT: LSHL T0.Z, T12.X, literal.x, -; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, -; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, -; CM-NEXT: MOV T3.X, PV.W, -; 
CM-NEXT: MOV * T0.Y, T5.X, -; CM-NEXT: ALU clause starting at 76: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T12.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T5.X, PV.W, -; CM-NEXT: MOV * T0.Y, T7.X, -; CM-NEXT: ALU clause starting at 82: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T12.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T7.X, PV.W, -; CM-NEXT: MOV * T0.Y, T9.X, -; CM-NEXT: ALU clause starting at 88: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T12.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T9.X, PV.W, -; CM-NEXT: MOV * T0.Y, T3.X, -; CM-NEXT: ALU clause starting at 94: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T3.X, PV.W, -; CM-NEXT: MOV * T0.Y, T5.X, -; CM-NEXT: ALU clause starting at 100: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T5.X, PV.W, -; CM-NEXT: MOV * T0.Y, T7.X, -; CM-NEXT: ALU clause starting at 106: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T7.X, PV.W, -; CM-NEXT: MOV * T0.Y, T9.X, -; CM-NEXT: ALU clause starting at 112: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T9.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 118: -; CM-NEXT: AND_INT T0.Z, T0.Y, 
literal.x, -; CM-NEXT: LSHL * T0.W, T12.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.W, -; CM-NEXT: MOV * T0.Y, T4.X, -; CM-NEXT: ALU clause starting at 124: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T12.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T4.X, PV.W, -; CM-NEXT: MOV * T0.Y, T6.X, -; CM-NEXT: ALU clause starting at 130: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T12.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T6.X, PV.W, -; CM-NEXT: MOV * T0.Y, T8.X, -; CM-NEXT: ALU clause starting at 136: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: LSHL * T0.W, T12.X, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, -; CM-NEXT: MOV T8.X, PV.W, -; CM-NEXT: MOV * T0.Y, T2.X, -; CM-NEXT: ALU clause starting at 142: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W, -; CM-NEXT: MOV T2.X, PV.Z, -; CM-NEXT: MOV * T0.Y, T4.X, -; CM-NEXT: ALU clause starting at 148: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W, -; CM-NEXT: MOV T4.X, PV.X, -; CM-NEXT: MOV * T0.Y, T6.X, -; CM-NEXT: ALU clause starting at 154: -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, -; CM-NEXT: AND_INT * T0.W, T13.X, literal.y, -; CM-NEXT: -65536(nan), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W, -; CM-NEXT: MOV T6.X, PV.Z, -; CM-NEXT: MOV * T0.Y, T8.X, -; CM-NEXT: ALU clause starting at 160: +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_16 T1.X, T0.X, 84, #3 +; CM-NEXT: VTX_READ_16 T2.X, T0.X, 86, 
#3 +; CM-NEXT: VTX_READ_16 T3.X, T0.X, 78, #3 +; CM-NEXT: VTX_READ_16 T4.X, T0.X, 94, #3 +; CM-NEXT: VTX_READ_16 T5.X, T0.X, 76, #3 +; CM-NEXT: VTX_READ_16 T6.X, T0.X, 92, #3 +; CM-NEXT: VTX_READ_16 T7.X, T0.X, 68, #3 +; CM-NEXT: VTX_READ_16 T0.X, T0.X, 70, #3 +; CM-NEXT: ALU clause starting at 22: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 23: +; CM-NEXT: MOV * T1.Y, T2.X, +; CM-NEXT: MOV T7.Y, T0.X, +; CM-NEXT: MOV T1.Z, T6.X, BS:VEC_120/SCL_212 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR * T13.X, PV.W, literal.x, +; CM-NEXT: LSHR T0.X, PV.W, literal.x, +; CM-NEXT: MOV T7.Z, T5.X, +; CM-NEXT: MOV * T1.W, T4.X, BS:VEC_120/SCL_212 +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x, +; CM-NEXT: MOV * T7.W, T3.X, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x, -; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, -; CM-NEXT: AND_INT * T0.W, T11.X, literal.z, -; CM-NEXT: 2(2.802597e-45), -65536(nan) -; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W, -; CM-NEXT: MOV T8.X, PV.X, -; CM-NEXT: MOV * T12.W, T3.X, -; CM-NEXT: MOV T12.Y, T5.X, -; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212 -; CM-NEXT: MOV * T11.Y, T9.X, entry: store <16 x i16> %in, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 2afac4e90aa4..458afa4d6aad 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -212,38 +212,32 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; EG-LABEL: constant_load_v3i16: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 2 @6 -; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 -; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 +; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1 -; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1 -; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1 -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV * T5.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 13: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) ; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T5.X, T2.W, PV.W, -; EG-NEXT: LSHL * T5.W, literal.x, PV.W, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: MOV * T5.Z, 0.0, -; EG-NEXT: LSHR T8.X, T0.W, literal.x, -; EG-NEXT: LSHL T0.W, T7.X, literal.y, -; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: LSHL T2.X, T2.W, PV.W, +; EG-NEXT: LSHL * T2.W, literal.x, PV.W, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT T6.X, PV.W, PS, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV T2.Y, 0.0, +; EG-NEXT: MOV * T2.Z, 0.0, +; EG-NEXT: LSHR T0.X, T0.W, literal.x, +; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_load_v3i16: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index b945c7c3def6..c608bef3f726 100644 --- 
a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -9491,50 +9491,24 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; EG-LABEL: constant_zextload_v4i8_to_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1 +; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 +; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: MOV * T7.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: AND_INT T0.W, T7.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T7.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T7.X, literal.x, +; EG-NEXT: BFE_UINT * T4.Y, T4.X, literal.x, PV.W, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: 
AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T8.Y, PV.W, PS, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.Y, -; EG-NEXT: MOV * T8.X, T4.X, +; EG-NEXT: AND_INT T4.X, T4.X, literal.x, +; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: @@ -9633,56 +9607,23 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; EG-LABEL: constant_sextload_v4i8_to_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1 +; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 +; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: MOV * T7.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T7.X, literal.x, +; EG-NEXT: MOV * T4.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, +; EG-NEXT: LSHR T0.W, T4.X, literal.x, +; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) +; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, 
literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T7.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T8.Y, PV.W, PS, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.Y, -; EG-NEXT: MOV * T8.X, T4.X, ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: @@ -9800,80 +9741,27 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; EG-LABEL: constant_zextload_v8i8_to_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1 +; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 +; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: MOV * T11.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: AND_INT 
T0.W, T11.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T11.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), -65536(nan) -; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T11.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T12.Y, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T11.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT 
* T0.W, PV.W, T0.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: BFE_UINT * T6.W, T5.Y, literal.x, PV.W, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T12.W, PV.W, PS, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T12.X, T8.X, -; EG-NEXT: MOV * T12.Z, T4.X, +; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W, +; EG-NEXT: AND_INT * T6.Z, T5.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) +; EG-NEXT: AND_INT T6.X, T5.X, literal.x, +; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, +; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: @@ -10017,93 +9905,28 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; EG-LABEL: constant_sextload_v8i8_to_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1 +; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 +; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: MOV * T11.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T8.X, 
PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T11.X, literal.x, +; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T11.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T12.Y, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T5.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T5.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, 
literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T11.Y, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T12.W, PV.W, PS, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T12.X, T8.X, -; EG-NEXT: MOV * T12.Z, T4.X, +; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, +; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: @@ -10296,146 +10119,37 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; EG-LABEL: constant_zextload_v16i8_to_v16i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 103, @12, KC0[], KC1[] -; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1 +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1 ; EG-NEXT: CF_END -; 
EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.Y, T16.X, -; EG-NEXT: MOV * T19.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: AND_INT T0.W, T19.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T16.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T19.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), -65536(nan) -; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T19.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T20.Y, PV.W, PS, -; EG-NEXT: MOV T17.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T12.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T19.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; 
EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T19.Y, literal.x, +; EG-NEXT: BFE_UINT * T8.W, T7.Y, literal.x, PV.W, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T20.W, PV.W, PS, -; EG-NEXT: MOV T13.X, PV.W, -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T19.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T19.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T19.Y, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: 
MOV * T0.Y, T4.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T19.W, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T19.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 116: -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR T0.W, T19.W, literal.x, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: LSHR T21.X, PS, literal.x, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.y, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.z, -; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) -; EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00) -; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T19.W, PV.W, PS, +; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W, +; EG-NEXT: AND_INT T8.Z, T7.Y, literal.y, +; EG-NEXT: BFE_UINT * T9.W, T7.W, literal.x, T0.W, +; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) +; EG-NEXT: AND_INT T8.X, T7.X, literal.x, +; EG-NEXT: BFE_UINT T9.Y, T7.Z, literal.y, T0.W, +; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT * T9.Z, T7.W, literal.x, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T9.X, T7.Z, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) +; 
EG-NEXT: LSHR * T10.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T20.X, T16.X, -; EG-NEXT: MOV * T20.Z, T12.X, -; EG-NEXT: MOV T19.X, T8.X, -; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: @@ -10683,173 +10397,38 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; EG-LABEL: constant_sextload_v16i8_to_v16i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 104, @12, KC0[], KC1[] -; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1 +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.Y, T16.X, -; EG-NEXT: MOV * T19.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T16.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T19.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV T0.Y, T17.X, 
-; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T19.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T20.Y, PV.W, PS, -; EG-NEXT: MOV T17.X, PV.Y, -; EG-NEXT: MOV T0.Y, T12.X, -; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T19.Y, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) 
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T20.W, PV.W, PS, -; EG-NEXT: MOV T13.X, PV.W, -; EG-NEXT: MOV T0.Y, T8.X, -; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x, +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: BFE_INT * T8.Z, T7.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, +; EG-NEXT: BFE_INT T8.X, T7.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T9.Z, T7.W, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T7.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T19.Z, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: 
ALU clause starting at 117: -; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T19.W, literal.x, +; EG-NEXT: BFE_INT T9.X, T7.Z, 0.0, literal.x, +; EG-NEXT: LSHR T0.Z, T7.W, literal.x, +; EG-NEXT: BFE_INT T8.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: LSHR * T0.W, T19.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR T0.W, T19.W, literal.x, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) -; EG-NEXT: LSHR T21.X, PS, literal.x, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.y, -; EG-NEXT: LSHL * T0.W, PV.W, literal.z, -; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, +; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.y, +; EG-NEXT: LSHR T1.Z, T7.Z, literal.y, +; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.y, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, +; 
EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T19.W, PV.W, PS, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T20.X, T16.X, -; EG-NEXT: MOV * T20.Z, T12.X, -; EG-NEXT: MOV T19.X, T8.X, -; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T10.X, PS, literal.x, +; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: @@ -11194,276 +10773,58 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; EG-LABEL: constant_zextload_v32i8_to_v32i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @10 -; EG-NEXT: ALU 103, @16, KC0[], KC1[] -; EG-NEXT: ALU 104, @120, KC0[], KC1[] -; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 +; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @8 +; EG-NEXT: ALU 37, @13, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T12.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 -; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T0.Y, T16.X, -; EG-NEXT: MOV * T35.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: AND_INT T0.W, T37.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 255(3.573311e-43), -65536(nan) -; EG-NEXT: OR_INT * T0.W, 
PS, PV.W, -; EG-NEXT: MOV * T16.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T37.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 +; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 13: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), -65536(nan) -; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.X, literal.x, +; EG-NEXT: BFE_UINT * T13.W, T11.Y, literal.x, PV.W, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.Y, PV.W, PS, -; EG-NEXT: MOV T17.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T12.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; 
EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.W, PV.W, PS, -; EG-NEXT: MOV T13.X, PV.W, -; EG-NEXT: MOV * T0.Y, T8.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.Y, PV.W, PS, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T37.W, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T37.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: 
AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 120: -; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T37.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T32.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T35.X, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T32.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T35.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T32.X, PV.W, -; EG-NEXT: MOV T0.Y, T33.X, -; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T33.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T35.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T38.Y, PV.W, 
PS, -; EG-NEXT: MOV T33.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T28.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T28.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T35.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T28.X, PV.W, -; EG-NEXT: MOV T0.Y, T29.X, -; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T29.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T35.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T38.W, PV.W, PS, -; EG-NEXT: MOV T29.X, PV.W, -; EG-NEXT: MOV * T0.Y, T24.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T24.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T35.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T24.X, PV.W, -; EG-NEXT: MOV T0.Y, T25.X, -; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W, +; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W, +; EG-NEXT: AND_INT T13.Z, T11.Y, literal.y, +; EG-NEXT: BFE_UINT * T14.W, T11.W, literal.x, T0.W, +; 
EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) +; EG-NEXT: AND_INT T13.X, T11.X, literal.x, +; EG-NEXT: BFE_UINT T14.Y, T11.Z, literal.y, T0.W, +; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T14.Z, T11.W, literal.x, +; EG-NEXT: BFE_UINT * T15.W, T12.Y, literal.y, T0.W, +; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: AND_INT T14.X, T11.Z, literal.x, +; EG-NEXT: BFE_UINT T15.Y, T12.X, literal.y, T0.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, -; EG-NEXT: MOV * T25.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T35.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T35.Y, PV.W, PS, -; EG-NEXT: MOV T25.X, PV.Y, -; EG-NEXT: MOV * T0.Y, T20.X, -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T35.W, literal.y, -; EG-NEXT: -65536(nan), 255(3.573311e-43) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV * T20.X, PV.W, -; EG-NEXT: ALU clause starting at 225: -; EG-NEXT: MOV T0.Y, T20.X, -; EG-NEXT: LSHL * T1.W, T35.W, literal.x, +; EG-NEXT: LSHR T16.X, PV.W, literal.x, +; EG-NEXT: AND_INT T15.Z, T12.Y, literal.y, +; EG-NEXT: BFE_UINT T17.W, T12.W, literal.z, T0.W, +; EG-NEXT: AND_INT * T15.X, T12.X, literal.y, +; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43) ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T1.W, PV.W, PS, -; EG-NEXT: MOV T20.X, PV.W, -; EG-NEXT: MOV T0.Y, 
T21.X, -; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, -; EG-NEXT: MOV * T21.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: BFE_UINT T17.Y, T12.Z, literal.x, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) +; EG-NEXT: LSHR T12.X, PV.W, literal.x, +; EG-NEXT: AND_INT T17.Z, T12.W, literal.y, +; EG-NEXT: AND_INT * T17.X, T12.Z, literal.y, +; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43) ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, -; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: LSHR T0.W, T35.W, literal.x, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) -; EG-NEXT: LSHR T41.X, PS, literal.x, -; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y, -; EG-NEXT: AND_INT T0.W, PV.W, literal.z, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, -; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) -; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44) -; EG-NEXT: LSHR T42.X, PS, literal.x, -; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LSHR * T18.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T21.X, PV.W, -; EG-NEXT: MOV * T36.X, T16.X, -; EG-NEXT: MOV * T36.Z, T12.X, -; EG-NEXT: MOV T37.X, T8.X, -; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 -; EG-NEXT: MOV * T38.X, T32.X, -; EG-NEXT: MOV * T38.Z, T28.X, -; EG-NEXT: MOV T35.X, T24.X, -; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: @@ -11919,331 +11280,60 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; EG-LABEL: 
constant_sextload_v32i8_to_v32i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @10 -; EG-NEXT: ALU 104, @16, KC0[], KC1[] -; EG-NEXT: ALU 104, @121, KC0[], KC1[] -; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 +; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @8 +; EG-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 -; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T0.Y, T16.X, -; EG-NEXT: MOV * T35.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T16.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T37.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T16.X, PV.W, -; EG-NEXT: MOV T0.Y, T17.X, -; EG-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT 
T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T17.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T37.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T36.Y, PV.W, PS, -; EG-NEXT: MOV T17.X, PV.Y, -; EG-NEXT: MOV T0.Y, T12.X, -; EG-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T12.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T37.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T12.X, PV.W, -; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: LSHR * T0.W, T37.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T13.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T37.Y, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 
16(2.242078e-44) -; EG-NEXT: OR_INT * T36.W, PV.W, PS, -; EG-NEXT: MOV T13.X, PV.W, -; EG-NEXT: MOV T0.Y, T8.X, -; EG-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T8.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T37.Z, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T8.X, PV.W, -; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: LSHR * T0.W, T37.Z, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T9.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T37.Z, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: ALU clause starting at 121: -; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W, -; EG-NEXT: MOV T9.X, PV.Y, -; EG-NEXT: MOV T0.Y, T4.X, -; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T37.W, literal.x, -; 
EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: LSHR * T0.W, T37.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T37.W, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T37.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV T0.Y, T32.X, -; EG-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T32.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T35.X, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T32.X, PV.W, -; EG-NEXT: MOV T0.Y, T33.X, -; EG-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 
0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T33.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T35.X, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T38.Y, PV.W, PS, -; EG-NEXT: MOV T33.X, PV.Y, -; EG-NEXT: MOV T0.Y, T28.X, -; EG-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T28.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T35.Y, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T28.X, PV.W, -; EG-NEXT: MOV T0.Y, T29.X, -; EG-NEXT: LSHR * T0.W, T35.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T29.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T35.Y, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 226: -; EG-NEXT: AND_INT T1.W, T0.Y, literal.x, -; EG-NEXT: LSHL * T0.W, T0.W, literal.y, -; EG-NEXT: 
65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T38.W, PV.W, PS, -; EG-NEXT: MOV T29.X, PV.W, -; EG-NEXT: MOV T0.Y, T24.X, -; EG-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T24.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T35.Z, literal.x, +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 +; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 13: +; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: LSHR T14.X, PV.W, literal.x, +; EG-NEXT: BFE_INT * T15.Z, T11.Y, 0.0, literal.y, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: BFE_INT T15.X, T11.X, 0.0, literal.x, +; EG-NEXT: LSHR T0.Y, T12.W, literal.x, +; EG-NEXT: BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR T0.W, T12.Y, literal.x, +; EG-NEXT: LSHR * T1.W, T11.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T24.X, PV.W, -; EG-NEXT: MOV T0.Y, T25.X, -; EG-NEXT: LSHR * T0.W, T35.Z, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, 
PV.W, -; EG-NEXT: MOV * T25.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: ASHR * T0.W, T35.Z, literal.x, -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: LSHL * T0.W, PV.W, literal.y, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: OR_INT * T35.Y, PV.W, PS, -; EG-NEXT: MOV T25.X, PV.Y, -; EG-NEXT: MOV T0.Y, T20.X, -; EG-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T16.X, T11.Z, 0.0, literal.x, +; EG-NEXT: LSHR T1.Y, T11.W, literal.x, +; EG-NEXT: BFE_INT T17.Z, T12.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T15.W, PS, 0.0, literal.x, +; EG-NEXT: LSHR * T1.W, T11.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, -; EG-NEXT: -65536(nan), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV * T20.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T35.W, literal.x, +; EG-NEXT: BFE_INT T17.X, T12.X, 0.0, literal.x, +; EG-NEXT: BFE_INT T15.Y, PS, 0.0, literal.x, +; EG-NEXT: BFE_INT T18.Z, T12.W, 0.0, literal.x, +; EG-NEXT: BFE_INT T16.W, PV.Y, 0.0, literal.x, +; EG-NEXT: LSHR * T1.W, T11.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T20.X, PV.W, -; EG-NEXT: MOV T0.Y, T21.X, -; EG-NEXT: LSHR * T0.W, T35.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), -65536(nan) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T21.X, PV.W, -; EG-NEXT: MOV 
T0.Y, PV.X, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, -; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ASHR T0.W, T35.W, literal.x, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44) -; EG-NEXT: LSHR T41.X, PS, literal.x, -; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y, -; EG-NEXT: LSHL T0.W, PV.W, literal.z, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, -; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) -; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T42.X, PS, literal.x, -; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T21.X, PV.W, -; EG-NEXT: MOV * T36.X, T16.X, -; EG-NEXT: MOV * T36.Z, T12.X, -; EG-NEXT: MOV T37.X, T8.X, -; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 -; EG-NEXT: MOV * T38.X, T32.X, -; EG-NEXT: MOV * T38.Z, T28.X, -; EG-NEXT: MOV T35.X, T24.X, -; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 +; EG-NEXT: BFE_INT T18.X, T12.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x, +; EG-NEXT: LSHR T0.Z, T12.X, literal.x, +; EG-NEXT: BFE_INT T17.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) +; EG-NEXT: LSHR T11.X, PS, literal.x, +; EG-NEXT: BFE_INT T17.Y, PV.Z, 0.0, literal.y, +; EG-NEXT: LSHR T0.Z, T12.Z, literal.y, +; EG-NEXT: BFE_INT T18.W, T0.Y, 0.0, literal.y, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T12.X, PS, literal.x, +; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y, +; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 
8589158f11a7..573338231bd5 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -254,74 +254,63 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; EG-LABEL: global_load_v3i16: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 2 @6 -; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 -; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 +; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1 -; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1 -; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1 -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV * T5.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 13: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) ; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T5.X, T2.W, PV.W, -; EG-NEXT: LSHL * T5.W, literal.x, PV.W, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: MOV * T5.Z, 0.0, -; EG-NEXT: LSHR T8.X, T0.W, literal.x, -; EG-NEXT: LSHL T0.W, T7.X, literal.y, -; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: LSHL T2.X, T2.W, PV.W, +; EG-NEXT: LSHL * T2.W, literal.x, PV.W, ; EG-NEXT: 
65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT T6.X, PV.W, PS, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV T2.Y, 0.0, +; EG-NEXT: MOV * T2.Z, 0.0, +; EG-NEXT: LSHR T0.X, T0.W, literal.x, +; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: global_load_v3i16: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; CM-NEXT: TEX 2 @6 -; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X +; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; CM-NEXT: TEX 1 @6 +; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1 -; CM-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1 -; CM-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1 -; CM-NEXT: ALU clause starting at 12: -; CM-NEXT: MOV * T5.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 13: +; CM-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1 +; CM-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: MOV * T0.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 11: ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, +; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, ; CM-NEXT: LSHL * T1.W, PV.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) -; CM-NEXT: LSHL T5.X, PV.Z, PV.W, -; CM-NEXT: LSHL * T5.W, literal.x, PV.W, +; CM-NEXT: LSHL T2.X, PV.Z, PV.W, +; CM-NEXT: LSHL * T2.W, literal.x, PV.W, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: MOV T5.Y, 0.0, -; CM-NEXT: MOV * T5.Z, 0.0, -; CM-NEXT: LSHL T0.Z, T7.X, literal.x, -; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212 -; CM-NEXT: 
16(2.242078e-44), 65535(9.183409e-41) -; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, -; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; CM-NEXT: MOV T2.Y, 0.0, +; CM-NEXT: MOV * T2.Z, 0.0, +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LSHR * T8.X, T0.W, literal.x, +; CM-NEXT: LSHR * T3.X, T0.W, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %ld = load <3 x i16>, ptr addrspace(1) %in diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index fb34b5e1f3af..896e60900c74 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -916,38 +916,22 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, 
literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT +; EG: BFE_{{U?}}INT define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %load = load <32 x i8>, ptr addrspace(1) %in %ext = sext <32 x i8> %load to <32 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index e8744c7828d4..2b10d469acf5 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -6,76 +6,37 @@ 
define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addrspace(7) %out) { ; GFX12-LABEL: buffer_last_use_load_0: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10 +; GFX12-NEXT: s_mov_b32 s12, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s12 +; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 -; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40 -; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36 -; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30 -; GFX12-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_mov_b32 s8, s1 +; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX12-NEXT: s_mov_b32 s13, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 -; GFX12-NEXT: v_mov_b32_e32 v7, s6 -; GFX12-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, s1 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt 0x2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v4 -; GFX12-NEXT: v_readfirstlane_b32 s5, v5 -; GFX12-NEXT: v_readfirstlane_b32 s6, v6 -; GFX12-NEXT: v_readfirstlane_b32 s7, v7 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s8 -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX12-NEXT: s_mov_b32 s13, s2 +; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; 
GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-NEXT: ; implicit-def: $vgpr8 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_3 -; GFX12-NEXT: ; %bb.4: +; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen ; GFX12-NEXT: s_endpgm entry: %val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{} @@ -86,77 +47,38 @@ entry: define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addrspace(7) %out) { ; GFX12-LABEL: buffer_last_use_load_1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_mov_b32 s12, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s12 +; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX12-NEXT: scratch_store_b128 off, v[1:4], off offset:32 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: scratch_load_b64 v[6:7], off, off offset:40 -; GFX12-NEXT: scratch_load_b32 v5, off, off offset:36 -; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30 -; GFX12-NEXT: scratch_store_b128 off, v[8:11], off +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX12-NEXT: s_mov_b32 s8, s1 +; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX12-NEXT: s_mov_b32 s13, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 
v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: scratch_load_b64 v[2:3], off, off offset:8 -; GFX12-NEXT: scratch_load_b32 v1, off, off offset:4 -; GFX12-NEXT: v_mov_b32_e32 v8, s6 -; GFX12-NEXT: v_lshl_add_u32 v9, v0, 2, s0 +; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, s1 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v5 -; GFX12-NEXT: v_readfirstlane_b32 s5, v6 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v0, v9, s[4:7], null offen th:TH_LOAD_LU -; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 -; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v5, s8 -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v1 -; GFX12-NEXT: v_readfirstlane_b32 s5, v2 -; GFX12-NEXT: v_readfirstlane_b32 s6, v3 -; GFX12-NEXT: v_readfirstlane_b32 s7, v4 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; 
GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX12-NEXT: s_mov_b32 s13, s2 +; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_store_b32 v0, v5, s[4:7], null offen -; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 -; GFX12-NEXT: ; implicit-def: $vgpr0 -; GFX12-NEXT: ; implicit-def: $vgpr5 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_3 -; GFX12-NEXT: ; %bb.4: +; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen ; GFX12-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -169,76 +91,37 @@ entry: define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %in, ptr addrspace(7) %out) { ; GFX12-LABEL: buffer_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10 +; GFX12-NEXT: s_mov_b32 s12, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s12 +; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 -; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 -; GFX12-NEXT: s_clause 0x1 -; 
GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40 -; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36 -; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30 -; GFX12-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_mov_b32 s8, s1 +; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX12-NEXT: s_mov_b32 s13, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 -; GFX12-NEXT: v_mov_b32_e32 v7, s6 -; GFX12-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, s1 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v4 -; GFX12-NEXT: v_readfirstlane_b32 s5, v5 -; GFX12-NEXT: v_readfirstlane_b32 s6, v6 -; GFX12-NEXT: v_readfirstlane_b32 s7, v7 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 -; GFX12-NEXT: ; %bb.2: 
-; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s8 -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX12-NEXT: s_mov_b32 s13, s2 +; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-NEXT: ; implicit-def: $vgpr8 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB2_3 -; GFX12-NEXT: ; %bb.4: +; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen ; GFX12-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(7) %in, !amdgpu.last.use !{} @@ -249,76 +132,37 @@ entry: define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) %in, ptr addrspace(7) %out) { ; GFX12-LABEL: buffer_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 -; GFX12-NEXT: s_load_b32 s6, 
s[4:5], 0x10 +; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10 +; GFX12-NEXT: s_mov_b32 s12, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s12 +; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 -; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40 -; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36 -; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30 -; GFX12-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_mov_b32 s8, s1 +; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX12-NEXT: s_mov_b32 s13, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8 -; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4 -; GFX12-NEXT: v_mov_b32_e32 v7, s6 -; GFX12-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, s1 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v4 -; GFX12-NEXT: v_readfirstlane_b32 s5, v5 -; GFX12-NEXT: v_readfirstlane_b32 s6, v6 -; GFX12-NEXT: v_readfirstlane_b32 s7, v7 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 
s[4:5], v[4:5] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr9 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v4, s8 -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: .LBB3_3: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX12-NEXT: s_mov_b32 s13, s2 +; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-NEXT: ; implicit-def: $vgpr8 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB3_3 -; GFX12-NEXT: ; %bb.4: +; GFX12-NEXT: 
buffer_store_b32 v0, v1, s[4:7], null offen ; GFX12-NEXT: s_endpgm entry: %val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index a5f6c2fe5d26..a62910e4e571 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -13,30 +13,32 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) { ; GFX9-SDAG-LABEL: buffer_nontemporal_load_store: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 ; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 -; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15 -; GFX9-SDAG-NEXT: s_mov_b32 s15, s10 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s14, s7 -; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s5 -; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11] -; GFX9-SDAG-NEXT: s_mov_b32 s13, s6 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc slc +; GFX9-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] +; GFX9-SDAG-NEXT: s_mov_b32 s11, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s3, s10 +; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc ; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s10 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s10 ; 
GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s7 -; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GFX9-SDAG-NEXT: s_mov_b32 s8, s5 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s6 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] +; GFX9-SDAG-NEXT: s_mov_b32 s11, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s3, s10 +; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc +; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: buffer_nontemporal_load_store: @@ -72,68 +74,31 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX940-SDAG-LABEL: buffer_nontemporal_load_store: ; GFX940-SDAG: ; %bb.0: ; %entry ; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 -; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30 +; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x10 +; GFX940-SDAG-NEXT: s_mov_b32 s12, 0 +; GFX940-SDAG-NEXT: s_mov_b32 s7, s12 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 -; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40 -; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36 -; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 -; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8 -; GFX940-SDAG-NEXT: s_nop 0 -; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off 
offset:4 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0 -; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 -; GFX940-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7 -; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] -; GFX940-SDAG-NEXT: s_nop 0 -; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] -; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen nt -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-SDAG-NEXT: ; %bb.2: -; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8 -; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec +; GFX940-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX940-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX940-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX940-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX940-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX940-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] +; GFX940-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt +; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 +; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX940-SDAG-NEXT: s_mov_b32 s5, s12 +; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX940-SDAG-NEXT: 
s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX940-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX940-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX940-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX940-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-SDAG-NEXT: s_nop 0 -; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 nt sc1 -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8 -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_3 -; GFX940-SDAG-NEXT: ; %bb.4: +; GFX940-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 nt sc1 ; GFX940-SDAG-NEXT: s_endpgm ; ; GFX940-GISEL-LABEL: buffer_nontemporal_load_store: @@ -169,31 +134,34 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX10-SDAG-LABEL: buffer_nontemporal_load_store: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 ; GFX10-SDAG-NEXT: s_mov_b32 s10, 0 -; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15 +; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_mov_b32 s13, s10 -; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-SDAG-NEXT: s_mov_b32 s12, s7 
-; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: s_mov_b32 s12, s5 -; GFX10-SDAG-NEXT: s_mov_b32 s13, s6 -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen slc +; GFX10-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-SDAG-NEXT: s_mov_b32 s12, s1 +; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] +; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 +; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 -; GFX10-SDAG-NEXT: s_mov_b32 s9, s10 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-SDAG-NEXT: s_mov_b32 s8, s7 -; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GFX10-SDAG-NEXT: s_mov_b32 s8, s5 -; GFX10-SDAG-NEXT: s_mov_b32 s9, s6 +; GFX10-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] +; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 +; GFX10-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX10-SDAG-NEXT: s_mov_b32 s3, s10 +; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc +; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: buffer_nontemporal_load_store: @@ -229,69 +197,37 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; ; GFX11-SDAG-LABEL: buffer_nontemporal_load_store: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-SDAG-NEXT: s_load_b128 
s[8:11], s[4:5], 0x20 -; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10 +; GFX11-SDAG-NEXT: s_mov_b32 s12, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_mov_b32 s7, s12 +; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 +; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 -; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 -; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 -; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 -; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-SDAG-NEXT: 
v_readfirstlane_b32 s5, v5 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen slc dlc -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8 -; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX11-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: 
buffer_store_b32 v8, v4, s[4:7], 0 offen glc slc dlc -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8 -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_3 -; GFX11-SDAG-NEXT: ; %bb.4: +; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: buffer_nontemporal_load_store: @@ -330,76 +266,37 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; ; GFX12-SDAG-LABEL: buffer_nontemporal_load_store: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x2 +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 -; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10 +; GFX12-SDAG-NEXT: s_mov_b32 s12, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_mov_b32 s7, s12 +; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 -; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 -; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 -; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 +; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX12-SDAG-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 -; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX12-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_1 -; GFX12-SDAG-NEXT: ; %bb.2: -; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8 -; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX12-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; 
GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_3 -; GFX12-SDAG-NEXT: ; %bb.4: +; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: buffer_nontemporal_load_store: @@ -444,30 +341,32 @@ entry: define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) { ; GFX9-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 ; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 -; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15 -; GFX9-SDAG-NEXT: 
s_mov_b32 s15, s10 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s14, s7 -; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s5 -; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11] -; GFX9-SDAG-NEXT: s_mov_b32 s13, s6 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc +; GFX9-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] +; GFX9-SDAG-NEXT: s_mov_b32 s11, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s3, s10 +; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc ; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s10 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s7 -; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GFX9-SDAG-NEXT: s_mov_b32 s8, s5 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s6 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] +; GFX9-SDAG-NEXT: s_mov_b32 s11, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s3, s10 +; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -503,68 +402,31 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX940-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: ; GFX940-SDAG: ; %bb.0: ; %entry ; 
GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 -; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30 +; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x10 +; GFX940-SDAG-NEXT: s_mov_b32 s12, 0 +; GFX940-SDAG-NEXT: s_mov_b32 s7, s12 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 -; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40 -; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36 -; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 -; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8 -; GFX940-SDAG-NEXT: s_nop 0 -; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0 -; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 -; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 -; GFX940-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7 -; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] -; GFX940-SDAG-NEXT: s_nop 0 -; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] -; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-SDAG-NEXT: buffer_load_dword 
v8, v9, s[4:7], 0 offen sc0 sc1 -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-SDAG-NEXT: ; %bb.2: -; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8 -; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec +; GFX940-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX940-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX940-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX940-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX940-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX940-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] +; GFX940-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 +; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX940-SDAG-NEXT: s_mov_b32 s5, s12 +; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX940-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX940-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX940-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX940-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX940-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-SDAG-NEXT: s_nop 0 -; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 sc1 -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8 
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_3 -; GFX940-SDAG-NEXT: ; %bb.4: +; GFX940-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 ; GFX940-SDAG-NEXT: s_endpgm ; ; GFX940-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -600,31 +462,34 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 ; GFX10-SDAG-NEXT: s_mov_b32 s10, 0 -; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15 +; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_mov_b32 s13, s10 -; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-SDAG-NEXT: s_mov_b32 s12, s7 -; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: s_mov_b32 s12, s5 -; GFX10-SDAG-NEXT: s_mov_b32 s13, s6 -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-SDAG-NEXT: s_mov_b32 s12, s1 +; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] +; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 +; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 -; GFX10-SDAG-NEXT: s_mov_b32 s9, s10 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-SDAG-NEXT: 
s_mov_b32 s8, s7 -; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GFX10-SDAG-NEXT: s_mov_b32 s8, s5 -; GFX10-SDAG-NEXT: s_mov_b32 s9, s6 +; GFX10-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] +; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 +; GFX10-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX10-SDAG-NEXT: s_mov_b32 s3, s10 +; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -660,69 +525,37 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; ; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 -; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10 +; GFX11-SDAG-NEXT: s_mov_b32 s12, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_mov_b32 s7, s12 +; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 +; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 -; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 -; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 -; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 -; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen glc dlc -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8 -; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo -; 
GFX11-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen dlc -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8 -; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_3 -; GFX11-SDAG-NEXT: ; %bb.4: +; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -761,77 +594,37 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; ; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x2 +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 -; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10 
+; GFX12-SDAG-NEXT: s_mov_b32 s12, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_mov_b32 s7, s12 +; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 -; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 -; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 -; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off +; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 +; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] +; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 -; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0 +; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX12-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 -; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_1 -; GFX12-SDAG-NEXT: ; %bb.2: -; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8 -; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX12-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] +; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 +; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_or_b64 
s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 -; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 -; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_3 -; GFX12-SDAG-NEXT: ; %bb.4: +; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll index ffe9e06c04ae..5a9f53ec0077 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -330,17 +330,17 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 { ; OPT-LABEL: @memcpy_alt_type( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 +; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; OPT: loop-memcpy-expansion: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1 +; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store <2 x i32> 
[[TMP6]], ptr addrspace(1) [[TMP7]], align 1 -; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1 +; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; OPT: loop-memcpy-residual: @@ -681,13 +681,25 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2 -; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 +; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: +; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 +; OPT-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) [[TMP15]], align 2 +; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 +; OPT-NEXT: store i64 [[TMP16]], ptr addrspace(1) [[TMP17]], align 2 +; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032 +; OPT-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 2 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, 
ptr addrspace(1) [[DST]], i64 1032 +; OPT-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2 +; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036 +; OPT-NEXT: [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 2 +; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036 +; OPT-NEXT: store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 2 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038 @@ -731,13 +743,17 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2 -; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 +; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: +; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 +; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 +; OPT-NEXT: 
store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026 @@ -754,13 +770,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 -; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2 -; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 +; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: +; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024 +; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 2 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024 +; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026 @@ -804,13 +824,17 @@ define amdgpu_kernel void 
@memcpy_private_align2_private_align4_1027(ptr addrspa ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2 -; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2 -; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2 +; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: +; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 +; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 4 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 +; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 @@ -854,13 +878,17 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2 +; OPT-NEXT: 
[[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2 -; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2 -; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026 +; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4 +; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: +; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 +; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 +; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 4 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 @@ -904,13 +932,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa ; OPT: load-store-loop: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2 +; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2 -; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2 -; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026 +; OPT-NEXT: store <64 x i32> 
[[TMP2]], ptr addrspace(5) [[TMP3]], align 2 +; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 +; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT: memcpy-split: +; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024 +; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024 +; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026 @@ -958,17 +990,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 { ; OPT-LABEL: @memcpy_global_align2_global_align2_variable( -; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 1 +; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; OPT: loop-memcpy-expansion: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2 +; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 2 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]] -; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2 -; 
OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 2 +; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 2 +; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; OPT: loop-memcpy-residual: @@ -1028,17 +1060,17 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 { ; OPT-LABEL: @memcpy_local_align4_local_align4_variable( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 +; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; OPT: loop-memcpy-expansion: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4 +; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4 -; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4 +; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; OPT: loop-memcpy-residual: @@ -1063,17 +1095,17 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa 
define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 { ; OPT-LABEL: @memcpy_local_align2_local_align2_variable( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 1 +; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; OPT: loop-memcpy-expansion: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2 +; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 2 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(3) [[TMP7]], align 2 -; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 2 +; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 2 +; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; OPT: loop-memcpy-residual: @@ -1098,17 +1130,17 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 { ; OPT-LABEL: @memcpy_local_align1_local_align1_variable( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 +; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label 
[[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; OPT: loop-memcpy-expansion: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1 +; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1 -; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1 +; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; OPT: loop-memcpy-residual: @@ -1133,17 +1165,17 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 { ; OPT-LABEL: @memcpy_local_align4_global_align4_variable( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 +; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; OPT: loop-memcpy-expansion: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP5]], align 4 +; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4 ; OPT-NEXT: [[TMP7:%.*]] = 
getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4 -; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4 +; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; OPT: loop-memcpy-residual: @@ -1168,17 +1200,17 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 { ; OPT-LABEL: @memcpy_global_align4_local_align4_variable( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7 +; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]] ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; OPT: loop-memcpy-expansion: ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4 +; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]] -; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 -; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 +; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label 
[[LOOP_MEMCPY_RESIDUAL_HEADER]] ; OPT: loop-memcpy-residual: @@ -1693,10 +1725,10 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) ; ALL: load-store-loop: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]] +; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]] ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] -; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]] -; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8 +; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]] +; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; ALL: memcpy-split: @@ -1708,17 +1740,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) { ; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size( -; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 +; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; MAX1024: loop-memcpy-expansion: ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr 
inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]] +; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]] ; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] -; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]] -; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]] +; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; MAX1024: loop-memcpy-residual: @@ -1738,17 +1770,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr ; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; ; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size( -; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 +; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; ALL: loop-memcpy-expansion: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]] +; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]] ; ALL-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]] -; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]] -; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]] +; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; ALL: loop-memcpy-residual: @@ -1781,10 +1813,10 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) ; ALL: load-store-loop: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]] +; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]] ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]] -; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8 +; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]] +; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; ALL: memcpy-split: @@ -1796,17 +1828,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) { ; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size( -; MAX1024-NEXT: [[TMP2:%.*]] = and i32 
[[SIZE:%.*]], 7 +; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; MAX1024: loop-memcpy-expansion: ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]] +; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]] ; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]] -; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]] +; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; MAX1024: loop-memcpy-residual: @@ -1826,17 +1858,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr ; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; ; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size( -; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 +; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]] ; ALL: 
loop-memcpy-expansion: ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]] -; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]] +; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]] ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]] -; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]] -; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8 +; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]] +; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]] ; ALL: loop-memcpy-residual: @@ -1871,20 +1903,20 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8 +; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_INDEX]] -; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1 ; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 
[[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP6]], align 1 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_INDEX]] -; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 -; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1 +; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256 ; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -1896,7 +1928,7 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) { ; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 +; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0 ; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0 @@ -1918,11 +1950,11 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add ; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] ; OPT: memmove_bwd_main_loop: ; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] -; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8 +; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16 ; 
OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]] -; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1 +; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP10]], align 1 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_MAIN_INDEX]] -; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1 +; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1 ; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0 ; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] ; OPT: memmove_copy_forward: @@ -1930,10 +1962,10 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add ; OPT: memmove_fwd_main_loop: ; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] ; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]] -; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1 +; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP13]], align 1 ; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_MAIN_INDEX]] -; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1 -; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8 +; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1 +; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16 ; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] ; OPT: memmove_fwd_middle: @@ -1965,20 +1997,20 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], 
[[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8 +; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]] -; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1 ; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]] -; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1 -; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1 +; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256 ; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -1990,7 +2022,7 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) { ; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 +; OPT-NEXT: [[TMP2:%.*]] = and 
i32 [[SIZE:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0 ; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0 @@ -2012,11 +2044,11 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add ; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] ; OPT: memmove_bwd_main_loop: ; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] -; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8 +; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16 ; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]] -; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1 +; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]] -; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1 +; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1 ; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0 ; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] ; OPT: memmove_copy_forward: @@ -2024,10 +2056,10 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add ; OPT: memmove_fwd_main_loop: ; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] ; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]] -; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1 +; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 1 ; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]] -; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], 
align 1 -; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8 +; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1 +; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16 ; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] ; OPT: memmove_fwd_middle: @@ -2058,20 +2090,20 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ] -; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 8 +; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 256 ; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]] -; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1 +; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP2]], align 1 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]] -; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1 +; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1 ; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0 ; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]] ; ALL: memmove_fwd_loop: ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ] ; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]] -; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1 +; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP5]], align 1 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]] -; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr 
addrspace(3) [[TMP6]], align 1 -; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 8 +; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1 +; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 256 ; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256 ; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]] ; ALL: memmove_done: @@ -2083,7 +2115,7 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) { ; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size( -; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7 +; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0 ; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0 @@ -2104,11 +2136,11 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad ; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] ; OPT: memmove_bwd_main_loop: ; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ] -; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 8 +; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 16 ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]] -; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1 +; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP9]], align 1 ; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]] -; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1 +; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1 ; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0 ; 
OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] ; OPT: memmove_copy_forward: @@ -2116,10 +2148,10 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad ; OPT: memmove_fwd_main_loop: ; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] ; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]] -; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1 +; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP12]], align 1 ; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]] -; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1 -; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 8 +; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1 +; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 16 ; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]] ; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] ; OPT: memmove_fwd_middle: diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index a68d2e575607..bc8bcc622810 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -306,10 +306,10 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-LABEL: memmove_p0_p3: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v7, 7, v3 +; CHECK-NEXT: v_and_b32_e32 v7, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; CHECK-NEXT: v_and_b32_e32 v5, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v5, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmp_ne_u64_e64 
s4, 0, v[7:8] @@ -338,15 +338,15 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ds_read_b64 v[13:14], v4 -; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8 +; CHECK-NEXT: ds_read_b128 v[13:16], v4 +; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5 -; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[13:14] -; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 8 +; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16] +; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB2_5 @@ -355,7 +355,7 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB2_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 @@ -414,26 +414,26 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB2_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader -; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 -; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -8 +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; CHECK-NEXT: v_add3_u32 v2, v3, v2, -8 +; CHECK-NEXT: 
v_add3_u32 v2, v3, v2, -16 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ds_read_b64 v[3:4], v2 -; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v5, -8 -; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo -; CHECK-NEXT: v_add_co_u32 v9, vcc_lo, v0, v5 -; CHECK-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v6, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8] -; CHECK-NEXT: v_mov_b32_e32 v5, v7 -; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2 -; CHECK-NEXT: v_mov_b32_e32 v6, v8 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 +; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo +; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5 +; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4] +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 +; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: s_or_b32 s7, s4, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[3:4] +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB2_15 ; CHECK-NEXT: .LBB2_16: ; %Flow36 @@ -1043,9 +1043,9 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-LABEL: memmove_p1_p3: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v7, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 -; CHECK-NEXT: v_and_b32_e32 v5, 7, v3 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo @@ -1056,16 +1056,16 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .LBB7_2: ; 
%loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ds_read_b64 v[10:11], v9 -; CHECK-NEXT: v_add_co_u32 v12, vcc_lo, v0, s4 -; CHECK-NEXT: s_add_u32 s4, s4, 8 -; CHECK-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v1, vcc_lo +; CHECK-NEXT: ds_read_b128 v[10:13], v9 +; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4 +; CHECK-NEXT: s_add_u32 s4, s4, 16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9 +; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx2 v[12:13], v[10:11], off +; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB7_2 ; CHECK-NEXT: .LBB7_3: ; %Flow9 @@ -1076,7 +1076,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB7_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 @@ -1327,11 +1327,11 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-LABEL: memmove_p3_p0: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v5, 7, v3 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base -; CHECK-NEXT: v_and_b32_e32 v7, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6] ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, 
vcc_lo @@ -1361,16 +1361,16 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: flat_load_dwordx2 v[13:14], v[9:10] -; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8 +; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10] +; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5 -; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 8 +; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12] ; CHECK-NEXT: s_or_b32 s9, s6, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write_b64 v4, v[13:14] -; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; CHECK-NEXT: ds_write_b128 v4, v[13:16] +; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB10_5 ; CHECK-NEXT: .LBB10_6: ; %Flow34 @@ -1378,7 +1378,7 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB10_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3 ; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7 @@ -1437,23 +1437,23 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB10_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader -; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 -; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -8 +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 +; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16 ; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo -; CHECK-NEXT: v_add3_u32 v0, v3, v0, 
-8 +; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7 ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo -; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -8 +; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16 ; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo -; CHECK-NEXT: flat_load_dwordx2 v[3:4], v[3:4] +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4] ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8] ; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write_b64 v0, v[3:4] -; CHECK-NEXT: v_add_nc_u32_e32 v0, -8, v0 +; CHECK-NEXT: ds_write_b128 v0, v[3:6] +; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB10_15 ; CHECK-NEXT: .LBB10_16: ; %Flow36 @@ -1470,9 +1470,9 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align ; CHECK-LABEL: memmove_p3_p1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v7, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 -; CHECK-NEXT: v_and_b32_e32 v5, 7, v3 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo @@ -1485,14 +1485,14 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo -; CHECK-NEXT: s_add_u32 s4, s4, 8 +; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] -; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off +; CHECK-NEXT: global_load_dwordx4 v[10:13], 
v[10:11], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b64 v9, v[10:11] -; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9 +; CHECK-NEXT: ds_write_b128 v9, v[10:13] +; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB11_2 ; CHECK-NEXT: .LBB11_3: ; %Flow9 @@ -1503,7 +1503,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB11_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 @@ -1538,8 +1538,8 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_and_b32_e32 v4, 7, v2 -; CHECK-NEXT: v_and_b32_e32 v6, -8, v2 +; CHECK-NEXT: v_and_b32_e32 v4, 15, v2 +; CHECK-NEXT: v_and_b32_e32 v6, -16, v2 ; CHECK-NEXT: v_mov_b32_e32 v7, v3 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5] @@ -1563,15 +1563,15 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ds_read_b64 v[9:10], v3 -; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -8 +; CHECK-NEXT: ds_read_b128 v[9:12], v3 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5 -; CHECK-NEXT: v_add_nc_u32_e32 v3, 8, v3 +; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: ds_write_b64 v8, v[9:10] -; 
CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8 +; CHECK-NEXT: ds_write_b128 v8, v[9:12] +; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB12_5 ; CHECK-NEXT: .LBB12_6: ; %Flow41 @@ -1579,7 +1579,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB12_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_and_b32_e32 v2, -8, v2 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v2 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 @@ -1630,24 +1630,24 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB12_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader -; CHECK-NEXT: v_and_b32_e32 v5, -8, v2 +; CHECK-NEXT: v_and_b32_e32 v5, -16, v2 ; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v5 +; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4 ; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5 ; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo ; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ds_read_b64 v[5:6], v4 -; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 8 +; CHECK-NEXT: ds_read_b128 v[5:8], v4 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v4 +; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4 ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: ds_write_b64 v2, v[5:6] -; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2 +; CHECK-NEXT: ds_write_b128 v2, v[5:8] +; CHECK-NEXT: 
v_add_nc_u32_e32 v2, -16, v2 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB12_15 ; CHECK-NEXT: .LBB12_16: ; %Flow43 @@ -1664,9 +1664,9 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-LABEL: memmove_p3_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v7, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 -; CHECK-NEXT: v_and_b32_e32 v5, 7, v3 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo @@ -1679,14 +1679,14 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo -; CHECK-NEXT: s_add_u32 s4, s4, 8 +; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] -; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off +; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b64 v9, v[10:11] -; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9 +; CHECK-NEXT: ds_write_b128 v9, v[10:13] +; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB13_2 ; CHECK-NEXT: .LBB13_3: ; %Flow9 @@ -1697,7 +1697,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB13_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 
v0, v3 @@ -1735,27 +1735,30 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_and_b32_e32 v2, -8, v4 -; CHECK-NEXT: v_and_b32_e32 v5, 7, v4 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB14_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_clause 0x3 ; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v10, v7, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_add_u32 s4, s4, 8 +; CHECK-NEXT: buffer_load_dword v11, v7, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v12, v7, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7 +; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b64 v8, v[9:10] -; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8 +; CHECK-NEXT: ds_write_b128 v8, v[9:12] +; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB14_2 ; CHECK-NEXT: .LBB14_3: ; %Flow14 @@ -1766,7 +1769,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB14_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_and_b32_e32 v2, -8, v4 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: s_mov_b32 
s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 @@ -2021,25 +2024,28 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_and_b32_e32 v2, -8, v4 -; CHECK-NEXT: v_and_b32_e32 v5, 7, v4 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB17_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ds_read_b64 v[9:10], v7 -; CHECK-NEXT: s_add_u32 s4, s4, 8 +; CHECK-NEXT: ds_read_b128 v[9:12], v7 +; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7 +; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8 +; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB17_2 @@ -2051,7 +2057,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB17_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_and_b32_e32 v2, -8, v4 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: s_mov_b32 
s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index af7f92798a93..a6db7d331cef 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -828,81 +828,30 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 { ; EG-LABEL: s_test_imin_sle_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @28, KC0[], KC1[] -; EG-NEXT: TEX 1 @12 -; EG-NEXT: ALU 9, @30, KC0[], KC1[] -; EG-NEXT: TEX 1 @16 -; EG-NEXT: ALU 10, @40, KC0[], KC1[] -; EG-NEXT: TEX 1 @20 -; EG-NEXT: ALU 10, @51, KC0[], KC1[] -; EG-NEXT: TEX 1 @24 -; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1 +; EG-NEXT: ALU 0, @14, KC0[], KC1[] +; EG-NEXT: TEX 3 @6 +; EG-NEXT: ALU 9, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 -; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3 -; EG-NEXT: Fetch clause starting at 16: -; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 -; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3 -; EG-NEXT: Fetch clause starting at 20: -; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 -; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3 -; EG-NEXT: Fetch clause starting at 24: -; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 -; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3 -; EG-NEXT: ALU clause starting at 28: -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: MOV * T5.X, 0.0, -; EG-NEXT: ALU clause starting at 30: -; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, -; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W, -; EG-NEXT: LSHL T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, 
literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, PV.X, -; EG-NEXT: ALU clause starting at 40: -; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, -; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 51: -; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, -; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 46, #3 +; EG-NEXT: VTX_READ_16 T2.X, T0.X, 52, #3 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 44, #3 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 54, #3 +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 15: +; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x, +; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, PV.X, -; EG-NEXT: ALU clause starting at 62: -; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, -; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MIN_INT T0.Y, PV.Z, PV.W, +; EG-NEXT: BFE_INT T0.Z, T3.X, 0.0, literal.x, +; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: 
MIN_INT * T0.W, PV.Z, PV.W, -; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT T1.W, T0.Y, literal.y, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.z, -; EG-NEXT: 2(2.802597e-45), -65536(nan) -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T6.X, PV.W, PS, -; EG-NEXT: MOV T2.X, PV.X, -; EG-NEXT: MOV * T6.Y, T3.X, +; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: @@ -1848,49 +1797,40 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; EG-LABEL: v_test_umin_ule_v3i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @8 -; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 3 @12 -; EG-NEXT: ALU 8, @36, KC0[], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0 -; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X +; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 3 @6 +; EG-NEXT: ALU 17, @18, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT MSKOR T4.XW, T0.X ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1 -; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1 -; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1 -; EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1 -; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1 -; EG-NEXT: ALU clause starting at 20: +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T2.X, T1.X, 0, #1 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 0, #1 +; EG-NEXT: VTX_READ_16 T1.X, T1.X, 4, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 +; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; 
EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W, -; EG-NEXT: ALU clause starting at 24: +; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, +; EG-NEXT: ALU clause starting at 18: ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: LSHL T2.W, PV.W, literal.x, -; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X, +; EG-NEXT: MIN_UINT * T3.W, T0.X, T1.X, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T7.X, PS, PV.W, -; EG-NEXT: LSHL * T7.W, literal.x, PV.W, +; EG-NEXT: LSHL T4.X, PS, PV.W, +; EG-NEXT: LSHL * T4.W, literal.x, PV.W, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MOV * T7.Y, 0.0, -; EG-NEXT: ALU clause starting at 36: -; EG-NEXT: MOV T7.Z, 0.0, -; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X, +; EG-NEXT: MOV T4.Y, 0.0, +; EG-NEXT: MOV * T4.Z, 0.0, ; EG-NEXT: LSHR T0.X, T1.W, literal.x, -; EG-NEXT: LSHL T1.W, PV.W, literal.y, -; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: OR_INT T6.X, PV.W, PS, -; EG-NEXT: LSHR * T8.X, T0.W, literal.x, +; EG-NEXT: MIN_UINT * T1.X, T3.X, T2.X, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LSHR * T2.X, T0.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CI-LABEL: v_test_umin_ule_v3i16: @@ -2936,142 +2876,46 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 { ; EG-LABEL: s_test_umin_ult_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @52, KC0[], KC1[] -; EG-NEXT: TEX 1 @20 -; EG-NEXT: ALU 9, @54, KC0[], KC1[] -; EG-NEXT: TEX 1 @24 -; EG-NEXT: ALU 8, @64, KC0[], KC1[] -; EG-NEXT: TEX 1 @28 -; EG-NEXT: ALU 10, @73, KC0[], KC1[] -; EG-NEXT: TEX 1 @32 -; EG-NEXT: ALU 8, @84, KC0[], KC1[] -; EG-NEXT: TEX 1 @36 -; EG-NEXT: 
ALU 10, @93, KC0[], KC1[] -; EG-NEXT: TEX 1 @40 -; EG-NEXT: ALU 8, @104, KC0[], KC1[] -; EG-NEXT: TEX 1 @44 -; EG-NEXT: ALU 10, @113, KC0[], KC1[] -; EG-NEXT: TEX 1 @48 -; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 +; EG-NEXT: ALU 0, @24, KC0[], KC1[] +; EG-NEXT: TEX 2 @8 +; EG-NEXT: ALU 2, @25, KC0[], KC1[] +; EG-NEXT: TEX 4 @14 +; EG-NEXT: ALU 14, @28, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 20: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 -; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3 -; EG-NEXT: Fetch clause starting at 24: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 -; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3 -; EG-NEXT: Fetch clause starting at 28: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 -; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3 -; EG-NEXT: Fetch clause starting at 32: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 -; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3 -; EG-NEXT: Fetch clause starting at 36: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 -; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3 -; EG-NEXT: Fetch clause starting at 40: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 -; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3 -; EG-NEXT: Fetch clause starting at 44: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 -; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3 -; EG-NEXT: Fetch clause starting at 48: -; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3 -; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3 -; EG-NEXT: ALU clause starting at 52: -; EG-NEXT: MOV * T0.Y, T3.X, -; EG-NEXT: MOV * T7.X, 0.0, -; EG-NEXT: ALU clause starting at 54: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, -; EG-NEXT: LSHL T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; 
EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, PV.X, -; EG-NEXT: ALU clause starting at 64: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, -; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T3.X, PV.W, -; EG-NEXT: MOV * T0.Y, T2.X, -; EG-NEXT: ALU clause starting at 73: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MIN_UINT T0.W, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T2.X, PV.W, -; EG-NEXT: MOV * T0.Y, PV.X, -; EG-NEXT: ALU clause starting at 84: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, -; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T7.Z, PV.W, PS, -; EG-NEXT: MOV T2.X, PV.Z, -; EG-NEXT: MOV * T0.Y, T5.X, -; EG-NEXT: ALU clause starting at 93: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MIN_UINT T0.W, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, PV.X, -; EG-NEXT: ALU clause starting at 104: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, 
+; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 62, #3 +; EG-NEXT: VTX_READ_16 T2.X, T0.X, 60, #3 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 78, #3 +; EG-NEXT: Fetch clause starting at 14: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 68, #3 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 52, #3 +; EG-NEXT: VTX_READ_16 T4.X, T0.X, 70, #3 +; EG-NEXT: VTX_READ_16 T5.X, T0.X, 54, #3 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 76, #3 +; EG-NEXT: ALU clause starting at 24: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 25: +; EG-NEXT: AND_INT T0.W, T1.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T3.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, -; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, PV.W, PS, -; EG-NEXT: MOV T5.X, PV.W, -; EG-NEXT: MOV * T0.Y, T4.X, -; EG-NEXT: ALU clause starting at 113: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, +; EG-NEXT: ALU clause starting at 28: +; EG-NEXT: AND_INT T0.Z, T2.X, literal.x, +; EG-NEXT: AND_INT T2.W, T0.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MIN_UINT * T0.W, T0.W, T1.W, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MIN_UINT T0.W, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: MIN_UINT T0.Z, PV.Z, PV.W, +; EG-NEXT: AND_INT T1.W, T5.X, literal.x, +; EG-NEXT: AND_INT * T2.W, T4.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV * T0.Y, PV.X, -; EG-NEXT: ALU clause starting at 124: -; EG-NEXT: AND_INT T0.W, T8.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T7.X, literal.x, +; EG-NEXT: MIN_UINT T0.Y, PV.W, PS, +; EG-NEXT: AND_INT T1.W, T3.X, literal.x, +; EG-NEXT: AND_INT * T2.W, T1.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; 
EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT T2.W, T0.Y, literal.y, -; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, -; EG-NEXT: 2(2.802597e-45), -65536(nan) -; EG-NEXT: OR_INT * T7.X, PV.W, PS, -; EG-NEXT: MOV T4.X, PV.X, -; EG-NEXT: MOV * T7.W, T3.X, -; EG-NEXT: MOV * T7.Y, T5.X, +; EG-NEXT: MIN_UINT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll index c375b16ee380..7e867a537298 100644 --- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-FAKE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-FAKE16 %s define amdgpu_ps float @test_minmax_f32(float %a, float %b, float %c) { ; GFX12-LABEL: test_minmax_f32: @@ -72,30 +74,84 @@ define amdgpu_ps float @test_maxmin_commuted_f32(float %a, float %b, float %c) { } define amdgpu_ps half @test_minmax_f16(half %a, half %b, half %c) { -; GFX12-LABEL: test_minmax_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
v_maximumminimum_f16 v0, v0, v1, v2 -; GFX12-NEXT: ; return to shader part epilog +; SDAG-TRUE16-LABEL: test_minmax_f16: +; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; SDAG-FAKE16-LABEL: test_minmax_f16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2 +; SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GISEL-TRUE16-LABEL: test_minmax_f16: +; GISEL-TRUE16: ; %bb.0: +; GISEL-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l +; GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GISEL-FAKE16-LABEL: test_minmax_f16: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2 +; GISEL-FAKE16-NEXT: ; return to shader part epilog %max = call half @llvm.maximum.f16(half %a, half %b) %minmax = call half @llvm.minimum.f16(half %max, half %c) ret half %minmax } define amdgpu_ps half @test_minmax_commuted_f16(half %a, half %b, half %c) { -; GFX12-LABEL: test_minmax_commuted_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_maximumminimum_f16 v0, v0, v1, v2 -; GFX12-NEXT: ; return to shader part epilog +; SDAG-TRUE16-LABEL: test_minmax_commuted_f16: +; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; SDAG-FAKE16-LABEL: test_minmax_commuted_f16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2 +; SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GISEL-TRUE16-LABEL: test_minmax_commuted_f16: +; GISEL-TRUE16: ; %bb.0: +; GISEL-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l +; 
GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GISEL-FAKE16-LABEL: test_minmax_commuted_f16: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2 +; GISEL-FAKE16-NEXT: ; return to shader part epilog %max = call half @llvm.maximum.f16(half %a, half %b) %minmax = call half @llvm.minimum.f16(half %c, half %max) ret half %minmax } define amdgpu_ps half @test_maxmin_commuted_f16(half %a, half %b, half %c) { -; GFX12-LABEL: test_maxmin_commuted_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_minimummaximum_f16 v0, v0, v1, v2 -; GFX12-NEXT: ; return to shader part epilog +; SDAG-TRUE16-LABEL: test_maxmin_commuted_f16: +; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; SDAG-FAKE16-LABEL: test_maxmin_commuted_f16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: v_minimummaximum_f16 v0, v0, v1, v2 +; SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GISEL-TRUE16-LABEL: test_maxmin_commuted_f16: +; GISEL-TRUE16: ; %bb.0: +; GISEL-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v1.l, v2.l +; GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GISEL-FAKE16-LABEL: test_maxmin_commuted_f16: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: v_minimummaximum_f16 v0, v0, v1, v2 +; GISEL-FAKE16-NEXT: ; return to shader part epilog %min = call half @llvm.minimum.f16(half %a, half %b) %maxmin = call half @llvm.maximum.f16(half %c, half %min) ret half %maxmin diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index 774a22fb907d..954dab3d0fc6 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 
-verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck 
-check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-FAKE16 %s define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-LABEL: test_minmax_i32: @@ -467,47 +471,111 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) } define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) { -; GFX11-LABEL: test_minmax_f16_ieee_false: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: test_minmax_f16_ieee_false: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 -; GFX12-NEXT: ; return to shader part epilog +; SDAG-GFX11-TRUE16-LABEL: test_minmax_f16_ieee_false: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; SDAG-GFX11-FAKE16-LABEL: test_minmax_f16_ieee_false: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2 +; SDAG-GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GISEL-GFX11-TRUE16-LABEL: test_minmax_f16_ieee_false: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v2.l +; GISEL-GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GISEL-GFX11-FAKE16-LABEL: test_minmax_f16_ieee_false: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2 +; GISEL-GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; SDAG-GFX12-TRUE16-LABEL: test_minmax_f16_ieee_false: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; 
SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; SDAG-GFX12-FAKE16-LABEL: test_minmax_f16_ieee_false: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; SDAG-GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GISEL-GFX12-TRUE16-LABEL: test_minmax_f16_ieee_false: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l +; GISEL-GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GISEL-GFX12-FAKE16-LABEL: test_minmax_f16_ieee_false: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; GISEL-GFX12-FAKE16-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) %minmax = call half @llvm.minnum.f16(half %max, half %c) ret half %minmax } define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) { -; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; SDAG-GFX11-NEXT: s_mov_b32 s5, s4 -; SDAG-GFX11-NEXT: s_mov_b32 s4, s3 -; SDAG-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0 -; SDAG-GFX11-NEXT: global_store_b16 v1, v0, s[4:5] -; SDAG-GFX11-NEXT: s_endpgm -; -; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; GISEL-GFX11-NEXT: s_mov_b32 s6, s3 -; GISEL-GFX11-NEXT: s_mov_b32 s7, s4 -; GISEL-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0 -; GISEL-GFX11-NEXT: global_store_b16 v1, v0, s[6:7] -; GISEL-GFX11-NEXT: s_endpgm -; -; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; SDAG-GFX12-NEXT: s_mov_b32 s5, s4 -; SDAG-GFX12-NEXT: s_mov_b32 s4, s3 -; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, s0, s1, v0 -; 
SDAG-GFX12-NEXT: global_store_b16 v1, v0, s[4:5] -; SDAG-GFX12-NEXT: s_endpgm +; SDAG-GFX11-TRUE16-LABEL: s_test_minmax_f16_ieee_false: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-GFX11-TRUE16-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX11-TRUE16-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, s0, s1, v0.l +; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-GFX11-TRUE16-NEXT: s_endpgm +; +; SDAG-GFX11-FAKE16-LABEL: s_test_minmax_f16_ieee_false: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX11-FAKE16-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX11-FAKE16-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, s0, s1, v0 +; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-GFX11-FAKE16-NEXT: s_endpgm +; +; GISEL-GFX11-TRUE16-LABEL: s_test_minmax_f16_ieee_false: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX11-TRUE16-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX11-TRUE16-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, s0, s1, v0.l +; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[6:7] +; GISEL-GFX11-TRUE16-NEXT: s_endpgm +; +; GISEL-GFX11-FAKE16-LABEL: s_test_minmax_f16_ieee_false: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX11-FAKE16-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX11-FAKE16-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, s0, s1, v0 +; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[6:7] +; GISEL-GFX11-FAKE16-NEXT: s_endpgm +; +; SDAG-GFX12-TRUE16-LABEL: s_test_minmax_f16_ieee_false: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-GFX12-TRUE16-NEXT: s_mov_b32 s5, s4 +; 
SDAG-GFX12-TRUE16-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, s0, s1, v0.l +; SDAG-GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-GFX12-TRUE16-NEXT: s_endpgm +; +; SDAG-GFX12-FAKE16-LABEL: s_test_minmax_f16_ieee_false: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX12-FAKE16-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX12-FAKE16-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, s0, s1, v0 +; SDAG-GFX12-FAKE16-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-GFX12-FAKE16-NEXT: s_endpgm ; ; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false: ; GISEL-GFX12: ; %bb.0: @@ -526,136 +594,320 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b } define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { -; SDAG-GFX11-LABEL: test_minmax_commuted_f16_ieee_true: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; SDAG-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GISEL-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] -; -; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; 
SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GISEL-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l +; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h +; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v0.h +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l +; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, 
v0.h, v1.l +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 +; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; 
GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l +; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) %minmax = call half @llvm.minnum.f16(half %c, half %max) ret half %minmax } define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) { -; GFX11-LABEL: test_maxmin_f16_ieee_false: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2 -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: test_maxmin_f16_ieee_false: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 -; GFX12-NEXT: ; return to shader part epilog +; SDAG-GFX11-TRUE16-LABEL: test_maxmin_f16_ieee_false: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; SDAG-GFX11-FAKE16-LABEL: test_maxmin_f16_ieee_false: +; SDAG-GFX11-FAKE16: ; %bb.0: +; 
SDAG-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2 +; SDAG-GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GISEL-GFX11-TRUE16-LABEL: test_maxmin_f16_ieee_false: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v2.l +; GISEL-GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GISEL-GFX11-FAKE16-LABEL: test_maxmin_f16_ieee_false: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2 +; GISEL-GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; SDAG-GFX12-TRUE16-LABEL: test_maxmin_f16_ieee_false: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; SDAG-GFX12-FAKE16-LABEL: test_maxmin_f16_ieee_false: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; SDAG-GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GISEL-GFX12-TRUE16-LABEL: test_maxmin_f16_ieee_false: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l +; GISEL-GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GISEL-GFX12-FAKE16-LABEL: test_maxmin_f16_ieee_false: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; GISEL-GFX12-FAKE16-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) %maxmin = call half @llvm.maxnum.f16(half %min, half %c) ret half %maxmin } define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { -; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; SDAG-GFX11-NEXT: 
v_minmax_f16 v0, v0, v1, v2 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GISEL-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] -; -; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; SDAG-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GISEL-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l +; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h +; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v0.h +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 
s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; SDAG-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l +; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l +; GISEL-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GISEL-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 
s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 +; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l +; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l +; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) %maxmin = call half @llvm.maxnum.f16(half %c, half %min) ret half 
%maxmin } define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 { -; GFX11-LABEL: test_med3_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_med3_f16 v2, v2, v3, v4 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: test_med3_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_med3_num_f16 v2, v2, v3, v4 -; GFX12-NEXT: global_store_b16 v[0:1], v2, off -; GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: test_med3_f16: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l +; SDAG-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v2.h, v3.l +; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: test_med3_f16: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_med3_f16 v2, v2, v3, v4 +; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-TRUE16-LABEL: test_med3_f16: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v3.l, v4.l +; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: test_med3_f16: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_med3_f16 v2, v2, v3, v4 +; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; 
GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-TRUE16-LABEL: test_med3_f16: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l +; SDAG-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v2.h, v3.l +; SDAG-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: test_med3_f16: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4 +; SDAG-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-TRUE16-LABEL: test_med3_f16: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l +; GISEL-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: test_med3_f16: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4 +; 
GISEL-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %tmp0 = call half @llvm.minnum.f16(half %x, half %y) %tmp1 = call half @llvm.maxnum.f16(half %x, half %y) %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z) diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index f89341d539a0..7536e83a9da6 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -53,6 +53,7 @@ ; CHECK-NEXT: .cs: ; CHECK-NEXT: .checksum_value: 0x9444d7d0 ; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_cs ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 @@ -109,6 +110,7 @@ ; CHECK-NEXT: .wgp_mode: false ; CHECK-NEXT: .gs: ; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_gs ; CHECK-NEXT: .entry_point_symbol: gs_shader ; CHECK-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x200 @@ -120,6 +122,7 @@ ; CHECK-NEXT: .wgp_mode: true ; CHECK-NEXT: .hs: ; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_hs ; CHECK-NEXT: .entry_point_symbol: hs_shader ; CHECK-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x1000 @@ -131,6 +134,7 @@ ; CHECK-NEXT: .wgp_mode: true ; CHECK-NEXT: .ps: ; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_ps ; CHECK-NEXT: .entry_point_symbol: ps_shader ; CHECK-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0 diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll index 676ba1480e6d..efb8d836c7b3 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll @@ -7,7 +7,6 @@ ; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 
2>%t | FileCheck -check-prefixes=COMPATIBLE,REALTIME,MEMTIME %s ; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t -; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s @@ -17,7 +16,6 @@ ; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s ; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t -; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s ; Note: This test checks the IR, but also has a run line to codegen the file just to check we ; do not crash when trying to select those functions. 
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll index 75a388eb1229..038f49f30649 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll @@ -14,7 +14,6 @@ ; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s @@ -22,7 +21,6 @@ ; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s -; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s ; WARN-GFX906: removing function 'needs_wavefrontsize32': +wavefrontsize32 is not supported on the current target ; WARN-GFX906-NOT: not supported diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 7e7f4f5d1991..c9efeeefdf2d 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -681,63 +681,30 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; EG-LABEL: shl_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1 +; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T8.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1 +; EG-NEXT: VTX_READ_128 T8.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV T0.Y, T6.X, -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, T10.X, PV.W, -; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) -; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T6.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR T1.W, T10.Z, literal.x, -; EG-NEXT: LSHR * T2.W, T10.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHL T1.W, PS, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHR T1.W, T8.Z, literal.x, +; EG-NEXT: LSHR * T2.W, T8.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, -; EG-NEXT: MOV T6.X, PV.W, -; EG-NEXT: MOV * T0.X, T7.X, -; EG-NEXT: AND_INT * T1.W, T10.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL T1.W, T10.Y, PV.W, -; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, +; EG-NEXT: LSHL T0.Y, PS, PV.W, +; EG-NEXT: AND_INT T1.W, T8.Z, literal.x, +; EG-NEXT: AND_INT * T2.W, T8.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, -; 
EG-NEXT: MOV * T7.X, PV.W, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR T1.W, T10.W, literal.x, -; EG-NEXT: LSHR * T2.W, T10.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, PS, PV.W, -; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, -; EG-NEXT: LSHL T1.W, PV.W, literal.y, +; EG-NEXT: LSHL T0.X, PS, PV.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; EG-NEXT: LSHR T0.X, PS, literal.x, -; EG-NEXT: OR_INT * T10.Y, PV.Z, PV.W, +; EG-NEXT: LSHR * T8.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T7.X, PV.Y, -; EG-NEXT: MOV * T10.X, T6.X, %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index ef1adbb395e7..386a04611396 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -323,67 +323,28 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; EG-LABEL: ashr_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 48, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1 +; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XY, T8.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1 +; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.Y, T6.X, -; EG-NEXT: MOV * T9.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: BFE_INT T0.W, T9.X, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.Z, literal.y, -; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: ASHR * T0.W, PV.W, PS, -; EG-NEXT: AND_INT 
T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), -65536(nan) -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T6.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T9.X, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T9.Z, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T0.W, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV T6.X, PV.W, -; EG-NEXT: MOV T0.Y, T7.X, -; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, T9.W, literal.y, +; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: LSHR T0.Z, T7.X, literal.x, +; EG-NEXT: BFE_INT T0.W, T7.X, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, T7.Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: ASHR T0.W, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, -; EG-NEXT: -65536(nan), 0(0.000000e+00) -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, -; EG-NEXT: MOV * T7.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T9.Y, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T9.W, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T0.W, PV.W, PS, -; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: ASHR T7.X, PV.W, PS, +; EG-NEXT: BFE_INT T0.W, PV.Z, 0.0, literal.x, +; EG-NEXT: LSHR * T1.W, T7.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T9.X, KC0[2].Y, 
literal.x, -; EG-NEXT: OR_INT * T10.Y, T1.W, PV.W, +; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, +; EG-NEXT: ASHR * T7.Y, PV.W, PS, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T7.X, PV.Y, -; EG-NEXT: MOV * T10.X, T6.X, %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in, i16 1 %a = load <4 x i16>, ptr addrspace(1) %in %b = load <4 x i16>, ptr addrspace(1) %b_ptr diff --git a/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll b/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll new file mode 100644 index 000000000000..1c3091f6b8d3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll @@ -0,0 +1,140 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +; extract element 0 as shift +define i32 @cast_v4i32_to_i128_trunc_i32(<4 x i32> %arg) { +; CHECK-LABEL: cast_v4i32_to_i128_trunc_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %trunc = trunc i128 %bigint to i32 + ret i32 %trunc +} + +; extract element 1 as shift +define i32 @cast_v4i32_to_i128_lshr_32_trunc_i32(<4 x i32> %arg) { +; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %srl = lshr i128 %bigint, 32 + %trunc = trunc i128 %srl to i32 + ret i32 %trunc +} + +; extract element 2 as shift +define i32 @cast_v4i32_to_i128_lshr_64_trunc_i32(<4 x i32> %arg) { +; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %srl = lshr 
i128 %bigint, 64 + %trunc = trunc i128 %srl to i32 + ret i32 %trunc +} + +; extract element 3 as shift +define i32 @cast_v4i32_to_i128_lshr_96_trunc_i32(<4 x i32> %arg) { +; CHECK-LABEL: cast_v4i32_to_i128_lshr_96_trunc_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, v3 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %srl = lshr i128 %bigint, 96 + %trunc = trunc i128 %srl to i32 + ret i32 %trunc +} + +; Shift not aligned to element, not a simple extract +define i32 @cast_v4i32_to_i128_lshr_33_trunc_i32(<4 x i32> %arg) { +; CHECK-LABEL: cast_v4i32_to_i128_lshr_33_trunc_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_alignbit_b32 v0, v2, v1, 1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %srl = lshr i128 %bigint, 33 + %trunc = trunc i128 %srl to i32 + ret i32 %trunc +} + +; extract misaligned element +define i32 @cast_v4i32_to_i128_lshr_31_trunc_i32(<4 x i32> %arg) { +; CHECK-LABEL: cast_v4i32_to_i128_lshr_31_trunc_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_alignbit_b32 v0, v1, v0, 31 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %srl = lshr i128 %bigint, 31 + %trunc = trunc i128 %srl to i32 + ret i32 %trunc +} + +; extract misaligned element +define i32 @cast_v4i32_to_i128_lshr_48_trunc_i32(<4 x i32> %arg) { +; CHECK-LABEL: cast_v4i32_to_i128_lshr_48_trunc_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x1000706 +; CHECK-NEXT: v_perm_b32 v0, v1, v2, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %srl = lshr i128 %bigint, 48 + %trunc = trunc i128 %srl to i32 + ret i32 %trunc +} + +; extract elements 1 and 2 with shift +define i64 @cast_v4i32_to_i128_lshr_32_trunc_i64(<4 x i32> %arg) { +; CHECK-LABEL: 
cast_v4i32_to_i128_lshr_32_trunc_i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %srl = lshr i128 %bigint, 32 + %trunc = trunc i128 %srl to i64 + ret i64 %trunc +} + +; extract elements 2 and 3 with shift +define i64 @cast_v4i32_to_i128_lshr_64_trunc_i64(<4 x i32> %arg) { +; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, v3 +; CHECK-NEXT: v_mov_b32_e32 v0, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %bigint = bitcast <4 x i32> %arg to i128 + %srl = lshr i128 %bigint, 64 + %trunc = trunc i128 %srl to i64 + ret i64 %trunc +} + +; FIXME: We don't process this case because we see multiple bitcasts +; before a 32-bit build_vector +define i32 @build_vector_i16_to_shift(i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3) { +; CHECK-LABEL: build_vector_i16_to_shift: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x5040100 +; CHECK-NEXT: v_perm_b32 v0, v3, v2, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ins.0 = insertelement <4 x i16> poison, i16 %arg0, i32 0 + %ins.1 = insertelement <4 x i16> %ins.0, i16 %arg1, i32 1 + %ins.2 = insertelement <4 x i16> %ins.1, i16 %arg2, i32 2 + %ins.3 = insertelement <4 x i16> %ins.2, i16 %arg3, i32 3 + + %cast = bitcast <4 x i16> %ins.3 to i64 + %srl = lshr i64 %cast, 32 + %trunc = trunc i64 %srl to i32 + ret i32 %trunc +} diff --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll index e3a6240aac00..fdc1e6abb051 100644 --- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll +++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll @@ -12,6 +12,7 @@ ; GCN-NEXT: amdpal.pipelines: ; GCN-NEXT: - .hardware_stages: ; GCN-NEXT: .cs: +; GCN-NEXT: .entry_point: _amdgpu_cs ; 
GCN-NEXT: .entry_point_symbol: _amdgpu_cs_main ; GCN-NEXT: .scratch_memory_size: 0 ; SI-NEXT: .sgpr_count: 0x11 |
