summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir374
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir402
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir117
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll23
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdpal-cs.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdpal-es.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdpal-gs.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdpal-hs.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdpal-ls.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdpal-vs.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdpal.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll351
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop16.ll328
-rw-r--r--llvm/test/CodeGen/AMDGPU/dead_copy.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/elf-notes.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel-args.ll733
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i16.ll40
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i8.ll1300
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i16.ll79
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i8.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll364
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll667
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll258
-rw-r--r--llvm/test/CodeGen/AMDGPU/memmove-var-size.ll184
-rw-r--r--llvm/test/CodeGen/AMDGPU/min.ll306
-rw-r--r--llvm/test/CodeGen/AMDGPU/minimummaximum.ll84
-rw-r--r--llvm/test/CodeGen/AMDGPU/minmax.ll554
-rw-r--r--llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.ll59
-rw-r--r--llvm/test/CodeGen/AMDGPU/sra.ll67
-rw-r--r--llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll1
36 files changed, 2892 insertions, 3623 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir
new file mode 100644
index 000000000000..23da26d96b62
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir
@@ -0,0 +1,374 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=-real-true16 -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+
+---
+name: fcmp_false_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_false_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(false), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_oeq_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_oeq_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_EQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(oeq), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ogt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ogt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_GT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ogt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_oge_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_oge_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(oge), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_olt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_olt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_LT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(olt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ole_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ole_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_LE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ole), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+---
+name: fcmp_one_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_one_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(one), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ord_s16_vv # exercises floatpred(ord); fcmp_one_s16_vv above covers floatpred(one)
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ord_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_O_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_O_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ord), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_uno_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_uno_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_U_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(uno), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ueq_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ueq_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NLG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ueq), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ugt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ugt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NLE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ugt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_uge_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_uge_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NLT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(uge), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ult_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ult_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ult), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ule_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ule_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NGT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ule), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_une_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_une_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NEQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(une), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_true_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_true_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(true), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir
new file mode 100644
index 000000000000..a7140e6a74fd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir
@@ -0,0 +1,402 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=+real-true16 -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+
+---
+name: fcmp_false_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_false_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(false), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_oeq_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_oeq_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(oeq), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ogt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ogt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_GT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ogt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_oge_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_oge_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_GE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(oge), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_olt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_olt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(olt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ole_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ole_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_LE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ole), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+---
+name: fcmp_one_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_one_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_LG_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(one), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ord_s16_vv # exercises floatpred(ord); fcmp_one_s16_vv above covers floatpred(one)
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ord_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_O_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_O_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ord), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_uno_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_uno_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_U_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(uno), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ueq_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ueq_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NLG_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ueq), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ugt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ugt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NLE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ugt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_uge_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_uge_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NLT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(uge), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ult_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ult_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NGE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ult), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ule_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ule_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NGT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ule), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_une_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_une_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NEQ_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(une), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_true_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_true_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(true), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
index 5c387baf4675..85b1d402146c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
@@ -1,7 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
---
name: fcmp_false_s16_vv
@@ -31,15 +30,6 @@ body: |
; WAVE32-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]]
; WAVE32-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
;
- ; GFX11-LABEL: name: fcmp_false_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
- ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]]
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -72,13 +62,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_oeq_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_EQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -111,13 +94,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ogt_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_GT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -150,13 +126,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_oge_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -189,13 +158,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_olt_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_LT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -228,13 +190,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ole_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_LE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -266,13 +221,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_one_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -305,13 +253,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ord_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -344,13 +285,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_uno_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_U_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -383,13 +317,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ueq_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NLG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -422,13 +349,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ugt_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NLE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -461,13 +381,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_uge_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NLT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -500,13 +413,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ult_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -539,13 +445,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ule_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NGT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -578,13 +477,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_une_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NEQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -621,15 +513,6 @@ body: |
; WAVE32-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]]
; WAVE32-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
;
- ; GFX11-LABEL: name: fcmp_true_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
- ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]]
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll
new file mode 100644
index 000000000000..090aa067a526
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=irtranslator -o - %s | FileCheck %s
+
+declare ptr @llvm.invariant.start.p5(i64 immarg, ptr addrspace(5) nocapture)
+declare void @llvm.invariant.end.p5(ptr, i64 immarg, ptr addrspace(5) nocapture)
+
+define void @use_invariant_promotable_lds(ptr addrspace(5) %arg, i32 %i) {
+ ; CHECK-LABEL: name: use_invariant_promotable_lds
+ ; CHECK: bb.1.bb:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[DEF]](p0) :: (store (s32) into %ir.tmp)
+ ; CHECK-NEXT: SI_RETURN
+bb:
+ %tmp = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %arg)
+ call void @llvm.invariant.end.p5(ptr %tmp, i64 4, ptr addrspace(5) %arg)
+ store i32 0, ptr %tmp, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
index 0818f607da0a..96775f4763e3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
@@ -8,6 +8,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .cs:
+; GCN-NEXT: .entry_point: _amdgpu_cs
; GCN-NEXT: .entry_point_symbol: cs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
index e37d22c7df37..1379246c3257 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
@@ -7,6 +7,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .es:
+; GCN-NEXT: .entry_point: _amdgpu_es
; GCN-NEXT: .entry_point_symbol: es_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
index d847f75a5c09..1fba34a50094 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
@@ -8,6 +8,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .gs:
+; GCN-NEXT: .entry_point: _amdgpu_gs
; GCN-NEXT: .entry_point_symbol: gs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
index 74f5f440c99d..53c6b95f0735 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
@@ -8,6 +8,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .hs:
+; GCN-NEXT: .entry_point: _amdgpu_hs
; GCN-NEXT: .entry_point_symbol: hs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
index 287cc1201a3c..ebe753134a42 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
@@ -7,6 +7,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .ls:
+; GCN-NEXT: .entry_point: _amdgpu_ls
; GCN-NEXT: .entry_point_symbol: ls_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
index e1767182c359..32f19e2af32e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
@@ -11,6 +11,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .ps:
+; GCN-NEXT: .entry_point: _amdgpu_ps
; GCN-NEXT: .entry_point_symbol: amdpal_psenable
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
index b225d978601a..853d221ee3aa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
@@ -8,6 +8,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .vs:
+; GCN-NEXT: .entry_point: _amdgpu_vs
; GCN-NEXT: .entry_point_symbol: vs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll
index 97fcf0606b5b..171df029615e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal.ll
@@ -86,6 +86,7 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32,
; PAL-NEXT: amdpal.pipelines:
; PAL-NEXT: - .hardware_stages:
; PAL-NEXT: .cs:
+; PAL-NEXT: .entry_point: _amdgpu_cs
; PAL-NEXT: .entry_point_symbol: scratch2_cs
; PAL-NEXT: .scratch_memory_size: 0x10
; PAL-NEXT: .sgpr_count: 0x
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
new file mode 100644
index 000000000000..2c6aabec7633
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+; Make sure stack use isn't introduced for these bitcasts.
+
+define i160 @bitcast_v5i32_to_i160(<5 x i32> %vec) {
+; GFX9-LABEL: bitcast_v5i32_to_i160:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v5i32_to_i160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <5 x i32> %vec to i160
+ ret i160 %bitcast
+}
+
+define i192 @bitcast_v6i32_to_i192(<6 x i32> %vec) {
+; GFX9-LABEL: bitcast_v6i32_to_i192:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v6i32_to_i192:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <6 x i32> %vec to i192
+ ret i192 %bitcast
+}
+
+define i224 @bitcast_v7i32_to_i224(<7 x i32> %vec) {
+; GFX9-LABEL: bitcast_v7i32_to_i224:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v7i32_to_i224:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <7 x i32> %vec to i224
+ ret i224 %bitcast
+}
+
+define i256 @bitcast_v8i32_to_i256(<8 x i32> %vec) {
+; GFX9-LABEL: bitcast_v8i32_to_i256:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v8i32_to_i256:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <8 x i32> %vec to i256
+ ret i256 %bitcast
+}
+
+define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) {
+; GFX9-LABEL: bitcast_i160_to_v5i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i160_to_v5i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i160 %int to <5 x i32>
+ ret <5 x i32> %bitcast
+}
+
+define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) {
+; GFX9-LABEL: bitcast_i192_to_v6i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i192_to_v6i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i192 %int to <6 x i32>
+ ret <6 x i32> %bitcast
+}
+
+define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) {
+; GFX9-LABEL: bitcast_i224_to_v7i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i224_to_v7i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i224 %int to <7 x i32>
+ ret <7 x i32> %bitcast
+}
+
+define <8 x i32> @bitcast_i256_to_v8i32(i256 %int) {
+; GFX9-LABEL: bitcast_i256_to_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i256_to_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i256 %int to <8 x i32>
+ ret <8 x i32> %bitcast
+}
+
+define i192 @bitcast_v3i64_to_i192(<3 x i64> %vec) {
+; GFX9-LABEL: bitcast_v3i64_to_i192:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v3i64_to_i192:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <3 x i64> %vec to i192
+ ret i192 %bitcast
+}
+
+define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) {
+; GFX9-LABEL: bitcast_i192_to_v3i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i192_to_v3i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i192 %int to <3 x i64>
+ ret <3 x i64> %bitcast
+}
+
+define <10 x i16> @bitcast_i160_to_v10i16(i160 %int) {
+; GFX9-LABEL: bitcast_i160_to_v10i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i160_to_v10i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX12-NEXT: v_bfi_b32 v2, 0xffff, v2, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i160 %int to <10 x i16>
+ ret <10 x i16> %bitcast
+}
+
+define i160 @bitcast_v10i16_to_i160(<10 x i16> %vec) {
+; GFX9-LABEL: bitcast_v10i16_to_i160:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v10i16_to_i160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <10 x i16> %vec to i160
+ ret i160 %bitcast
+}
+
+define i12 @bitcast_v2i6_to_i12(<2 x i6> %vec) {
+; GFX9-LABEL: bitcast_v2i6_to_i12:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 6, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v2i6_to_i12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshlrev_b16 v1, 6, v1
+; GFX12-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX12-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <2 x i6> %vec to i12
+ ret i12 %bitcast
+}
+
+define <2 x i6> @bitcast_i12_to_v2i6(i12 %int) {
+; GFX9-LABEL: bitcast_i12_to_v2i6:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, 63, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v0, 6, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 63, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i12_to_v2i6:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b16 v1, 6, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i12 %int to <2 x i6>
+ ret <2 x i6> %bitcast
+}
+
+define i160 @bitcast_v5f32_to_i160(<5 x float> %vec) {
+; GFX9-LABEL: bitcast_v5f32_to_i160:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v5f32_to_i160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <5 x float> %vec to i160
+ ret i160 %bitcast
+}
+
+define <5 x float> @bitcast_i160_to_v5f32(i160 %int) {
+; GFX9-LABEL: bitcast_i160_to_v5f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i160_to_v5f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i160 %int to <5 x float>
+ ret <5 x float> %bitcast
+}
+
+define <6 x float> @bitcast_i192_to_v6f32(i192 %int) {
+; GFX9-LABEL: bitcast_i192_to_v6f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i192_to_v6f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i192 %int to <6 x float>
+ ret <6 x float> %bitcast
+}
+
+define i192 @bitcast_v6f32_to_i192(<6 x float> %vec) {
+; GFX9-LABEL: bitcast_v6f32_to_i192:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v6f32_to_i192:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <6 x float> %vec to i192
+ ret i192 %bitcast
+}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 7eaa52d89b9b..405058b24dcc 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -3091,15 +3091,6 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) {
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
; SDAG-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16
-; SDAG-NEXT: s_mov_b32 s4, s33
-; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
-; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
-; SDAG-NEXT: s_mov_b32 s5, s34
-; SDAG-NEXT: s_mov_b32 s34, s32
-; SDAG-NEXT: s_addk_i32 s32, 0x1800
-; SDAG-NEXT: s_mov_b32 s32, s34
-; SDAG-NEXT: s_mov_b32 s34, s5
-; SDAG-NEXT: s_mov_b32 s33, s4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -3119,17 +3110,8 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_i160:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_mov_b32 s4, s33
-; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
-; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
-; SDAG-NEXT: s_mov_b32 s5, s34
-; SDAG-NEXT: s_mov_b32 s34, s32
-; SDAG-NEXT: s_addk_i32 s32, 0x1000
; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16
-; SDAG-NEXT: s_mov_b32 s32, s34
-; SDAG-NEXT: s_mov_b32 s34, s5
-; SDAG-NEXT: s_mov_b32 s33, s4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 17ab8fc780fb..6bf126af5ade 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -457,58 +457,27 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
+; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR * T0.W, T8.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR * T0.W, T8.Y, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: BCNT_INT T0.Y, PV.W,
+; EG-NEXT: AND_INT * T0.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, T1.W, PV.W,
+; EG-NEXT: BCNT_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16
@@ -601,94 +570,33 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
+; EG-NEXT: ALU 13, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: LSHR * T0.W, T12.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT * T0.W, PV.W,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: LSHR * T0.W, T12.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T0.X, T8.X,
-; EG-NEXT: LSHR * T0.W, T12.Z, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV * T0.X, T9.X,
-; EG-NEXT: LSHR * T0.W, T12.W, literal.x,
+; EG-NEXT: BCNT_INT T0.Z, PS,
+; EG-NEXT: LSHR * T1.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
+; EG-NEXT: BCNT_INT T0.Y, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: BCNT_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.X, T4.X,
-; EG-NEXT: MOV * T0.Z, T8.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32
@@ -837,174 +745,46 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
;
; EG-LABEL: v_ctpop_v16i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 114, @16, KC0[], KC1[]
-; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
+; EG-NEXT: ALU 2, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 25, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: LSHR * T0.W, T20.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT * T0.W, PV.W,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: LSHR * T0.W, T20.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T0.X, T8.X,
-; EG-NEXT: LSHR * T0.W, T20.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV * T0.X, T9.X,
-; EG-NEXT: LSHR * T0.W, T20.W, literal.x,
+; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: LSHR * T0.W, T12.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: BCNT_INT T12.W, PV.W,
+; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: BCNT_INT T12.Z, PS,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, T12.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: BCNT_INT T12.Y, PS,
+; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x,
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.X, T12.X,
-; EG-NEXT: LSHR * T1.W, T21.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV * T0.X, T13.X,
-; EG-NEXT: LSHR * T1.W, T21.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T20.Y, PS, PV.W,
-; EG-NEXT: MOV T13.X, PV.Y,
-; EG-NEXT: MOV * T0.X, T16.X,
-; EG-NEXT: LSHR * T1.W, T21.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: ALU clause starting at 131:
-; EG-NEXT: MOV * T16.X, T1.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV * T0.X, T17.X,
-; EG-NEXT: LSHR * T1.W, T21.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T12.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: BCNT_INT T12.X, PS,
+; EG-NEXT: BCNT_INT T0.Z, PV.Z,
+; EG-NEXT: LSHR T1.W, T0.X, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT T1.W, T21.W, literal.x,
-; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT: AND_INT T0.Z, PV.X, literal.x,
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT: -65536(nan), 16(2.242078e-44)
-; EG-NEXT: LSHR T22.X, PS, literal.x,
-; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W,
+; EG-NEXT: LSHR T13.X, PS, literal.x,
+; EG-NEXT: BCNT_INT T0.Y, PV.W,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: BCNT_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T14.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T17.X, PV.W,
-; EG-NEXT: MOV * T0.X, T4.X,
-; EG-NEXT: MOV * T0.Z, T8.X,
-; EG-NEXT: MOV T20.X, T12.X,
-; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/dead_copy.mir b/llvm/test/CodeGen/AMDGPU/dead_copy.mir
index 2b54c61056a9..5bc42e9c4719 100644
--- a/llvm/test/CodeGen/AMDGPU/dead_copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/dead_copy.mir
@@ -1,4 +1,5 @@
# RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -run-pass=machine-cp -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+# RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -passes=machine-cp | FileCheck -check-prefix=GCN %s
# GCN-LABEL: dead_copy
# GCN: bb.0
diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
index e91bed464136..b205678bd908 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
@@ -66,6 +66,7 @@
; OSABI-PAL-ELF: amdpal.pipelines:
; OSABI-PAL-ELF: - .hardware_stages:
; OSABI-PAL-ELF: .cs:
+; OSABI-PAL-ELF: .entry_point: _amdgpu_cs
; OSABI-PAL-ELF: .entry_point_symbol: elf_notes
; OSABI-PAL-ELF: .scratch_memory_size: 0
; OSABI-PAL-ELF: .sgpr_count: 96
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 8704f4e78044..121891adef18 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1025,74 +1025,67 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
;
; EG-LABEL: v3i16_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @12, KC0[], KC1[]
-; EG-NEXT: TEX 2 @6
-; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
+; EG-NEXT: ALU 0, @10, KC0[], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV * T5.X, 0.0,
-; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 44, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 48, #3
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T5.X, T2.W, PV.W,
-; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV * T5.Z, 0.0,
-; EG-NEXT: LSHR T8.X, T0.W, literal.x,
-; EG-NEXT: LSHL T0.W, T7.X, literal.y,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHL T2.X, T2.W, PV.W,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: MOV * T2.Z, 0.0,
+; EG-NEXT: LSHR T0.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v3i16_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @12, KC0[], KC1[]
-; CM-NEXT: TEX 2 @6
-; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
+; CM-NEXT: TEX 0 @8
+; CM-NEXT: ALU 13, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT MSKOR T1.XW, T2.X
+; CM-NEXT: ALU 1, @27, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @10
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
-; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
-; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
+; CM-NEXT: Fetch clause starting at 8:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3
+; CM-NEXT: Fetch clause starting at 10:
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
; CM-NEXT: ALU clause starting at 12:
-; CM-NEXT: MOV * T5.X, 0.0,
+; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
+; CM-NEXT: AND_INT T0.Z, T1.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
-; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
-; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
+; CM-NEXT: LSHL T1.X, PV.Z, PV.W,
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: MOV T5.Y, 0.0,
-; CM-NEXT: MOV * T5.Z, 0.0,
-; CM-NEXT: LSHL T0.Z, T7.X, literal.x,
-; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T1.Y, 0.0,
+; CM-NEXT: MOV * T1.Z, 0.0,
+; CM-NEXT: LSHR * T2.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
+; CM-NEXT: ALU clause starting at 27:
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <3 x i16> %in, ptr addrspace(1) %out, align 4
@@ -2676,205 +2669,47 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
;
; EG-LABEL: v8i16_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 1, @36, KC0[], KC1[]
-; EG-NEXT: TEX 0 @20
-; EG-NEXT: ALU 5, @38, KC0[], KC1[]
-; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 5, @44, KC0[], KC1[]
-; EG-NEXT: TEX 0 @24
-; EG-NEXT: ALU 5, @50, KC0[], KC1[]
-; EG-NEXT: TEX 0 @26
-; EG-NEXT: ALU 5, @56, KC0[], KC1[]
-; EG-NEXT: TEX 0 @28
-; EG-NEXT: ALU 5, @62, KC0[], KC1[]
-; EG-NEXT: TEX 0 @30
-; EG-NEXT: ALU 5, @68, KC0[], KC1[]
-; EG-NEXT: TEX 0 @32
-; EG-NEXT: ALU 5, @74, KC0[], KC1[]
-; EG-NEXT: TEX 0 @34
-; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 4, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 20:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
-; EG-NEXT: Fetch clause starting at 26:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
-; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
-; EG-NEXT: Fetch clause starting at 30:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
-; EG-NEXT: Fetch clause starting at 32:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
-; EG-NEXT: Fetch clause starting at 34:
-; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
-; EG-NEXT: ALU clause starting at 36:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T7.X, 0.0,
-; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: LSHL T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 44:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 50:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 56:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 62:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 68:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 74:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 80:
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
-; EG-NEXT: AND_INT * T1.W, T7.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T7.X, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.X,
-; EG-NEXT: MOV * T7.W, T3.X,
-; EG-NEXT: MOV * T7.Y, T5.X,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 52, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 54, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 62, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: MOV T1.Y, T2.X,
+; EG-NEXT: MOV * T1.Z, T0.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, T3.X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v8i16_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 1, @36, KC0[], KC1[]
-; CM-NEXT: TEX 0 @20
-; CM-NEXT: ALU 5, @38, KC0[], KC1[]
-; CM-NEXT: TEX 0 @22
-; CM-NEXT: ALU 5, @44, KC0[], KC1[]
-; CM-NEXT: TEX 0 @24
-; CM-NEXT: ALU 5, @50, KC0[], KC1[]
-; CM-NEXT: TEX 0 @26
-; CM-NEXT: ALU 5, @56, KC0[], KC1[]
-; CM-NEXT: TEX 0 @28
-; CM-NEXT: ALU 5, @62, KC0[], KC1[]
-; CM-NEXT: TEX 0 @30
-; CM-NEXT: ALU 5, @68, KC0[], KC1[]
-; CM-NEXT: TEX 0 @32
-; CM-NEXT: ALU 5, @74, KC0[], KC1[]
-; CM-NEXT: TEX 0 @34
-; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
+; CM-NEXT: ALU 0, @14, KC0[], KC1[]
+; CM-NEXT: TEX 3 @6
+; CM-NEXT: ALU 4, @15, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
-; CM-NEXT: Fetch clause starting at 20:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
-; CM-NEXT: Fetch clause starting at 22:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
-; CM-NEXT: Fetch clause starting at 24:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
-; CM-NEXT: Fetch clause starting at 26:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
-; CM-NEXT: Fetch clause starting at 28:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
-; CM-NEXT: Fetch clause starting at 30:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
-; CM-NEXT: Fetch clause starting at 32:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
-; CM-NEXT: Fetch clause starting at 34:
-; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
-; CM-NEXT: ALU clause starting at 36:
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: MOV * T7.X, 0.0,
-; CM-NEXT: ALU clause starting at 38:
-; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 44:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 50:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 56:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 62:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 68:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 74:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.Z,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 80:
-; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: AND_INT * T0.W, T7.X, literal.z,
-; CM-NEXT: 2(2.802597e-45), -65536(nan)
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.X,
-; CM-NEXT: MOV * T7.W, T3.X,
-; CM-NEXT: MOV * T7.Y, T5.X,
+; CM-NEXT: Fetch clause starting at 6:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 52, #3
+; CM-NEXT: VTX_READ_16 T2.X, T0.X, 54, #3
+; CM-NEXT: VTX_READ_16 T3.X, T0.X, 62, #3
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
+; CM-NEXT: ALU clause starting at 14:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 15:
+; CM-NEXT: MOV T1.Y, T2.X,
+; CM-NEXT: MOV * T1.Z, T0.X, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV * T1.W, T3.X,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <8 x i16> %in, ptr addrspace(1) %out
ret void
@@ -3618,392 +3453,68 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
;
; EG-LABEL: v16i16_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 1, @68, KC0[], KC1[]
-; EG-NEXT: TEX 0 @36
-; EG-NEXT: ALU 5, @70, KC0[], KC1[]
-; EG-NEXT: TEX 0 @38
-; EG-NEXT: ALU 5, @76, KC0[], KC1[]
-; EG-NEXT: TEX 0 @40
-; EG-NEXT: ALU 5, @82, KC0[], KC1[]
-; EG-NEXT: TEX 0 @42
-; EG-NEXT: ALU 5, @88, KC0[], KC1[]
-; EG-NEXT: TEX 0 @44
-; EG-NEXT: ALU 5, @94, KC0[], KC1[]
-; EG-NEXT: TEX 0 @46
-; EG-NEXT: ALU 5, @100, KC0[], KC1[]
-; EG-NEXT: TEX 0 @48
-; EG-NEXT: ALU 5, @106, KC0[], KC1[]
-; EG-NEXT: TEX 0 @50
-; EG-NEXT: ALU 5, @112, KC0[], KC1[]
-; EG-NEXT: TEX 0 @52
-; EG-NEXT: ALU 5, @118, KC0[], KC1[]
-; EG-NEXT: TEX 0 @54
-; EG-NEXT: ALU 5, @124, KC0[], KC1[]
-; EG-NEXT: TEX 0 @56
-; EG-NEXT: ALU 5, @130, KC0[], KC1[]
-; EG-NEXT: TEX 0 @58
-; EG-NEXT: ALU 5, @136, KC0[], KC1[]
-; EG-NEXT: TEX 0 @60
-; EG-NEXT: ALU 5, @142, KC0[], KC1[]
-; EG-NEXT: TEX 0 @62
-; EG-NEXT: ALU 5, @148, KC0[], KC1[]
-; EG-NEXT: TEX 0 @64
-; EG-NEXT: ALU 5, @154, KC0[], KC1[]
-; EG-NEXT: TEX 0 @66
-; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
+; EG-NEXT: ALU 0, @22, KC0[], KC1[]
+; EG-NEXT: TEX 7 @6
+; EG-NEXT: ALU 10, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 36:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
-; EG-NEXT: Fetch clause starting at 38:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
-; EG-NEXT: Fetch clause starting at 40:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
-; EG-NEXT: Fetch clause starting at 42:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
-; EG-NEXT: Fetch clause starting at 44:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
-; EG-NEXT: Fetch clause starting at 46:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
-; EG-NEXT: Fetch clause starting at 48:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
-; EG-NEXT: Fetch clause starting at 50:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
-; EG-NEXT: Fetch clause starting at 52:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
-; EG-NEXT: Fetch clause starting at 54:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
-; EG-NEXT: Fetch clause starting at 56:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
-; EG-NEXT: Fetch clause starting at 58:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
-; EG-NEXT: Fetch clause starting at 60:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
-; EG-NEXT: Fetch clause starting at 62:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
-; EG-NEXT: Fetch clause starting at 64:
-; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
-; EG-NEXT: Fetch clause starting at 66:
-; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
-; EG-NEXT: ALU clause starting at 68:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T11.X, 0.0,
-; EG-NEXT: ALU clause starting at 70:
-; EG-NEXT: LSHL T0.W, T12.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 76:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T7.X,
-; EG-NEXT: ALU clause starting at 82:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T7.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T9.X,
-; EG-NEXT: ALU clause starting at 88:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 94:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 100:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T7.X,
-; EG-NEXT: ALU clause starting at 106:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T7.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T9.X,
-; EG-NEXT: ALU clause starting at 112:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 118:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 124:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T6.X,
-; EG-NEXT: ALU clause starting at 130:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T6.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: ALU clause starting at 136:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 142:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T12.Z, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 148:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T12.X, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.X,
-; EG-NEXT: MOV * T0.Y, T6.X,
-; EG-NEXT: ALU clause starting at 154:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T13.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T11.Z, PV.W, PS,
-; EG-NEXT: MOV T6.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: ALU clause starting at 160:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 84, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 86, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 94, #3
+; EG-NEXT: VTX_READ_16 T4.X, T0.X, 78, #3
+; EG-NEXT: VTX_READ_16 T5.X, T0.X, 76, #3
+; EG-NEXT: VTX_READ_16 T6.X, T0.X, 92, #3
+; EG-NEXT: VTX_READ_16 T7.X, T0.X, 68, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 70, #3
+; EG-NEXT: ALU clause starting at 22:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 23:
+; EG-NEXT: MOV T1.Y, T2.X,
+; EG-NEXT: MOV * T7.Y, T0.X,
+; EG-NEXT: MOV * T1.Z, T6.X,
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T7.Z, T5.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T14.X, PV.W, literal.x,
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
-; EG-NEXT: AND_INT * T1.W, T11.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T11.X, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.X,
-; EG-NEXT: MOV * T12.W, T3.X,
-; EG-NEXT: MOV T12.Y, T5.X,
-; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T11.Y, T9.X,
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: MOV T7.W, T4.X,
+; EG-NEXT: MOV * T1.W, T3.X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v16i16_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 1, @68, KC0[], KC1[]
-; CM-NEXT: TEX 0 @36
-; CM-NEXT: ALU 5, @70, KC0[], KC1[]
-; CM-NEXT: TEX 0 @38
-; CM-NEXT: ALU 5, @76, KC0[], KC1[]
-; CM-NEXT: TEX 0 @40
-; CM-NEXT: ALU 5, @82, KC0[], KC1[]
-; CM-NEXT: TEX 0 @42
-; CM-NEXT: ALU 5, @88, KC0[], KC1[]
-; CM-NEXT: TEX 0 @44
-; CM-NEXT: ALU 5, @94, KC0[], KC1[]
-; CM-NEXT: TEX 0 @46
-; CM-NEXT: ALU 5, @100, KC0[], KC1[]
-; CM-NEXT: TEX 0 @48
-; CM-NEXT: ALU 5, @106, KC0[], KC1[]
-; CM-NEXT: TEX 0 @50
-; CM-NEXT: ALU 5, @112, KC0[], KC1[]
-; CM-NEXT: TEX 0 @52
-; CM-NEXT: ALU 5, @118, KC0[], KC1[]
-; CM-NEXT: TEX 0 @54
-; CM-NEXT: ALU 5, @124, KC0[], KC1[]
-; CM-NEXT: TEX 0 @56
-; CM-NEXT: ALU 5, @130, KC0[], KC1[]
-; CM-NEXT: TEX 0 @58
-; CM-NEXT: ALU 5, @136, KC0[], KC1[]
-; CM-NEXT: TEX 0 @60
-; CM-NEXT: ALU 5, @142, KC0[], KC1[]
-; CM-NEXT: TEX 0 @62
-; CM-NEXT: ALU 5, @148, KC0[], KC1[]
-; CM-NEXT: TEX 0 @64
-; CM-NEXT: ALU 5, @154, KC0[], KC1[]
-; CM-NEXT: TEX 0 @66
-; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
+; CM-NEXT: ALU 0, @22, KC0[], KC1[]
+; CM-NEXT: TEX 7 @6
+; CM-NEXT: ALU 11, @23, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T2.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 36:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
-; CM-NEXT: Fetch clause starting at 38:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
-; CM-NEXT: Fetch clause starting at 40:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
-; CM-NEXT: Fetch clause starting at 42:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
-; CM-NEXT: Fetch clause starting at 44:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
-; CM-NEXT: Fetch clause starting at 46:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
-; CM-NEXT: Fetch clause starting at 48:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
-; CM-NEXT: Fetch clause starting at 50:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
-; CM-NEXT: Fetch clause starting at 52:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
-; CM-NEXT: Fetch clause starting at 54:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
-; CM-NEXT: Fetch clause starting at 56:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
-; CM-NEXT: Fetch clause starting at 58:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
-; CM-NEXT: Fetch clause starting at 60:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
-; CM-NEXT: Fetch clause starting at 62:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
-; CM-NEXT: Fetch clause starting at 64:
-; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
-; CM-NEXT: Fetch clause starting at 66:
-; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
-; CM-NEXT: ALU clause starting at 68:
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: MOV * T11.X, 0.0,
-; CM-NEXT: ALU clause starting at 70:
-; CM-NEXT: LSHL T0.Z, T12.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 76:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T7.X,
-; CM-NEXT: ALU clause starting at 82:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T7.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T9.X,
-; CM-NEXT: ALU clause starting at 88:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T9.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 94:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 100:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T7.X,
-; CM-NEXT: ALU clause starting at 106:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T7.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T9.X,
-; CM-NEXT: ALU clause starting at 112:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T9.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 118:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 124:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T6.X,
-; CM-NEXT: ALU clause starting at 130:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T6.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T8.X,
-; CM-NEXT: ALU clause starting at 136:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T8.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 142:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.Z,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 148:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.X,
-; CM-NEXT: MOV * T0.Y, T6.X,
-; CM-NEXT: ALU clause starting at 154:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T13.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W,
-; CM-NEXT: MOV T6.X, PV.Z,
-; CM-NEXT: MOV * T0.Y, T8.X,
-; CM-NEXT: ALU clause starting at 160:
+; CM-NEXT: Fetch clause starting at 6:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 84, #3
+; CM-NEXT: VTX_READ_16 T2.X, T0.X, 86, #3
+; CM-NEXT: VTX_READ_16 T3.X, T0.X, 78, #3
+; CM-NEXT: VTX_READ_16 T4.X, T0.X, 94, #3
+; CM-NEXT: VTX_READ_16 T5.X, T0.X, 76, #3
+; CM-NEXT: VTX_READ_16 T6.X, T0.X, 92, #3
+; CM-NEXT: VTX_READ_16 T7.X, T0.X, 68, #3
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 70, #3
+; CM-NEXT: ALU clause starting at 22:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 23:
+; CM-NEXT: MOV * T1.Y, T2.X,
+; CM-NEXT: MOV T7.Y, T0.X,
+; CM-NEXT: MOV T1.Z, T6.X, BS:VEC_120/SCL_212
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR * T13.X, PV.W, literal.x,
+; CM-NEXT: LSHR T0.X, PV.W, literal.x,
+; CM-NEXT: MOV T7.Z, T5.X,
+; CM-NEXT: MOV * T1.W, T4.X, BS:VEC_120/SCL_212
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV * T7.W, T3.X,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: AND_INT * T0.W, T11.X, literal.z,
-; CM-NEXT: 2(2.802597e-45), -65536(nan)
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W,
-; CM-NEXT: MOV T8.X, PV.X,
-; CM-NEXT: MOV * T12.W, T3.X,
-; CM-NEXT: MOV T12.Y, T5.X,
-; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212
-; CM-NEXT: MOV * T11.Y, T9.X,
entry:
store <16 x i16> %in, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 2afac4e90aa4..458afa4d6aad 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -212,38 +212,32 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
;
; EG-LABEL: constant_load_v3i16:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 2 @6
-; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T5.X, T2.W, PV.W,
-; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV * T5.Z, 0.0,
-; EG-NEXT: LSHR T8.X, T0.W, literal.x,
-; EG-NEXT: LSHL T0.W, T7.X, literal.y,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHL T2.X, T2.W, PV.W,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: MOV * T2.Z, 0.0,
+; EG-NEXT: LSHR T0.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_load_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b945c7c3def6..c608bef3f726 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -9491,50 +9491,24 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_zextload_v4i8_to_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
+; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T7.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T7.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: BFE_UINT * T4.Y, T4.X, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
+; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
@@ -9633,56 +9607,23 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_sextload_v4i8_to_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
+; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, T4.X, literal.x,
+; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T7.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
@@ -9800,80 +9741,27 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_zextload_v8i8_to_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
+; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
+; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T11.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T11.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T11.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
+; EG-NEXT: BFE_UINT * T6.W, T5.Y, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T12.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T12.X, T8.X,
-; EG-NEXT: MOV * T12.Z, T4.X,
+; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W,
+; EG-NEXT: AND_INT * T6.Z, T5.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; EG-NEXT: AND_INT T6.X, T5.X, literal.x,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
@@ -10017,93 +9905,28 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_sextload_v8i8_to_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
+; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
+; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T11.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T5.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
+; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T11.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T12.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T12.X, T8.X,
-; EG-NEXT: MOV * T12.Z, T4.X,
+; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
@@ -10296,146 +10119,37 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_zextload_v16i8_to_v16i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 103, @12, KC0[], KC1[]
-; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT T0.W, T19.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T12.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.Y, literal.x,
+; EG-NEXT: BFE_UINT * T8.W, T7.Y, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T20.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T19.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.W, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 116:
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR T0.W, T19.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T21.X, PS, literal.x,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00)
-; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T19.W, PV.W, PS,
+; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W,
+; EG-NEXT: AND_INT T8.Z, T7.Y, literal.y,
+; EG-NEXT: BFE_UINT * T9.W, T7.W, literal.x, T0.W,
+; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; EG-NEXT: AND_INT T8.X, T7.X, literal.x,
+; EG-NEXT: BFE_UINT T9.Y, T7.Z, literal.y, T0.W,
+; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T9.Z, T7.W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T9.X, T7.Z, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: LSHR * T10.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T20.X, T16.X,
-; EG-NEXT: MOV * T20.Z, T12.X,
-; EG-NEXT: MOV T19.X, T8.X,
-; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
@@ -10683,173 +10397,38 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v16i8_to_v16i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 104, @12, KC0[], KC1[]
-; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
-; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T12.X,
-; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T20.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, T8.X,
-; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT * T8.Z, T7.Y, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
+; EG-NEXT: BFE_INT T8.X, T7.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T9.Z, T7.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T7.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.Z, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: ALU clause starting at 117:
-; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
+; EG-NEXT: BFE_INT T9.X, T7.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T7.W, literal.x,
+; EG-NEXT: BFE_INT T8.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR T0.W, T19.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T21.X, PS, literal.x,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.y,
+; EG-NEXT: LSHR T1.Z, T7.Z, literal.y,
+; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T19.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T20.X, T16.X,
-; EG-NEXT: MOV * T20.Z, T12.X,
-; EG-NEXT: MOV T19.X, T8.X,
-; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T10.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
@@ -11194,276 +10773,58 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_zextload_v32i8_to_v32i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @10
-; EG-NEXT: ALU 103, @16, KC0[], KC1[]
-; EG-NEXT: ALU 104, @120, KC0[], KC1[]
-; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @8
+; EG-NEXT: ALU 37, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
-; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
-; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T35.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: AND_INT T0.W, T37.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T37.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.X, literal.x,
+; EG-NEXT: BFE_UINT * T13.W, T11.Y, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T12.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T36.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T37.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.W, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 120:
-; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T37.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T32.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.X, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, T33.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T33.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
-; EG-NEXT: MOV T33.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T28.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, T29.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T38.W, PV.W, PS,
-; EG-NEXT: MOV T29.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T24.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, T25.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W,
+; EG-NEXT: AND_INT T13.Z, T11.Y, literal.y,
+; EG-NEXT: BFE_UINT * T14.W, T11.W, literal.x, T0.W,
+; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; EG-NEXT: AND_INT T13.X, T11.X, literal.x,
+; EG-NEXT: BFE_UINT T14.Y, T11.Z, literal.y, T0.W,
+; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T14.Z, T11.W, literal.x,
+; EG-NEXT: BFE_UINT * T15.W, T12.Y, literal.y, T0.W,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: AND_INT T14.X, T11.Z, literal.x,
+; EG-NEXT: BFE_UINT T15.Y, T12.X, literal.y, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T25.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
-; EG-NEXT: MOV T25.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T20.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.W, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T20.X, PV.W,
-; EG-NEXT: ALU clause starting at 225:
-; EG-NEXT: MOV T0.Y, T20.X,
-; EG-NEXT: LSHL * T1.W, T35.W, literal.x,
+; EG-NEXT: LSHR T16.X, PV.W, literal.x,
+; EG-NEXT: AND_INT T15.Z, T12.Y, literal.y,
+; EG-NEXT: BFE_UINT T17.W, T12.W, literal.z, T0.W,
+; EG-NEXT: AND_INT * T15.X, T12.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43)
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, T21.X,
-; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T21.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: BFE_UINT T17.Y, T12.Z, literal.x, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: AND_INT T17.Z, T12.W, literal.y,
+; EG-NEXT: AND_INT * T17.X, T12.Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.W, T35.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T41.X, PS, literal.x,
-; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.z,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44)
-; EG-NEXT: LSHR T42.X, PS, literal.x,
-; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR * T18.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T21.X, PV.W,
-; EG-NEXT: MOV * T36.X, T16.X,
-; EG-NEXT: MOV * T36.Z, T12.X,
-; EG-NEXT: MOV T37.X, T8.X,
-; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T38.X, T32.X,
-; EG-NEXT: MOV * T38.Z, T28.X,
-; EG-NEXT: MOV T35.X, T24.X,
-; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i16:
; GFX12: ; %bb.0:
@@ -11919,331 +11280,60 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v32i8_to_v32i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @10
-; EG-NEXT: ALU 104, @16, KC0[], KC1[]
-; EG-NEXT: ALU 104, @121, KC0[], KC1[]
-; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @8
+; EG-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
-; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
-; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T35.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
-; EG-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T12.X,
-; EG-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T36.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, T8.X,
-; EG-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.Z, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: ALU clause starting at 121:
-; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.W, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T37.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, T32.X,
-; EG-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, T33.X,
-; EG-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T33.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
-; EG-NEXT: MOV T33.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T28.X,
-; EG-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, T29.X,
-; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 226:
-; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, T0.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T38.W, PV.W, PS,
-; EG-NEXT: MOV T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, T24.X,
-; EG-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHR T14.X, PV.W, literal.x,
+; EG-NEXT: BFE_INT * T15.Z, T11.Y, 0.0, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: BFE_INT T15.X, T11.X, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Y, T12.W, literal.x,
+; EG-NEXT: BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T0.W, T12.Y, literal.x,
+; EG-NEXT: LSHR * T1.W, T11.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, T25.X,
-; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T25.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.Z, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
-; EG-NEXT: MOV T25.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T20.X,
-; EG-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.X, T11.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T1.Y, T11.W, literal.x,
+; EG-NEXT: BFE_INT T17.Z, T12.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T15.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
+; EG-NEXT: BFE_INT T17.X, T12.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T15.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T18.Z, T12.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.W, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T11.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, T21.X,
-; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T21.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, T35.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T41.X, PS, literal.x,
-; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT: LSHL T0.W, PV.W, literal.z,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T42.X, PS, literal.x,
-; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T21.X, PV.W,
-; EG-NEXT: MOV * T36.X, T16.X,
-; EG-NEXT: MOV * T36.Z, T12.X,
-; EG-NEXT: MOV T37.X, T8.X,
-; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T38.X, T32.X,
-; EG-NEXT: MOV * T38.Z, T28.X,
-; EG-NEXT: MOV T35.X, T24.X,
-; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T18.X, T12.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T12.X, literal.x,
+; EG-NEXT: BFE_INT T17.W, T0.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
+; EG-NEXT: LSHR T11.X, PS, literal.x,
+; EG-NEXT: BFE_INT T17.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.Z, T12.Z, literal.y,
+; EG-NEXT: BFE_INT T18.W, T0.Y, 0.0, literal.y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T12.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i16:
; GFX12: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 8589158f11a7..573338231bd5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -254,74 +254,63 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
;
; EG-LABEL: global_load_v3i16:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 2 @6
-; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T5.X, T2.W, PV.W,
-; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV * T5.Z, 0.0,
-; EG-NEXT: LSHR T8.X, T0.W, literal.x,
-; EG-NEXT: LSHL T0.W, T7.X, literal.y,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHL T2.X, T2.W, PV.W,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: MOV * T2.Z, 0.0,
+; EG-NEXT: LSHR T0.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_v3i16:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 2 @6
-; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
+; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 1 @6
+; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1
-; CM-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1
-; CM-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
-; CM-NEXT: ALU clause starting at 12:
-; CM-NEXT: MOV * T5.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 13:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; CM-NEXT: ALU clause starting at 10:
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
+; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
+; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
-; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
-; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
+; CM-NEXT: LSHL T2.X, PV.Z, PV.W,
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: MOV T5.Y, 0.0,
-; CM-NEXT: MOV * T5.Z, 0.0,
-; CM-NEXT: LSHL T0.Z, T7.X, literal.x,
-; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T2.Y, 0.0,
+; CM-NEXT: MOV * T2.Z, 0.0,
+; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
+; CM-NEXT: LSHR * T3.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <3 x i16>, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index fb34b5e1f3af..896e60900c74 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -916,38 +916,22 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%load = load <32 x i8>, ptr addrspace(1) %in
%ext = sext <32 x i8> %load to <32 x i16>
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index e8744c7828d4..2b10d469acf5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -6,76 +6,37 @@
define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX12-LABEL: buffer_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-NEXT: s_mov_b32 s12, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s12
+; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr9
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
+; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT: ; implicit-def: $vgpr8
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_3
-; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
%val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}
@@ -86,77 +47,38 @@ entry:
define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX12-LABEL: buffer_last_use_load_1:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_mov_b32 s12, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s12
+; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
-; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT: scratch_store_b128 off, v[1:4], off offset:32
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[6:7], off, off offset:40
-; GFX12-NEXT: scratch_load_b32 v5, off, off offset:36
-; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-NEXT: scratch_store_b128 off, v[8:11], off
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[2:3], off, off offset:8
-; GFX12-NEXT: scratch_load_b32 v1, off, off offset:4
-; GFX12-NEXT: v_mov_b32_e32 v8, s6
-; GFX12-NEXT: v_lshl_add_u32 v9, v0, 2, s0
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, s1
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v5
-; GFX12-NEXT: v_readfirstlane_b32 s5, v6
-; GFX12-NEXT: v_readfirstlane_b32 s6, v7
-; GFX12-NEXT: v_readfirstlane_b32 s7, v8
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v0, v9, s[4:7], null offen th:TH_LOAD_LU
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8
-; GFX12-NEXT: ; implicit-def: $vgpr9
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v5, s8
-; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
+; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v0, v5, s[4:7], null offen
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr0
-; GFX12-NEXT: ; implicit-def: $vgpr5
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_3
-; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -169,76 +91,37 @@ entry:
define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX12-LABEL: buffer_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-NEXT: s_mov_b32 s12, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s12
+; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr9
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
+; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT: ; implicit-def: $vgpr8
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_3
-; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(7) %in, !amdgpu.last.use !{}
@@ -249,76 +132,37 @@ entry:
define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX12-LABEL: buffer_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-NEXT: s_mov_b32 s12, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s12
+; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr9
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: .LBB3_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
+; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT: ; implicit-def: $vgpr8
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_3
-; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
%val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index a5f6c2fe5d26..a62910e4e571 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -13,30 +13,32 @@
define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX9-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10
; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
-; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15
-; GFX9-SDAG-NEXT: s_mov_b32 s15, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s14, s7
-; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT: s_mov_b32 s12, s5
-; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11]
-; GFX9-SDAG-NEXT: s_mov_b32 s13, s6
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s10
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s7
-; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s5
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s6
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -72,68 +74,31 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX940-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX940-SDAG: ; %bb.0: ; %entry
; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
-; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30
+; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x10
+; GFX940-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX940-SDAG-NEXT: s_mov_b32 s7, s12
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1
-; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40
-; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1
-; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0
-; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec
-; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4)
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11
-; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13
-; GFX940-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen nt
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_1
-; GFX940-SDAG-NEXT: ; %bb.2:
-; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3]
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec
+; GFX940-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX940-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX940-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX940-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX940-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX940-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
+; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
+; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GFX940-SDAG-NEXT: s_mov_b32 s5, s12
+; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX940-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX940-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX940-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX940-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX940-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 nt sc1
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_3
-; GFX940-SDAG-NEXT: ; %bb.4:
+; GFX940-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 nt sc1
; GFX940-SDAG-NEXT: s_endpgm
;
; GFX940-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -169,31 +134,34 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_clause 0x1
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10
; GFX10-SDAG-NEXT: s_mov_b32 s10, 0
-; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15
+; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_mov_b32 s13, s10
-; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-SDAG-NEXT: s_mov_b32 s12, s7
-; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11]
-; GFX10-SDAG-NEXT: s_mov_b32 s12, s5
-; GFX10-SDAG-NEXT: s_mov_b32 s13, s6
-; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen slc
+; GFX10-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-SDAG-NEXT: s_mov_b32 s12, s1
+; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
+; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
-; GFX10-SDAG-NEXT: s_mov_b32 s9, s10
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-SDAG-NEXT: s_mov_b32 s8, s7
-; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GFX10-SDAG-NEXT: s_mov_b32 s8, s5
-; GFX10-SDAG-NEXT: s_mov_b32 s9, s6
+; GFX10-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX10-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX10-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -229,69 +197,37 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
;
; GFX11-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_clause 0x2
+; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX11-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s12
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
+; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0
+; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen slc dlc
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen glc slc dlc
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_3
-; GFX11-SDAG-NEXT: ; %bb.4:
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -330,76 +266,37 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
;
; GFX12-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_clause 0x2
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_mov_b32 s7, s12
+; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
+; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-SDAG-NEXT: ; %bb.2:
-; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_3
-; GFX12-SDAG-NEXT: ; %bb.4:
+; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -444,30 +341,32 @@ entry:
define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX9-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10
; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
-; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15
-; GFX9-SDAG-NEXT: s_mov_b32 s15, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s14, s7
-; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT: s_mov_b32 s12, s5
-; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11]
-; GFX9-SDAG-NEXT: s_mov_b32 s13, s6
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s10
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s7
-; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s5
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s6
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -503,68 +402,31 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX940-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX940-SDAG: ; %bb.0: ; %entry
; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
-; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30
+; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x10
+; GFX940-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX940-SDAG-NEXT: s_mov_b32 s7, s12
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1
-; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40
-; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1
-; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0
-; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec
-; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4)
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11
-; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13
-; GFX940-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen sc0 sc1
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_1
-; GFX940-SDAG-NEXT: ; %bb.2:
-; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3]
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec
+; GFX940-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX940-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX940-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX940-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX940-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX940-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
+; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GFX940-SDAG-NEXT: s_mov_b32 s5, s12
+; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX940-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX940-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX940-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX940-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX940-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 sc1
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_3
-; GFX940-SDAG-NEXT: ; %bb.4:
+; GFX940-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
; GFX940-SDAG-NEXT: s_endpgm
;
; GFX940-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -600,31 +462,34 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_clause 0x1
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10
; GFX10-SDAG-NEXT: s_mov_b32 s10, 0
-; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15
+; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_mov_b32 s13, s10
-; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-SDAG-NEXT: s_mov_b32 s12, s7
-; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11]
-; GFX10-SDAG-NEXT: s_mov_b32 s12, s5
-; GFX10-SDAG-NEXT: s_mov_b32 s13, s6
-; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc dlc
+; GFX10-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-SDAG-NEXT: s_mov_b32 s12, s1
+; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
+; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
-; GFX10-SDAG-NEXT: s_mov_b32 s9, s10
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-SDAG-NEXT: s_mov_b32 s8, s7
-; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GFX10-SDAG-NEXT: s_mov_b32 s8, s5
-; GFX10-SDAG-NEXT: s_mov_b32 s9, s6
+; GFX10-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX10-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX10-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -660,69 +525,37 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
;
; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_clause 0x2
+; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX11-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s12
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
+; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0
+; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen glc dlc
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen dlc
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_3
-; GFX11-SDAG-NEXT: ; %bb.4:
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -761,77 +594,37 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
;
; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_clause 0x2
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_mov_b32 s7, s12
+; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
+; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-SDAG-NEXT: ; %bb.2:
-; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_3
-; GFX12-SDAG-NEXT: ; %bb.4:
+; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index ffe9e06c04ae..5a9f53ec0077 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -330,17 +330,17 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -681,13 +681,25 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) [[TMP15]], align 2
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i64 [[TMP16]], ptr addrspace(1) [[TMP17]], align 2
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
+; OPT-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
+; OPT-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
+; OPT-NEXT: [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 2
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
+; OPT-NEXT: store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
@@ -731,13 +743,17 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -754,13 +770,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -804,13 +824,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -854,13 +878,17 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 4
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -904,13 +932,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -958,17 +990,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1028,17 +1060,17 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1063,17 +1095,17 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1098,17 +1130,17 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1133,17 +1165,17 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1168,17 +1200,17 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1693,10 +1725,10 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
+; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
-; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
@@ -1708,17 +1740,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
+; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
-; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
@@ -1738,17 +1770,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
+; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
-; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
@@ -1781,10 +1813,10 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
+; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
-; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
@@ -1796,17 +1828,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
+; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
-; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
@@ -1826,17 +1858,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
+; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
-; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
@@ -1871,20 +1903,20 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -1896,7 +1928,7 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -1918,11 +1950,11 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -1930,10 +1962,10 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
-; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
@@ -1965,20 +1997,20 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
-; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -1990,7 +2022,7 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -2012,11 +2044,11 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -2024,10 +2056,10 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
-; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
@@ -2058,20 +2090,20 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 256
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP2]], align 1
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP5]], align 1
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
-; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -2083,7 +2115,7 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -2104,11 +2136,11 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 16
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP9]], align 1
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -2116,10 +2148,10 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP12]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
-; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index a68d2e575607..bc8bcc622810 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -306,10 +306,10 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-LABEL: memmove_p0_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v7, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v8, 0
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; CHECK-NEXT: v_and_b32_e32 v5, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
@@ -338,15 +338,15 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[13:14], v4
-; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT: ds_read_b128 v[13:16], v4
+; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
-; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[13:14]
-; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 8
+; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16]
+; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB2_5
@@ -355,7 +355,7 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB2_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -414,26 +414,26 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB2_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -8
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; CHECK-NEXT: v_add3_u32 v2, v3, v2, -8
+; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[3:4], v2
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v5, -8
-; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v9, vcc_lo, v0, v5
-; CHECK-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v6, vcc_lo
-; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
-; CHECK-NEXT: v_mov_b32_e32 v5, v7
-; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
-; CHECK-NEXT: v_mov_b32_e32 v6, v8
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
+; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
+; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[3:4]
+; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB2_15
; CHECK-NEXT: .LBB2_16: ; %Flow36
@@ -1043,9 +1043,9 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-LABEL: memmove_p1_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1056,16 +1056,16 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[10:11], v9
-; CHECK-NEXT: v_add_co_u32 v12, vcc_lo, v0, s4
-; CHECK-NEXT: s_add_u32 s4, s4, 8
-; CHECK-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: ds_read_b128 v[10:13], v9
+; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4
+; CHECK-NEXT: s_add_u32 s4, s4, 16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v[12:13], v[10:11], off
+; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB7_2
; CHECK-NEXT: .LBB7_3: ; %Flow9
@@ -1076,7 +1076,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB7_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -1327,11 +1327,11 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-LABEL: memmove_p3_p0:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6]
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
@@ -1361,16 +1361,16 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flat_load_dwordx2 v[13:14], v[9:10]
-; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10]
+; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
-; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 8
+; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v4, v[13:14]
-; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT: ds_write_b128 v4, v[13:16]
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB10_5
; CHECK-NEXT: .LBB10_6: ; %Flow34
@@ -1378,7 +1378,7 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB10_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
@@ -1437,23 +1437,23 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB10_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -8
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
-; CHECK-NEXT: v_add3_u32 v0, v3, v0, -8
+; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -8
+; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
-; CHECK-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4]
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v0, v[3:4]
-; CHECK-NEXT: v_add_nc_u32_e32 v0, -8, v0
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_cbranch_execnz .LBB10_15
; CHECK-NEXT: .LBB10_16: ; %Flow36
@@ -1470,9 +1470,9 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-LABEL: memmove_p3_p1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1485,14 +1485,14 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
-; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v9, v[10:11]
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: ds_write_b128 v9, v[10:13]
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB11_2
; CHECK-NEXT: .LBB11_3: ; %Flow9
@@ -1503,7 +1503,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB11_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
@@ -1538,8 +1538,8 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v5, 0
-; CHECK-NEXT: v_and_b32_e32 v4, 7, v2
-; CHECK-NEXT: v_and_b32_e32 v6, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v4, 15, v2
+; CHECK-NEXT: v_and_b32_e32 v6, -16, v2
; CHECK-NEXT: v_mov_b32_e32 v7, v3
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5]
@@ -1563,15 +1563,15 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[9:10], v3
-; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -8
+; CHECK-NEXT: ds_read_b128 v[9:12], v3
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT: v_add_nc_u32_e32 v3, 8, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v8, v[9:10]
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: ds_write_b128 v8, v[9:12]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB12_5
; CHECK-NEXT: .LBB12_6: ; %Flow41
@@ -1579,7 +1579,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB12_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v2
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -1630,24 +1630,24 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB12_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v5, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v2
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v5
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4
; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5
; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4
; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[5:6], v4
-; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 8
+; CHECK-NEXT: ds_read_b128 v[5:8], v4
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v2, v[5:6]
-; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
+; CHECK-NEXT: ds_write_b128 v2, v[5:8]
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB12_15
; CHECK-NEXT: .LBB12_16: ; %Flow43
@@ -1664,9 +1664,9 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-LABEL: memmove_p3_p4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1679,14 +1679,14 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
-; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v9, v[10:11]
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: ds_write_b128 v9, v[10:13]
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB13_2
; CHECK-NEXT: .LBB13_3: ; %Flow9
@@ -1697,7 +1697,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB13_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
@@ -1735,27 +1735,30 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB14_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v7, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v10, v7, s[0:3], 0 offen offset:4
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: buffer_load_dword v11, v7, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v12, v7, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7
+; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v8, v[9:10]
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: ds_write_b128 v8, v[9:12]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB14_2
; CHECK-NEXT: .LBB14_3: ; %Flow14
@@ -1766,7 +1769,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB14_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -2021,25 +2024,28 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB17_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v7, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[9:10], v7
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: ds_read_b128 v[9:12], v7
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7
+; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB17_2
@@ -2051,7 +2057,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB17_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index af7f92798a93..a6db7d331cef 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -828,81 +828,30 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 {
; EG-LABEL: s_test_imin_sle_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @28, KC0[], KC1[]
-; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 9, @30, KC0[], KC1[]
-; EG-NEXT: TEX 1 @16
-; EG-NEXT: ALU 10, @40, KC0[], KC1[]
-; EG-NEXT: TEX 1 @20
-; EG-NEXT: ALU 10, @51, KC0[], KC1[]
-; EG-NEXT: TEX 1 @24
-; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 9, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3
-; EG-NEXT: Fetch clause starting at 16:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3
-; EG-NEXT: Fetch clause starting at 20:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3
-; EG-NEXT: ALU clause starting at 28:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T5.X, 0.0,
-; EG-NEXT: ALU clause starting at 30:
-; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 40:
-; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 51:
-; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 46, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 52, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 44, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 54, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 62:
-; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MIN_INT T0.Y, PV.Z, PV.W,
+; EG-NEXT: BFE_INT T0.Z, T3.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
-; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT T1.W, T0.Y, literal.y,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T6.X, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.X,
-; EG-NEXT: MOV * T6.Y, T3.X,
+; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v4i16:
; CI: ; %bb.0:
@@ -1848,49 +1797,40 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_umin_ule_v3i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @12
-; EG-NEXT: ALU 8, @36, KC0[], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0
-; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X
+; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 17, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
+; EG-NEXT: MEM_RAT MSKOR T4.XW, T0.X
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1
-; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1
-; EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1
-; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1
-; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
-; EG-NEXT: ALU clause starting at 20:
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T2.X, T1.X, 0, #1
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_16 T1.X, T1.X, 4, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W,
-; EG-NEXT: ALU clause starting at 24:
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
+; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T2.W, PV.W, literal.x,
-; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X,
+; EG-NEXT: MIN_UINT * T3.W, T0.X, T1.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T7.X, PS, PV.W,
-; EG-NEXT: LSHL * T7.W, literal.x, PV.W,
+; EG-NEXT: LSHL T4.X, PS, PV.W,
+; EG-NEXT: LSHL * T4.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV * T7.Y, 0.0,
-; EG-NEXT: ALU clause starting at 36:
-; EG-NEXT: MOV T7.Z, 0.0,
-; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X,
+; EG-NEXT: MOV T4.Y, 0.0,
+; EG-NEXT: MOV * T4.Z, 0.0,
; EG-NEXT: LSHR T0.X, T1.W, literal.x,
-; EG-NEXT: LSHL T1.W, PV.W, literal.y,
-; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
+; EG-NEXT: MIN_UINT * T1.X, T3.X, T2.X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR * T2.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ule_v3i16:
@@ -2936,142 +2876,46 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 {
; EG-LABEL: s_test_umin_ult_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @52, KC0[], KC1[]
-; EG-NEXT: TEX 1 @20
-; EG-NEXT: ALU 9, @54, KC0[], KC1[]
-; EG-NEXT: TEX 1 @24
-; EG-NEXT: ALU 8, @64, KC0[], KC1[]
-; EG-NEXT: TEX 1 @28
-; EG-NEXT: ALU 10, @73, KC0[], KC1[]
-; EG-NEXT: TEX 1 @32
-; EG-NEXT: ALU 8, @84, KC0[], KC1[]
-; EG-NEXT: TEX 1 @36
-; EG-NEXT: ALU 10, @93, KC0[], KC1[]
-; EG-NEXT: TEX 1 @40
-; EG-NEXT: ALU 8, @104, KC0[], KC1[]
-; EG-NEXT: TEX 1 @44
-; EG-NEXT: ALU 10, @113, KC0[], KC1[]
-; EG-NEXT: TEX 1 @48
-; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
+; EG-NEXT: ALU 0, @24, KC0[], KC1[]
+; EG-NEXT: TEX 2 @8
+; EG-NEXT: ALU 2, @25, KC0[], KC1[]
+; EG-NEXT: TEX 4 @14
+; EG-NEXT: ALU 14, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 20:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3
-; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3
-; EG-NEXT: Fetch clause starting at 32:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3
-; EG-NEXT: Fetch clause starting at 36:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3
-; EG-NEXT: Fetch clause starting at 40:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3
-; EG-NEXT: Fetch clause starting at 44:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3
-; EG-NEXT: Fetch clause starting at 48:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3
-; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3
-; EG-NEXT: ALU clause starting at 52:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T7.X, 0.0,
-; EG-NEXT: ALU clause starting at 54:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 64:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 73:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 84:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 93:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 104:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 62, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 60, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 78, #3
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 68, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 52, #3
+; EG-NEXT: VTX_READ_16 T4.X, T0.X, 70, #3
+; EG-NEXT: VTX_READ_16 T5.X, T0.X, 54, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 76, #3
+; EG-NEXT: ALU clause starting at 24:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 25:
+; EG-NEXT: AND_INT T0.W, T1.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T3.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 113:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
+; EG-NEXT: ALU clause starting at 28:
+; EG-NEXT: AND_INT T0.Z, T2.X, literal.x,
+; EG-NEXT: AND_INT T2.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MIN_UINT * T0.W, T0.W, T1.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: MIN_UINT T0.Z, PV.Z, PV.W,
+; EG-NEXT: AND_INT T1.W, T5.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, T4.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 124:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T7.X, literal.x,
+; EG-NEXT: MIN_UINT T0.Y, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, T3.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT T2.W, T0.Y, literal.y,
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: OR_INT * T7.X, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.X,
-; EG-NEXT: MOV * T7.W, T3.X,
-; EG-NEXT: MOV * T7.Y, T5.X,
+; EG-NEXT: MIN_UINT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_umin_ult_v8i16:
; CI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
index c375b16ee380..7e867a537298 100644
--- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-FAKE16 %s
define amdgpu_ps float @test_minmax_f32(float %a, float %b, float %c) {
; GFX12-LABEL: test_minmax_f32:
@@ -72,30 +74,84 @@ define amdgpu_ps float @test_maxmin_commuted_f32(float %a, float %b, float %c) {
}
define amdgpu_ps half @test_minmax_f16(half %a, half %b, half %c) {
-; GFX12-LABEL: test_minmax_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-TRUE16-LABEL: test_minmax_f16:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-FAKE16-LABEL: test_minmax_f16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
+; SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-TRUE16-LABEL: test_minmax_f16:
+; GISEL-TRUE16: ; %bb.0:
+; GISEL-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-FAKE16-LABEL: test_minmax_f16:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
+; GISEL-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maximum.f16(half %a, half %b)
%minmax = call half @llvm.minimum.f16(half %max, half %c)
ret half %minmax
}
define amdgpu_ps half @test_minmax_commuted_f16(half %a, half %b, half %c) {
-; GFX12-LABEL: test_minmax_commuted_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-TRUE16-LABEL: test_minmax_commuted_f16:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-FAKE16-LABEL: test_minmax_commuted_f16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
+; SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-TRUE16-LABEL: test_minmax_commuted_f16:
+; GISEL-TRUE16: ; %bb.0:
+; GISEL-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-FAKE16-LABEL: test_minmax_commuted_f16:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
+; GISEL-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maximum.f16(half %a, half %b)
%minmax = call half @llvm.minimum.f16(half %c, half %max)
ret half %minmax
}
define amdgpu_ps half @test_maxmin_commuted_f16(half %a, half %b, half %c) {
-; GFX12-LABEL: test_maxmin_commuted_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_minimummaximum_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-TRUE16-LABEL: test_maxmin_commuted_f16:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-FAKE16-LABEL: test_maxmin_commuted_f16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: v_minimummaximum_f16 v0, v0, v1, v2
+; SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-TRUE16-LABEL: test_maxmin_commuted_f16:
+; GISEL-TRUE16: ; %bb.0:
+; GISEL-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-FAKE16-LABEL: test_maxmin_commuted_f16:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: v_minimummaximum_f16 v0, v0, v1, v2
+; GISEL-FAKE16-NEXT: ; return to shader part epilog
%min = call half @llvm.minimum.f16(half %a, half %b)
%maxmin = call half @llvm.maximum.f16(half %c, half %min)
ret half %maxmin
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 774a22fb907d..954dab3d0fc6 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-FAKE16 %s
define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_minmax_i32:
@@ -467,47 +471,111 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
}
define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
-; GFX11-LABEL: test_minmax_f16_ieee_false:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; GFX11-NEXT: ; return to shader part epilog
-;
-; GFX12-LABEL: test_minmax_f16_ieee_false:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-GFX11-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX11-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; SDAG-GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX11-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX11-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; GISEL-GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX12-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX12-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX12-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX12-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %max, half %c)
ret half %minmax
}
define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-GFX11-NEXT: s_mov_b32 s5, s4
-; SDAG-GFX11-NEXT: s_mov_b32 s4, s3
-; SDAG-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0
-; SDAG-GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
-; SDAG-GFX11-NEXT: s_endpgm
-;
-; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-GFX11-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX11-NEXT: s_mov_b32 s7, s4
-; GISEL-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0
-; GISEL-GFX11-NEXT: global_store_b16 v1, v0, s[6:7]
-; GISEL-GFX11-NEXT: s_endpgm
-;
-; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG-GFX12: ; %bb.0:
-; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-GFX12-NEXT: s_mov_b32 s5, s4
-; SDAG-GFX12-NEXT: s_mov_b32 s4, s3
-; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, s0, s1, v0
-; SDAG-GFX12-NEXT: global_store_b16 v1, v0, s[4:5]
-; SDAG-GFX12-NEXT: s_endpgm
+; SDAG-GFX11-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-GFX11-TRUE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX11-TRUE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, s0, s1, v0.l
+; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX11-TRUE16-NEXT: s_endpgm
+;
+; SDAG-GFX11-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-FAKE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX11-FAKE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, s0, s1, v0
+; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX11-FAKE16-NEXT: s_endpgm
+;
+; GISEL-GFX11-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-GFX11-TRUE16-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX11-TRUE16-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, s0, s1, v0.l
+; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX11-TRUE16-NEXT: s_endpgm
+;
+; GISEL-GFX11-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-FAKE16-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX11-FAKE16-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, s0, s1, v0
+; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX11-FAKE16-NEXT: s_endpgm
+;
+; SDAG-GFX12-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-GFX12-TRUE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX12-TRUE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, s0, s1, v0.l
+; SDAG-GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX12-TRUE16-NEXT: s_endpgm
+;
+; SDAG-GFX12-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-FAKE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX12-FAKE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, s0, s1, v0
+; SDAG-GFX12-FAKE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX12-FAKE16-NEXT: s_endpgm
;
; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false:
; GISEL-GFX12: ; %bb.0:
@@ -526,136 +594,320 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
}
define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; SDAG-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GISEL-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG-GFX12: ; %bb.0:
-; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
-; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
-; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
-; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
-; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL-GFX12: ; %bb.0:
-; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
-; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
-; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
-; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GISEL-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
-; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h
+; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %c, half %max)
ret half %minmax
}
define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
-; GFX11-LABEL: test_maxmin_f16_ieee_false:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
-; GFX11-NEXT: ; return to shader part epilog
-;
-; GFX12-LABEL: test_maxmin_f16_ieee_false:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-GFX11-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX11-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2
+; SDAG-GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX11-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX11-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2
+; GISEL-GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX12-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX12-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX12-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX12-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-FAKE16-NEXT: ; return to shader part epilog
%min = call half @llvm.minnum.f16(half %a, half %b)
%maxmin = call half @llvm.maxnum.f16(half %min, half %c)
ret half %maxmin
}
define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; SDAG-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GISEL-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
-; SDAG-GFX12: ; %bb.0:
-; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
-; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
-; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
-; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; SDAG-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
-; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
-; GISEL-GFX12: ; %bb.0:
-; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
-; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
-; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
-; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GISEL-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
-; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h
+; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %b)
%maxmin = call half @llvm.maxnum.f16(half %c, half %min)
ret half %maxmin
}
define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 {
-; GFX11-LABEL: test_med3_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_med3_f16 v2, v2, v3, v4
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_med3_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_med3_num_f16 v2, v2, v3, v4
-; GFX12-NEXT: global_store_b16 v[0:1], v2, off
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: test_med3_f16:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; SDAG-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v2.h, v3.l
+; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: test_med3_f16:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_med3_f16 v2, v2, v3, v4
+; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-TRUE16-LABEL: test_med3_f16:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v3.l, v4.l
+; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: test_med3_f16:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_med3_f16 v2, v2, v3, v4
+; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-TRUE16-LABEL: test_med3_f16:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; SDAG-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v2.h, v3.l
+; SDAG-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-FAKE16-LABEL: test_med3_f16:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
+; SDAG-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-TRUE16-LABEL: test_med3_f16:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l
+; GISEL-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-FAKE16-LABEL: test_med3_f16:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
+; GISEL-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call half @llvm.minnum.f16(half %x, half %y)
%tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
%tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index f89341d539a0..7536e83a9da6 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -53,6 +53,7 @@
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_cs
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
@@ -109,6 +110,7 @@
; CHECK-NEXT: .wgp_mode: false
; CHECK-NEXT: .gs:
; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x200
@@ -120,6 +122,7 @@
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .hs:
; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_hs
; CHECK-NEXT: .entry_point_symbol: hs_shader
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x1000
@@ -131,6 +134,7 @@
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .ps:
; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_ps
; CHECK-NEXT: .entry_point_symbol: ps_shader
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
index 676ba1480e6d..efb8d836c7b3 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
@@ -7,7 +7,6 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=COMPATIBLE,REALTIME,MEMTIME %s
; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t
-; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s
@@ -17,7 +16,6 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s
; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t
-; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s
; Note: This test checks the IR, but also has a run line to codegen the file just to check we
; do not crash when trying to select those functions.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
index 75a388eb1229..038f49f30649 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
@@ -14,7 +14,6 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s
@@ -22,7 +21,6 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s
; WARN-GFX906: removing function 'needs_wavefrontsize32': +wavefrontsize32 is not supported on the current target
; WARN-GFX906-NOT: not supported
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 7e7f4f5d1991..c9efeeefdf2d 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -681,63 +681,30 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; EG-LABEL: shl_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
+; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1
+; EG-NEXT: VTX_READ_128 T8.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T6.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, T10.X, PV.W,
-; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T6.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR T1.W, T10.Z, literal.x,
-; EG-NEXT: LSHR * T2.W, T10.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, PS, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR T1.W, T8.Z, literal.x,
+; EG-NEXT: LSHR * T2.W, T8.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV T6.X, PV.W,
-; EG-NEXT: MOV * T0.X, T7.X,
-; EG-NEXT: AND_INT * T1.W, T10.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, T10.Y, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL T0.Y, PS, PV.W,
+; EG-NEXT: AND_INT T1.W, T8.Z, literal.x,
+; EG-NEXT: AND_INT * T2.W, T8.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV * T7.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR T1.W, T10.W, literal.x,
-; EG-NEXT: LSHR * T2.W, T10.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PS, PV.W,
-; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
-; EG-NEXT: LSHL T1.W, PV.W, literal.y,
+; EG-NEXT: LSHL T0.X, PS, PV.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T0.X, PS, literal.x,
-; EG-NEXT: OR_INT * T10.Y, PV.Z, PV.W,
+; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T7.X, PV.Y,
-; EG-NEXT: MOV * T10.X, T6.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index ef1adbb395e7..386a04611396 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -323,67 +323,28 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; EG-LABEL: ashr_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 48, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
+; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XY, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1
+; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T6.X,
-; EG-NEXT: MOV * T9.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT T0.W, T9.X, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: ASHR * T0.W, PV.W, PS,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T6.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T9.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T9.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T6.X, PV.W,
-; EG-NEXT: MOV T0.Y, T7.X,
-; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.W, literal.y,
+; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHR T0.Z, T7.X, literal.x,
+; EG-NEXT: BFE_INT T0.W, T7.X, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, T7.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: ASHR T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T7.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T9.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T9.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: ASHR T7.X, PV.W, PS,
+; EG-NEXT: BFE_INT T0.W, PV.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T7.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T10.Y, T1.W, PV.W,
+; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: ASHR * T7.Y, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T7.X, PV.Y,
-; EG-NEXT: MOV * T10.X, T6.X,
%b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in, i16 1
%a = load <4 x i16>, ptr addrspace(1) %in
%b = load <4 x i16>, ptr addrspace(1) %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll b/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll
new file mode 100644
index 000000000000..1c3091f6b8d3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; No shift here: trunc of the bitcast keeps the low 32 bits (element 0); the checks expect no instruction between the wait and the return.
+define i32 @cast_v4i32_to_i128_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %trunc = trunc i128 %bigint to i32
+ ret i32 %trunc
+}
+
+; lshr by 32 followed by trunc to i32 extracts element 1; the checks expect a single register copy (v1 into v0).
+define i32 @cast_v4i32_to_i128_lshr_32_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 32
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; lshr by 64 followed by trunc to i32 extracts element 2; the checks expect a single register copy (v2 into v0).
+define i32 @cast_v4i32_to_i128_lshr_64_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 64
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; lshr by 96 followed by trunc to i32 extracts element 3; the checks expect a single register copy (v3 into v0).
+define i32 @cast_v4i32_to_i128_lshr_96_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_96_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v3
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 96
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; Shift of 33 is not a multiple of the 32-bit element width, so this is not a simple extract; the checks expect v_alignbit_b32 on v2/v1 with shift 1.
+define i32 @cast_v4i32_to_i128_lshr_33_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_33_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_alignbit_b32 v0, v2, v1, 1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 33
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; Misaligned extract: a shift of 31 makes the result straddle elements 0 and 1; the checks expect v_alignbit_b32 on v1/v0 with shift 31.
+define i32 @cast_v4i32_to_i128_lshr_31_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_31_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_alignbit_b32 v0, v1, v0, 31
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 31
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; Misaligned extract: a shift of 48 is 16-bit but not 32-bit aligned, so the result spans elements 1 and 2; the checks expect a v_perm_b32 on v1/v2.
+define i32 @cast_v4i32_to_i128_lshr_48_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_48_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x1000706
+; CHECK-NEXT: v_perm_b32 v0, v1, v2, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 48
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; lshr by 32 followed by trunc to i64 extracts elements 1 and 2; the checks expect two register copies (v1 into v0, then v2 into v1).
+define i64 @cast_v4i32_to_i128_lshr_32_trunc_i64(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 32
+ %trunc = trunc i128 %srl to i64
+ ret i64 %trunc
+}
+
+; lshr by 64 followed by trunc to i64 extracts elements 2 and 3; the checks expect two register copies (v3 into v1, v2 into v0).
+define i64 @cast_v4i32_to_i128_lshr_64_trunc_i64(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, v3
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 64
+ %trunc = trunc i128 %srl to i64
+ ret i64 %trunc
+}
+
+; FIXME: This case is not handled by the combine because multiple bitcasts
+; appear before a 32-bit build_vector; the checks currently expect a v_perm_b32.
+define i32 @build_vector_i16_to_shift(i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3) {
+; CHECK-LABEL: build_vector_i16_to_shift:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x5040100
+; CHECK-NEXT: v_perm_b32 v0, v3, v2, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %ins.0 = insertelement <4 x i16> poison, i16 %arg0, i32 0
+ %ins.1 = insertelement <4 x i16> %ins.0, i16 %arg1, i32 1
+ %ins.2 = insertelement <4 x i16> %ins.1, i16 %arg2, i32 2
+ %ins.3 = insertelement <4 x i16> %ins.2, i16 %arg3, i32 3
+
+ %cast = bitcast <4 x i16> %ins.3 to i64
+ %srl = lshr i64 %cast, 32
+ %trunc = trunc i64 %srl to i32
+ ret i32 %trunc
+}
diff --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
index e3a6240aac00..fdc1e6abb051 100644
--- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
@@ -12,6 +12,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .cs:
+; GCN-NEXT: .entry_point: _amdgpu_cs
; GCN-NEXT: .entry_point_symbol: _amdgpu_cs_main
; GCN-NEXT: .scratch_memory_size: 0
; SI-NEXT: .sgpr_count: 0x11