summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/mul_int24.ll
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2025-01-13 11:22:55 +0700
committerMatt Arsenault <arsenm2@gmail.com>2025-11-11 16:47:47 -0800
commitafe4cffcca614618efd300a790c17d322c12f4bc (patch)
tree806463a80fa08eca1b285ef594e2525f937b7da1 /llvm/test/CodeGen/AMDGPU/mul_int24.ll
parent79d9ae7a777a03452991d222642ffdb6687d9210 (diff)
DAG: Handle load in SimplifyDemandedVectorElts (branch: users/arsenm/dag/simplify-demanded-vector-elts-load)
This improves some AMDGPU cases and avoids future regressions. The combiner likes to form shuffles for cases where an extract_vector_elt would do perfectly well, and this recovers some of the regressions from losing load narrowing. AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets have some improvements, but mostly regressions. In particular X86 looks much worse. I'm guessing this is because its shouldReduceLoadWidth is wrong. I mostly just regenerated the checks. I assume some set of them should switch to use volatile loads to defeat the optimization.
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/mul_int24.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul_int24.ll49
1 files changed, 25 insertions, 24 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 10d4eb029ee3..36dabd858c70 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -459,18 +459,18 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
; SI-LABEL: test_smul24_i33:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_load_dword s6, s[4:5], 0xd
+; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s5, s6, 0x180000
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: s_mul_i32 s4, s5, s4
-; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
-; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: s_mul_i32 s5, s4, s5
+; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0
+; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -478,11 +478,12 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
;
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dword s3, s[4:5], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s2, s2, 0x180000
-; VI-NEXT: s_bfe_i32 s3, s4, 0x180000
+; VI-NEXT: s_bfe_i32 s3, s3, 0x180000
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
@@ -569,28 +570,28 @@ entry:
define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
; SI-LABEL: test_smulhi24_i33:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT: s_load_dword s6, s[4:5], 0xd
+; SI-NEXT: s_load_dword s7, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b64 s[6:7], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0
+; VI-NEXT: s_load_dword s6, s[4:5], 0x34
+; VI-NEXT: s_load_dword s7, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm