VectorCombine: Improve the insert/extract fold in the narrowing case (ref: users/nhaehnle/spr/main/c151bb04)

Keeping the extracted element in a natural position in the narrowed vector has two beneficial effects:

1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which allows the insert/extract fold to trigger.
2. It makes the narrowing shuffles in a chain of extract/insert compatible, which allows foldLengthChangingShuffles to successfully recognize a chain that can be folded.

There are minor X86 test changes that look reasonable to me. The IR change for AVX2 in llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll doesn't change the assembly generated by `llc -mtriple=x86_64-- -mattr=AVX2` at all.

commit-id:c151bb04
author: Nicolai Hähnle <nicolai.haehnle@amd.com> 2025-11-19 18:00:32 -0800
committer: Nicolai Hähnle <nicolai.haehnle@amd.com> 2025-11-21 11:05:26 -0800
commit: e673cdaee95d870dd5e2fa13ab064f6dbd0ba273 (patch)
tree: 8d4ebe0df83984917f49cccef42797dda2a7e1ce /llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
parent: 459939f82086d02c39f5d6eeae141c25f9932d40 (diff)
1 file changed, 2 insertions, 15 deletions
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
index eaab7199a3cf..442a93689a79 100644
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
@@ -91,21 +91,8 @@ entry:
 define <8 x i8> @extract_insert_chain_shortening(<32 x i8> %in) {
 ; OPT-LABEL: define <8 x i8> @extract_insert_chain_shortening(
 ; OPT-SAME: <32 x i8> [[IN:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[I_1:%.*]] = extractelement <32 x i8> [[IN]], i64 17
-; OPT-NEXT:    [[I_2:%.*]] = extractelement <32 x i8> [[IN]], i64 18
-; OPT-NEXT:    [[I_3:%.*]] = extractelement <32 x i8> [[IN]], i64 19
-; OPT-NEXT:    [[I_5:%.*]] = extractelement <32 x i8> [[IN]], i64 21
-; OPT-NEXT:    [[I_6:%.*]] = extractelement <32 x i8> [[IN]], i64 22
-; OPT-NEXT:    [[I_7:%.*]] = extractelement <32 x i8> [[IN]], i64 23
-; OPT-NEXT:    [[O_0:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; OPT-NEXT:    [[O_1:%.*]] = insertelement <8 x i8> [[O_0]], i8 [[I_1]], i32 1
-; OPT-NEXT:    [[O_2:%.*]] = insertelement <8 x i8> [[O_1]], i8 [[I_2]], i32 2
-; OPT-NEXT:    [[O_3:%.*]] = insertelement <8 x i8> [[O_2]], i8 [[I_3]], i32 3
-; OPT-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 20, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; OPT-NEXT:    [[O_4:%.*]] = shufflevector <8 x i8> [[O_3]], <8 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7>
-; OPT-NEXT:    [[O_5:%.*]] = insertelement <8 x i8> [[O_4]], i8 [[I_5]], i32 5
-; OPT-NEXT:    [[O_6:%.*]] = insertelement <8 x i8> [[O_5]], i8 [[I_6]], i32 6
-; OPT-NEXT:    [[O_7:%.*]] = insertelement <8 x i8> [[O_6]], i8 [[I_7]], i32 7
+; OPT-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; OPT-NEXT:    [[O_7:%.*]] = shufflevector <8 x i8> poison, <8 x i8> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; OPT-NEXT:    ret <8 x i8> [[O_7]]
 ;
   %i.0 = extractelement <32 x i8> %in, i64 16
author	Nicolai Hähnle <nicolai.haehnle@amd.com>	2025-11-19 18:00:32 -0800
committer	Nicolai Hähnle <nicolai.haehnle@amd.com>	2025-11-21 11:05:26 -0800
commit	e673cdaee95d870dd5e2fa13ab064f6dbd0ba273 (patch)
tree	8d4ebe0df83984917f49cccef42797dda2a7e1ce /llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
parent	459939f82086d02c39f5d6eeae141c25f9932d40 (diff)