summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/build_vector.gfx11plus.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/build_vector.gfx11plus.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/build_vector.gfx11plus.ll147
1 files changed, 147 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/build_vector.gfx11plus.ll
new file mode 100644
index 000000000000..cdfd71f65ca8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.gfx11plus.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-REAL16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-FAKE16
+
+; Make sure no "vgpr32 = copy vgpr16" is generated
+
+define amdgpu_kernel void @build_vector_bfi (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %out) {
+ ; GFX11-REAL16-LABEL: name: build_vector_bfi
+ ; GFX11-REAL16: bb.0.entry:
+ ; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+ ; GFX11-REAL16-NEXT: {{ $}}
+ ; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset + 16, align 4, addrspace 4)
+ ; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+ ; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+ ; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; GFX11-REAL16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; GFX11-REAL16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
+ ; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
+ ; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+ ; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
+ ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1)
+ ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1)
+ ; GFX11-REAL16-NEXT: [[COPY8:%[0-9]+]]:vgpr_16 = COPY [[GLOBAL_LOAD_DWORD_SADDR]].lo16
+ ; GFX11-REAL16-NEXT: [[COPY9:%[0-9]+]]:vgpr_16 = COPY [[GLOBAL_LOAD_DWORD_SADDR1]].hi16
+ ; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[COPY9]], %subreg.lo16, killed [[V_MOV_B16_t16_e64_]], %subreg.hi16
+ ; GFX11-REAL16-NEXT: [[COPY10:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE3]].lo16
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[COPY8]], %subreg.lo16, killed [[COPY10]], %subreg.hi16
+ ; GFX11-REAL16-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s32) into %ir.3, addrspace 1)
+ ; GFX11-REAL16-NEXT: S_ENDPGM 0
+ ;
+ ; GFX11-FAKE16-LABEL: name: build_vector_bfi
+ ; GFX11-FAKE16: bb.0.entry:
+ ; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset + 16, align 4, addrspace 4)
+ ; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX11-FAKE16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+ ; GFX11-FAKE16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+ ; GFX11-FAKE16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; GFX11-FAKE16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; GFX11-FAKE16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
+ ; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+ ; GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1)
+ ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1)
+ ; GFX11-FAKE16-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORD_SADDR]]
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; GFX11-FAKE16-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 killed [[S_MOV_B32_2]], killed [[COPY8]], killed [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+ ; GFX11-FAKE16-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[V_BFI_B32_e64_]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s32) into %ir.3, addrspace 1)
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep1 = getelementptr i32, ptr addrspace(1) %a, i32 %tid
+ %in.gep2 = getelementptr i32, ptr addrspace(1) %b, i32 %tid
+ %load.i32.a = load i32, ptr addrspace(1) %in.gep1
+ %load.i32.b = load i32, ptr addrspace(1) %in.gep2
+ %load.i16.a = trunc i32 %load.i32.a to i16
+ %load.i32.b.1 = lshr i32 %load.i32.b, 16
+ %load.i16.b = trunc i32 %load.i32.b.1 to i16
+ %ins.0 = insertelement <2 x i16> poison, i16 %load.i16.a, i32 0
+ %ins.1 = insertelement <2 x i16> %ins.0, i16 %load.i16.b, i32 1
+ store <2 x i16> %ins.1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @build_vector_and (ptr addrspace(1) %a, ptr addrspace(1) %out) {
+ ; GFX11-REAL16-LABEL: name: build_vector_and
+ ; GFX11-REAL16: bb.0.entry:
+ ; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+ ; GFX11-REAL16-NEXT: {{ $}}
+ ; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+ ; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
+ ; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
+ ; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
+ ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep1, addrspace 1)
+ ; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 1, 0, implicit $exec
+ ; GFX11-REAL16-NEXT: [[V_ADD_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_ADD_NC_U16_t16_e64 0, killed [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, killed [[V_MOV_B16_t16_e64_]], 0, 0, implicit $exec
+ ; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_ADD_NC_U16_t16_e64_]], %subreg.lo16, killed [[V_MOV_B16_t16_e64_1]], %subreg.hi16
+ ; GFX11-REAL16-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+ ; GFX11-REAL16-NEXT: S_ENDPGM 0
+ ;
+ ; GFX11-FAKE16-LABEL: name: build_vector_and
+ ; GFX11-FAKE16: bb.0.entry:
+ ; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; GFX11-FAKE16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+ ; GFX11-FAKE16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
+ ; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep1, addrspace 1)
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX11-FAKE16-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, killed [[GLOBAL_LOAD_USHORT_SADDR]], 0, killed [[S_MOV_B32_2]], 0, 0, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_MOV_B32_e32_1]], killed [[V_ADD_NC_U16_fake16_e64_]], implicit $exec
+ ; GFX11-FAKE16-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[V_AND_B32_e64_1]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep1 = getelementptr i16, ptr addrspace(1) %a, i32 %tid
+ %load.i16.a = load i16, ptr addrspace(1) %in.gep1
+ %add = add i16 %load.i16.a, 1
+ %ins.0 = insertelement <2 x i16> poison, i16 %add, i32 0
+ %ins.1 = insertelement <2 x i16> %ins.0, i16 0, i32 1
+ store <2 x i16> %ins.1, ptr addrspace(1) %out
+ ret void
+}
+