diff options
Diffstat (limited to 'llvm/test/CodeGen/NVPTX')
185 files changed, 2764 insertions, 1137 deletions
diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll index 601a35288f54..9eb5048e8adf 100644 --- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll +++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX ; RUN: opt -mtriple=nvptx-- < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix IR ; RUN: opt -mtriple=nvptx64-- < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix IR -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4 diff --git a/llvm/test/CodeGen/NVPTX/activemask.ll b/llvm/test/CodeGen/NVPTX/activemask.ll index aa3c5819d7f9..18918c514a4c 100644 --- a/llvm/test/CodeGen/NVPTX/activemask.ll +++ b/llvm/test/CodeGen/NVPTX/activemask.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -O2 -mcpu=sm_52 -mattr=+ptx62 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx62 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx62 | %ptxas-verify %} declare i32 @llvm.nvvm.activemask() diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll index 00b17896d2c9..929196fcb00a 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -check-prefixes=NOPTRCONV ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | FileCheck %s -check-prefixes=PTRCONV -; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | %ptxas-verify -arch=sm_90 %} ; ALL-LABEL: conv_shared_cluster_to_generic define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) { diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll index 86008a1b7005..e7212ce71ca0 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll @@ -1,7 +1,7 @@ ; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32 ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64 ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/alias.ll b/llvm/test/CodeGen/NVPTX/alias.ll index 01761c21ab10..d5d0c76816b9 100644 --- a/llvm/test/CodeGen/NVPTX/alias.ll +++ b/llvm/test/CodeGen/NVPTX/alias.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx64 | %ptxas-verify %} define i32 @a() { ret i32 0 } @b = internal alias i32 (), ptr @a diff --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll index 5360e8988777..8972953e9145 100644 --- a/llvm/test/CodeGen/NVPTX/annotations.ll +++ b/llvm/test/CodeGen/NVPTX/annotations.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @texture = internal addrspace(1) global i64 0, align 8 diff --git a/llvm/test/CodeGen/NVPTX/applypriority.ll b/llvm/test/CodeGen/NVPTX/applypriority.ll index 23b1bda9a32b..92092a704933 100644 --- a/llvm/test/CodeGen/NVPTX/applypriority.ll +++ b/llvm/test/CodeGen/NVPTX/applypriority.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s
-; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-sm_80 && ptxas-isa-7.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll index ce71d3a78c0d..500ff4f541b2 100644 --- a/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll index 1fbfd0a987d7..5e02a7d74aa3 100644 --- a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll +++ b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll index cefb8ede9fa5..0d8e23047af0 100644 --- a/llvm/test/CodeGen/NVPTX/async-copy.ll +++ b/llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s -; RUN: %if ptxas-11.0 && ! ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll index b19f6d56b9a9..392cd8b26d27 100644 --- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll +++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.err.ll @@ -4,12 +4,12 @@ ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg -define void @bitwise_i128(ptr %0, i128 %1) { +define void @bitwise_i256(ptr %0, i256 %1) { entry: - %2 = atomicrmw and ptr %0, i128 %1 monotonic, align 16 - %3 = atomicrmw or ptr %0, i128 %1 monotonic, align 16 - %4 = atomicrmw xor ptr %0, i128 %1 monotonic, align 16 - %5 = atomicrmw xchg ptr %0, i128 %1 monotonic, align 16 + %2 = atomicrmw and ptr %0, i256 %1 monotonic, align 16 + %3 = atomicrmw or ptr %0, i256 %1 monotonic, align 16 + %4 = atomicrmw xor ptr %0, i256 %1 monotonic, align 16 + %5 = atomicrmw xchg ptr %0, i256 %1 monotonic, align 16 ret void } @@ -17,11 +17,11 @@ entry: ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg ; CHECK: error: unsupported cmpxchg -define void @minmax_i128(ptr %0, i128 %1) { +define void @minmax_i256(ptr %0, i256 %1) { entry: - %2 = atomicrmw min ptr %0, i128 %1 monotonic, align 16 - %3 = atomicrmw max ptr %0, i128 %1 monotonic, align 16 - %4 = atomicrmw umin ptr %0, i128 %1 monotonic, align 16 - %5 = atomicrmw umax ptr %0, i128 %1 monotonic, align 16 + %2 = atomicrmw min ptr %0, i256 %1 monotonic, align 16 + %3 = atomicrmw max ptr %0, i256 %1 monotonic, align 16 + %4 = atomicrmw umin ptr %0, i256 %1 monotonic, align 16 + %5 = atomicrmw umax ptr %0, i256 %1 monotonic, align 16 ret void } diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll index 94b3f0a2e1c3..88fae7a3f78a 100644 --- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=ALL,SM30 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=ALL,SM60 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} ; CHECK-LABEL: fadd_double define void @fadd_double(ptr %0, double %1) { diff --git a/llvm/test/CodeGen/NVPTX/atomics-b128.ll b/llvm/test/CodeGen/NVPTX/atomics-b128.ll new file mode 100644 index 000000000000..b2a3f94d11a1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/atomics-b128.ll @@ -0,0 +1,1033 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: not llc < %s -mcpu=sm_90 -mattr=+ptx82 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: not llc < %s -mcpu=sm_80 -mattr=+ptx84 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx84 | FileCheck %s --check-prefix=CHECK +; RUN: %if ptxas-sm_90 && ptxas-isa-8.4 %{ llc < %s -mcpu=sm_90 -mattr=+ptx84 | %ptxas-verify -arch=sm_90 %} + +;; TODO: Update cmpxchg.py so that it can automatically generate the IR for +;; these test cases. + +target triple = "nvptx64-nvidia-cuda" + +;; Check that the first couple of error messages are correct. +; ERROR: error: unsupported cmpxchg +; ERROR: error: unsupported cmpxchg + +define i128 @test_xchg_generic(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_generic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_generic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_generic_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_global(ptr addrspace(1) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_global( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_global_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_global_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.global.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(1) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_shared(ptr addrspace(3) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_shared( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_shared_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_shared_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.shared.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(3) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_shared_cluster(ptr addrspace(7) %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_shared_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_shared_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_shared_cluster_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.shared::cluster.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr addrspace(7) %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_block(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_block( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_block_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_block_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.cta.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("block") release + ret i128 %old +} + +define i128 @test_xchg_cluster(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_cluster_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.cluster.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("cluster") release + ret i128 %old +} + +define i128 @test_xchg_gpu(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_gpu( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_gpu_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_gpu_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.gpu.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt syncscope("device") release + ret i128 %old +} + +define i128 @test_xchg_sys(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_sys( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_sys_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_sys_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_relaxed(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_relaxed( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_relaxed_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_relaxed_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.relaxed.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt monotonic + ret i128 %old +} + +define i128 @test_xchg_acquire(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_acquire_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.acquire.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt acquire + ret i128 %old +} + +define i128 @test_xchg_release(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_release( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_release_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_release_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.release.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt release + ret i128 %old +} + +define i128 @test_xchg_acq_rel(ptr %addr, i128 %amt) { +; CHECK-LABEL: test_xchg_acq_rel( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xchg_acq_rel_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_xchg_acq_rel_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd3}; +; CHECK-NEXT: atom.acq_rel.sys.exch.b128 dst, [%rd1], amt; +; CHECK-NEXT: mov.b128 {%rd4, %rd5}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %old = atomicrmw xchg ptr %addr, i128 %amt acq_rel + ret i128 %old +} + +define i128 @test_cmpxchg_generic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_generic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_generic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_generic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_generic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_global(ptr addrspace(1) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_global( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_global_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_global_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_global_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.global.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_shared(ptr addrspace(3) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_shared( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_shared_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_shared_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_shared_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.shared.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_block(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_block( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_block_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_block_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_block_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.cta.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("block") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_cluster(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_cluster_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_cluster_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.cluster.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("cluster") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_gpu(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_gpu( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_gpu_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_gpu_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_gpu_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.gpu.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new syncscope("device") monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_shared_cluster(ptr addrspace(7) %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_shared_cluster( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_shared_cluster_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_shared_cluster_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_shared_cluster_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr addrspace(7) %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_monotonic_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_monotonic_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_monotonic_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_monotonic_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic acquire + ret i128 %new +} + +define i128 @test_cmpxchg_monotonic_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_monotonic_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_monotonic_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_monotonic_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_monotonic_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new monotonic seq_cst + ret i128 %new +} + +define i128 @test_cmpxchg_acquire_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acquire_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_acquire_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acquire_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire acquire + ret i128 %new +} + +define i128 @test_cmpxchg_acquire_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acquire_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acquire_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acquire_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acquire_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acquire seq_cst + ret i128 %new +} + +define i128 @test_cmpxchg_release_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_release_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.release.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_release_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_release_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release acquire + ret i128 %new +} + +define i128 @test_cmpxchg_release_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_release_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_release_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_release_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_release_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new release seq_cst + ret i128 %new +} + +define i128 @test_cmpxchg_acq_rel_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acq_rel_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_monotonic_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_acq_rel_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acq_rel_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_acquire_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acq_rel.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel acquire + ret i128 %new +} + +define i128 @test_cmpxchg_acq_rel_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_acq_rel_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_acq_rel_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_acq_rel_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_acq_rel_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new acq_rel seq_cst + ret i128 %new +} + +define i128 @test_cmpxchg_seq_cst_monotonic(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_seq_cst_monotonic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_monotonic_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_monotonic_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_monotonic_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst monotonic + ret i128 %new +} + +define i128 @test_cmpxchg_seq_cst_acquire(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_seq_cst_acquire( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_acquire_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_acquire_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_acquire_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst acquire + ret i128 %new +} + +define i128 @test_cmpxchg_seq_cst_seq_cst(ptr %addr, i128 %cmp, i128 %new) { +; CHECK-LABEL: test_cmpxchg_seq_cst_seq_cst( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_cmpxchg_seq_cst_seq_cst_param_0]; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_cmpxchg_seq_cst_seq_cst_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_cmpxchg_seq_cst_seq_cst_param_2]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd2, %rd3}; +; CHECK-NEXT: mov.b128 swap, {%rd4, %rd5}; +; CHECK-NEXT: atom.acquire.sys.cas.b128 dst, [%rd1], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd6, %rd7}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: ret; + %pairold = cmpxchg ptr %addr, i128 %cmp, i128 %new seq_cst seq_cst + ret i128 %new +} + +define i128 @test_atomicrmw_and(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_and( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_and_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_and_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB34_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and.b64 %rd6, %rd11, %rd4; +; CHECK-NEXT: and.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p1 bra $L__BB34_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw and ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_or(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_or( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_or_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_or_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB35_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b64 %rd6, %rd11, %rd4; +; CHECK-NEXT: or.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p1 bra $L__BB35_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw or ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_xor(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_xor( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_xor_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_xor_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB36_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xor.b64 %rd6, %rd11, %rd4; +; CHECK-NEXT: xor.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p1 bra $L__BB36_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw xor ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_min(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_min( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_min_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_min_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB37_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: and.pred %p3, %p2, %p1; +; CHECK-NEXT: setp.lt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: or.pred %p5, %p3, %p4; +; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p6 bra $L__BB37_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw min ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_max(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_max( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_max_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_max_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB38_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: and.pred %p3, %p2, %p1; +; CHECK-NEXT: setp.gt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: or.pred %p5, %p3, %p4; +; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p6 bra $L__BB38_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw max ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_umin(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_umin( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_umin_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_umin_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB39_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: and.pred %p3, %p2, %p1; +; CHECK-NEXT: setp.lt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: or.pred %p5, %p3, %p4; +; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p6 bra $L__BB39_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw umin ptr %ptr, i128 %val monotonic + ret i128 %ret +} + +define i128 @test_atomicrmw_umax(ptr %ptr, i128 %val) { +; CHECK-LABEL: test_atomicrmw_umax( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<7>; +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [test_atomicrmw_umax_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_atomicrmw_umax_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; +; CHECK-NEXT: $L__BB40_1: // %atomicrmw.start +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: and.pred %p3, %p2, %p1; +; CHECK-NEXT: setp.gt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: or.pred %p5, %p3, %p4; +; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 cmp, swap, dst; +; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; +; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; +; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; +; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; +; CHECK-NEXT: mov.b64 %rd11, %rd1; +; CHECK-NEXT: mov.b64 %rd12, %rd2; +; CHECK-NEXT: @%p6 bra $L__BB40_1; +; CHECK-NEXT: // %bb.2: // %atomicrmw.end +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; + %ret = atomicrmw umax ptr %ptr, i128 %val monotonic + ret i128 %ret +} + + +@si128 = internal addrspace(3) global i128 0, align 16 + +define void @test_atomicrmw_xchg_const() { +; CHECK-LABEL: test_atomicrmw_xchg_const( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: // demoted variable +; CHECK-NEXT: .shared .align 16 .b8 si128[16]; +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b64 %rd1, 0; +; CHECK-NEXT: mov.b64 %rd2, 23; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b128 amt, dst; +; CHECK-NEXT: mov.b128 amt, {%rd2, %rd1}; +; CHECK-NEXT: atom.relaxed.sys.shared.exch.b128 dst, [si128], amt; +; CHECK-NEXT: mov.b128 {%rd3, %rd4}, dst; +; CHECK-NEXT: } +; CHECK-NEXT: ret; + %res = atomicrmw xchg ptr addrspace(3) @si128, i128 23 monotonic + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll index 2e11323d1b3e..ae10526ec836 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} ; CHECK-LABEL: .func test( define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, double %d) { diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index 5f4856acb317..e2762bac45a3 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.2 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index e560d4386c20..e6c6a73eef14 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_86 && ptxas-isa-7.1 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll index e6636d706b49..d406f9c1e33f 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-with-scope.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s -check-prefixes=CHECK,CHECK32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} ; CHECK-LABEL: .func test_atomics_scope( define void @test_atomics_scope(ptr %fp, float %f, diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll index b6317dfb2859..268a8972ebd2 100644 --- a/llvm/test/CodeGen/NVPTX/b52037.ll +++ b/llvm/test/CodeGen/NVPTX/b52037.ll @@ -4,7 +4,7 @@ ; https://bugs.llvm.org/show_bug.cgi?id=52037 for the gory details. ; ; RUN: llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | FileCheck %s -; RUN: %if ptxas %{ llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 %{ llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -O3 -o - %s | %ptxas-verify -arch=sm_70 %} ; CHECK-LABEL: .visible .entry barney( ; CHECK-NOT: .local{{.*}}__local_depot diff --git a/llvm/test/CodeGen/NVPTX/barrier.ll b/llvm/test/CodeGen/NVPTX/barrier.ll index a3b0d21f098f..f2d6f2354038 100644 --- a/llvm/test/CodeGen/NVPTX/barrier.ll +++ b/llvm/test/CodeGen/NVPTX/barrier.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare void @llvm.nvvm.bar.warp.sync(i32) declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index a386e4292777..4d930cd9e57c 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -3,9 +3,9 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll index e1d4ef1073a7..2c4aa6b3f8f3 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index bd4c7775354a..3c6fb4b7517b 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -711,11 +711,11 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; -; CHECK-NEXT: and.b32 %r3, %r2, -2147450880; -; CHECK-NEXT: and.b32 %r4, %r1, 2147450879; -; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_1]; +; CHECK-NEXT: and.b32 %r2, %r1, -2147450880; +; CHECK-NEXT: ld.param.b32 %r3, [test_copysign_param_0]; +; CHECK-NEXT: and.b32 %r4, %r3, 2147450879; +; CHECK-NEXT: or.b32 %r5, %r4, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll index d5b278657bd5..dee5a76f4c9d 100644 --- a/llvm/test/CodeGen/NVPTX/bmsk.ll +++ b/llvm/test/CodeGen/NVPTX/bmsk.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-7.6 %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index 0d1d6da4ba2b..e3d1c8092260 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll index 46172b1af123..4e11f58f85ee 100644 --- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll @@ -16,7 +16,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p ; CHECK: .maxntid 1, 1, 1 ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-NEXT: .reg .b64 %rd<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %bb ; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0]; @@ -25,9 +25,10 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p ; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3; ; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1]; ; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16]; -; CHECK-NEXT: ld.global.b64 %rd6, [%rd5]; -; CHECK-NEXT: mad.wide.s32 %rd7, %r1, %r1, %rd6; -; CHECK-NEXT: st.global.b64 [%rd5], %rd7; +; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1; +; CHECK-NEXT: ld.global.b64 %rd7, [%rd5]; +; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7; +; CHECK-NEXT: st.global.b64 [%rd5], %rd8; ; CHECK-NEXT: ret; bb: %tmp5 = add nsw i64 %arg3, 8 diff --git a/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll b/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll index 579f02a9539c..ed43b425b12a 100644 --- a/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll +++ b/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_70 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 %{ llc < %s -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll index b4934e1a94d1..81e7edfd8602 100644 --- a/llvm/test/CodeGen/NVPTX/byval-const-global.ll +++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_70 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 %{ llc < %s -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/calling-conv.ll b/llvm/test/CodeGen/NVPTX/calling-conv.ll index 74b99efcdadf..0bec7e6791f1 100644 --- a/llvm/test/CodeGen/NVPTX/calling-conv.ll +++ b/llvm/test/CodeGen/NVPTX/calling-conv.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/cluster-dim.ll b/llvm/test/CodeGen/NVPTX/cluster-dim.ll index 196b967ce868..a8101f6bc6bd 100644 --- a/llvm/test/CodeGen/NVPTX/cluster-dim.ll +++ b/llvm/test/CodeGen/NVPTX/cluster-dim.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck -check-prefixes=CHECK80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} define ptx_kernel void @kernel_func_clusterxyz() "nvvm.cluster_dim"="3,5,7" { ; CHECK80-LABEL: kernel_func_clusterxyz( diff --git a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll index c8b79dfae760..d930d1842a1d 100644 --- a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll +++ b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol-multicast.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} ; RUN: llc -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_101a %} ; RUN: llc -o - -mcpu=sm_120a -march=nvptx64 -mattr=+ptx86 %s | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_120a %} define void @nvvm_clusterlaunchcontrol_try_cancel_multicast( ; CHECK-PTX-SHARED64-LABEL: nvvm_clusterlaunchcontrol_try_cancel_multicast( diff --git a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll index a8ccfc50fbe7..234fb667e748 100644 --- a/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll +++ b/llvm/test/CodeGen/NVPTX/clusterlaunchcontrol.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck %s --check-prefixes=CHECK,CHECK-PTX-SHARED64 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} define void @nvvm_clusterlaunchcontrol_try_cancel( ; CHECK-PTX-SHARED64-LABEL: nvvm_clusterlaunchcontrol_try_cancel( diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 6e480996e7e6..d895c715ab3c 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 && ptxas-isa-5.0 %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -22,23 +22,22 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB0_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB0_1; ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r12; @@ -52,7 +51,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -67,23 +66,22 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -98,7 +96,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -114,23 +112,22 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -145,7 +142,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -160,23 +157,22 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -191,7 +187,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -206,23 +202,22 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB4_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -237,7 +232,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -253,23 +248,22 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB5_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -284,7 +278,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -300,23 +294,22 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB6_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r12; @@ -330,7 +323,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -346,23 +339,22 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB7_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -377,7 +369,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -393,23 +385,22 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB8_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -424,7 +415,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -440,23 +431,22 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -471,7 +461,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -487,23 +477,22 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB10_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -518,7 +507,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -534,23 +523,22 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB11_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -565,7 +553,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -581,23 +569,22 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB12_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -612,7 +599,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -628,23 +615,22 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB13_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -659,7 +645,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -675,23 +661,22 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB14_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -1899,7 +1884,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -1915,23 +1900,22 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB60_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -1997,7 +1981,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -2013,23 +1997,22 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB64_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -2044,7 +2027,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -2060,23 +2043,22 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.shared.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB65_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 065b89c7ebf7..76220ee3a399 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -22,23 +22,22 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r12; @@ -52,7 +51,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -67,23 +66,22 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -98,7 +96,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -114,23 +112,22 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -145,7 +142,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -160,23 +157,22 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -191,7 +187,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -206,23 +202,22 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -237,7 +232,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -253,23 +248,22 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -284,7 +278,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -300,23 +294,22 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r12; @@ -330,7 +323,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -346,23 +339,22 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -377,7 +369,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -393,23 +385,22 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -424,7 +415,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -440,23 +431,22 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -471,7 +461,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -487,23 +477,22 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB10_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -518,7 +507,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -534,23 +523,22 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB11_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -565,7 +553,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -581,23 +569,22 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB12_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -612,7 +599,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -628,23 +615,22 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB13_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -659,7 +645,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -675,23 +661,22 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB14_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -1899,7 +1884,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -1915,23 +1900,22 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB60_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -1997,7 +1981,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -2013,23 +1997,22 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB64_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -2044,7 +2027,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -2060,23 +2043,22 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.shared.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB65_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index e4433570bdd7..4cdedb2065e2 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -22,23 +22,22 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB0_1; ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r12; @@ -52,7 +51,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -67,23 +66,22 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -98,7 +96,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -114,23 +112,22 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -145,7 +142,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -160,23 +157,22 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -191,7 +187,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -206,23 +202,22 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -237,7 +232,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -253,23 +248,22 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB5_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -284,7 +278,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -300,23 +294,22 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r12; @@ -330,7 +323,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -346,23 +339,22 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -377,7 +369,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -393,23 +385,22 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -424,7 +415,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -440,23 +431,22 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -471,7 +461,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -487,23 +477,22 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB10_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -518,7 +507,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -534,23 +523,22 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB11_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -565,7 +553,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -581,23 +569,22 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB12_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -612,7 +599,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -628,23 +615,22 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB13_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -659,7 +645,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -675,23 +661,22 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB14_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -1899,7 +1884,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -1915,23 +1900,22 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB60_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -2014,7 +1998,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -2030,23 +2014,22 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB65_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -2061,7 +2044,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -2077,23 +2060,22 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.shared.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB66_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 997df7a8ad8b..ec37025ec4c9 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_32 | FileCheck %s --check-prefixes=SM30,CHECK ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_32 | %ptxas-verify %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=SM70,CHECK -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} ; TODO: these are system scope, but are compiled to gpu scope.. ; TODO: these are seq_cst, but are compiled to relaxed.. @@ -14,7 +14,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -29,23 +29,22 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB0_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB0_1; ; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM30-NEXT: st.param.b32 [func_retval0], %r12; @@ -55,7 +54,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -70,23 +69,22 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r12; @@ -140,7 +138,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -155,23 +153,22 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB1_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB1_1; ; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; @@ -182,7 +179,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -197,23 +194,22 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -269,7 +265,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -285,23 +281,22 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB2_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB2_1; ; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM30-NEXT: st.param.b32 [func_retval0], %r12; @@ -311,7 +306,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -327,23 +322,22 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r12; @@ -398,7 +392,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -414,23 +408,22 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB3_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB3_1; ; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; @@ -441,7 +434,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -457,23 +450,22 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -530,7 +522,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -546,23 +538,22 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB4_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB4_1; ; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; @@ -573,7 +564,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -589,23 +580,22 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index e6bce8991a71..04d1932a0abb 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O1 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O1 | %ptxas-verify %} define i32 @test1(i32 %n, i32 %m) { diff --git a/llvm/test/CodeGen/NVPTX/combine-min-max.ll b/llvm/test/CodeGen/NVPTX/combine-min-max.ll index e7140ab13d4b..c0550086b851 100644 --- a/llvm/test/CodeGen/NVPTX/combine-min-max.ll +++ b/llvm/test/CodeGen/NVPTX/combine-min-max.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -O3 | FileCheck %s --check-prefixes=CHECK,SM90 ; RUN: llc < %s -mcpu=sm_20 -O3 | FileCheck %s --check-prefixes=CHECK,SM20 -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx80 -O3 | %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_20 -O3 | %ptxas-verify %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx80 -O3 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 -O3 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/combine-wide.ll b/llvm/test/CodeGen/NVPTX/combine-wide.ll index ed4a2b6e419c..b5948d37c350 100644 --- a/llvm/test/CodeGen/NVPTX/combine-wide.ll +++ b/llvm/test/CodeGen/NVPTX/combine-wide.ll @@ -9,14 +9,15 @@ define i64 @t1(i32 %a, i32 %b, i64 %c) { ; O1-LABEL: t1( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t1_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t1_param_1]; -; O1-NEXT: ld.param.b64 %rd1, [t1_param_2]; -; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NEXT: ld.param.b64 %rd2, [t1_param_2]; +; O1-NEXT: add.s64 %rd3, %rd2, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t1( @@ -44,14 +45,15 @@ define i64 @t2(i32 %a, i32 %b, i64 %c) { ; O1-LABEL: t2( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t2_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t2_param_1]; -; O1-NEXT: ld.param.b64 %rd1, [t2_param_2]; -; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NEXT: ld.param.b64 %rd2, [t2_param_2]; +; O1-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t2( @@ -79,13 +81,14 @@ define i64 @t3(i32 %a, i32 %b) { ; O1-LABEL: t3( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<2>; +; O1-NEXT: .reg .b64 %rd<3>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t3_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t3_param_1]; -; O1-NEXT: mad.wide.s32 %rd1, %r1, %r2, 1; -; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NEXT: add.s64 %rd2, %rd1, 1; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; ; O1-NEXT: ret; ; ; O0-LABEL: t3( @@ -112,13 +115,14 @@ define i64 @t4(i32 %a, i64 %c) { ; O1-LABEL: t4( ; O1: { ; O1-NEXT: .reg .b32 %r<2>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t4_param_0]; ; O1-NEXT: ld.param.b64 %rd1, [t4_param_1]; -; O1-NEXT: mad.wide.s32 %rd2, %r1, 3, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.s32 %rd2, %r1, 3; +; O1-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t4( @@ -145,12 +149,13 @@ define i64 @t4_1(i32 %a, i64 %c) { ; O1-LABEL: t4_1( ; O1: { ; O1-NEXT: .reg .b32 %r<2>; -; O1-NEXT: .reg .b64 %rd<2>; +; O1-NEXT: .reg .b64 %rd<3>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t4_1_param_0]; -; O1-NEXT: mad.wide.s32 %rd1, %r1, 3, 5; -; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: mul.wide.s32 %rd1, %r1, 3; +; O1-NEXT: add.s64 %rd2, %rd1, 5; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; ; O1-NEXT: ret; ; ; O0-LABEL: t4_1( @@ -176,14 +181,15 @@ define i64 @t5(i32 %a, i32 %b, i64 %c) { ; O1-LABEL: t5( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t5_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t5_param_1]; -; O1-NEXT: ld.param.b64 %rd1, [t5_param_2]; -; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; O1-NEXT: ld.param.b64 %rd2, [t5_param_2]; +; O1-NEXT: add.s64 %rd3, %rd2, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t5( @@ -211,14 +217,15 @@ define i64 @t6(i32 %a, i32 %b, i64 %c) { ; O1-LABEL: t6( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; +; O1-NEXT: .reg .b64 %rd<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b32 %r1, [t6_param_0]; ; O1-NEXT: ld.param.b32 %r2, [t6_param_1]; -; O1-NEXT: ld.param.b64 %rd1, [t6_param_2]; -; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; O1-NEXT: ld.param.b64 %rd2, [t6_param_2]; +; O1-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NEXT: st.param.b64 [func_retval0], %rd3; ; O1-NEXT: ret; ; ; O0-LABEL: t6( @@ -932,14 +939,15 @@ define i32 @t32(i16 %a, i16 %b, i32 %c) { ; O1-LABEL: t32( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t32_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t32_param_1]; -; O1-NEXT: ld.param.b32 %r1, [t32_param_2]; -; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NEXT: ld.param.b32 %r2, [t32_param_2]; +; O1-NEXT: add.s32 %r3, %r2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t32( @@ -967,14 +975,15 @@ define i32 @t33(i16 %a, i16 %b, i32 %c) { ; O1-LABEL: t33( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t33_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t33_param_1]; -; O1-NEXT: ld.param.b32 %r1, [t33_param_2]; -; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NEXT: ld.param.b32 %r2, [t33_param_2]; +; O1-NEXT: add.s32 %r3, %r2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t33( @@ -1002,13 +1011,14 @@ define i32 @t34(i16 %a, i16 %b) { ; O1-LABEL: t34( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b32 %r<3>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t34_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t34_param_1]; -; O1-NEXT: mad.wide.s16 %r1, %rs1, %rs2, 1; -; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NEXT: add.s32 %r2, %r1, 1; +; O1-NEXT: st.param.b32 [func_retval0], %r2; ; O1-NEXT: ret; ; ; O0-LABEL: t34( @@ -1035,13 +1045,14 @@ define i32 @t35(i16 %a, i32 %c) { ; O1-LABEL: t35( ; O1: { ; O1-NEXT: .reg .b16 %rs<2>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t35_param_0]; ; O1-NEXT: ld.param.b32 %r1, [t35_param_1]; -; O1-NEXT: mad.wide.s16 %r2, %rs1, 3, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.s16 %r2, %rs1, 3; +; O1-NEXT: add.s32 %r3, %r1, %r2; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t35( @@ -1068,12 +1079,13 @@ define i32 @t36(i16 %a, i32 %c) { ; O1-LABEL: t36( ; O1: { ; O1-NEXT: .reg .b16 %rs<2>; -; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b32 %r<3>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t36_param_0]; -; O1-NEXT: mad.wide.s16 %r1, %rs1, 3, 5; -; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: mul.wide.s16 %r1, %rs1, 3; +; O1-NEXT: add.s32 %r2, %r1, 5; +; O1-NEXT: st.param.b32 [func_retval0], %r2; ; O1-NEXT: ret; ; ; O0-LABEL: t36( @@ -1099,14 +1111,15 @@ define i32 @t37(i16 %a, i16 %b, i32 %c) { ; O1-LABEL: t37( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t37_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t37_param_1]; -; O1-NEXT: ld.param.b32 %r1, [t37_param_2]; -; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2; +; O1-NEXT: ld.param.b32 %r2, [t37_param_2]; +; O1-NEXT: add.s32 %r3, %r2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t37( @@ -1134,14 +1147,15 @@ define i32 @t38(i16 %a, i16 %b, i32 %c) { ; O1-LABEL: t38( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b32 %r<4>; ; O1-EMPTY: ; O1-NEXT: // %bb.0: ; O1-NEXT: ld.param.b16 %rs1, [t38_param_0]; ; O1-NEXT: ld.param.b16 %rs2, [t38_param_1]; -; O1-NEXT: ld.param.b32 %r1, [t38_param_2]; -; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2; +; O1-NEXT: ld.param.b32 %r2, [t38_param_2]; +; O1-NEXT: add.s32 %r3, %r1, %r2; +; O1-NEXT: st.param.b32 [func_retval0], %r3; ; O1-NEXT: ret; ; ; O0-LABEL: t38( diff --git a/llvm/test/CodeGen/NVPTX/common-linkage.ll b/llvm/test/CodeGen/NVPTX/common-linkage.ll index 2ea5f7f9b09f..c5bf25be51e0 100644 --- a/llvm/test/CodeGen/NVPTX/common-linkage.ll +++ b/llvm/test/CodeGen/NVPTX/common-linkage.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -mattr=+ptx43 | FileCheck %s --check-prefixes CHECK,PTX43 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -mattr=+ptx50 | FileCheck %s --check-prefixes CHECK,PTX50 -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %} +; RUN: %if ptxas-isa-4.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %} +; RUN: %if ptxas-isa-5.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %} ; PTX43: .weak .global .align 4 .u32 g ; PTX50: .common .global .align 4 .u32 g diff --git a/llvm/test/CodeGen/NVPTX/compare-int.ll b/llvm/test/CodeGen/NVPTX/compare-int.ll index 9338172d024c..9c93d18508d0 100644 --- a/llvm/test/CodeGen/NVPTX/compare-int.ll +++ b/llvm/test/CodeGen/NVPTX/compare-int.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; These tests should run for all targets diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll index 6c80055ef467..3304f18473e7 100644 --- a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll +++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 %{ llc < %s -march=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} %struct.64 = type <{ i64 }> declare i64 @callee(ptr %p); diff --git a/llvm/test/CodeGen/NVPTX/convert-fp.ll b/llvm/test/CodeGen/NVPTX/convert-fp.ll index debaadedce09..59b33b1bce7a 100644 --- a/llvm/test/CodeGen/NVPTX/convert-fp.ll +++ b/llvm/test/CodeGen/NVPTX/convert-fp.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define i16 @cvt_u16_f32(float %x) { diff --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll index a2fc8da3f1e6..9e850e75aca4 100644 --- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; Integer conversions happen inplicitly by loading/storing the proper types diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll index 88d0f32065a7..a89b35cad358 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm100.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} declare i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1) declare i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1) diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll index c8b7014d7bc1..16bd0da8c6a0 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | FileCheck %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) { ; CHECK-LABEL: cvt_rn_sf_e2m3x2_f32( diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll index 9ddeb2bb9e94..edf1739ae992 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} define <2 x bfloat> @cvt_rn_bf16x2_f32(float %f1, float %f2) { diff --git a/llvm/test/CodeGen/NVPTX/convert-sm89.ll b/llvm/test/CodeGen/NVPTX/convert-sm89.ll index 30fd76f5a31c..616dcfa330e8 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm89.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm89.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_89 -mattr=+ptx81 | FileCheck %s -; RUN: %if ptxas-12.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_89 -mattr=+ptx81 | %ptxas-verify -arch=sm_89 %} +; RUN: %if ptxas-sm_89 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_89 -mattr=+ptx81 | %ptxas-verify -arch=sm_89 %} ; CHECK-LABEL: cvt_rn_e4m3x2_f32 define i16 @cvt_rn_e4m3x2_f32(float %f1, float %f2) { diff --git a/llvm/test/CodeGen/NVPTX/convert-sm90.ll b/llvm/test/CodeGen/NVPTX/convert-sm90.ll index c74ceac03d75..af88ede4b7fd 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm90.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} declare i32 @llvm.nvvm.f2tf32.rn(float %f1) declare i32 @llvm.nvvm.f2tf32.rn.relu(float %f1) diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll index 1e6b04635edd..a22f2165bdd1 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll index 5cfa25dfe55f..b5c43fd259a7 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll index a7e6bec6aef1..57342dc9a49c 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll index 843446a65862..a52fab6a9c73 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll index 9b4858036fca..1f4c62a33267 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll index 432540594c79..3863c19d8fd3 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll index ef4a8fb6ca72..6296d5af8ab1 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll index 112dab196406..e5ae3875a0ed 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll index 54e861eca30c..7d04adaa774c 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll index e0aceaf0901c..b0fe77c1a83b 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll index 6bf8f03f99ee..ccc3e94e5161 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll index cf166f83fb24..f5478db5102d 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll index 4045b8b2792e..2dac6c48ca86 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll index 2ef44ff643bf..037ecea665a5 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll index 3b5bd161896b..8684ac3709f9 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll index 46a026313d97..e800523b37ff 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll index ce72f5f52b8a..dca0a0d48005 100644 --- a/llvm/test/CodeGen/NVPTX/discard.ll +++ b/llvm/test/CodeGen/NVPTX/discard.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s
-; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-sm_80 && ptxas-isa-7.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index 1d70b9deb608..01cd70d1530b 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -o - -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index ce2f0f32a874..77141277dad2 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -4,8 +4,8 @@ ; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-32 ; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.3 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.3 %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %} ; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52. diff --git a/llvm/test/CodeGen/NVPTX/elect.ll b/llvm/test/CodeGen/NVPTX/elect.ll index b65fa5a6376e..a61d2da9b861 100644 --- a/llvm/test/CodeGen/NVPTX/elect.ll +++ b/llvm/test/CodeGen/NVPTX/elect.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/f16-abs.ll b/llvm/test/CodeGen/NVPTX/f16-abs.ll index 4025b38c0f0e..f5354a33a2c7 100644 --- a/llvm/test/CodeGen/NVPTX/f16-abs.ll +++ b/llvm/test/CodeGen/NVPTX/f16-abs.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefix CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ @@ -14,7 +14,7 @@ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefix CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ @@ -24,7 +24,7 @@ ; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefix CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_52 %{ \ ; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_52 \ @@ -34,7 +34,7 @@ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefix CHECK-F16-ABS %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 \ ; RUN: -O0 -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll index ae70946b4b1d..ee79f9d6d056 100644 --- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s -; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} target triple = "nvptx64-nvidia-cuda" declare half @llvm.nvvm.ex2.approx.f16(half) diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll index d4aec4f16f1a..4e2f7ea9e520 100644 --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -3,7 +3,7 @@ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: -mattr=+ptx60 \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-F16-NOFTZ %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 && ptxas-isa-6.0 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: -mattr=+ptx60 \ @@ -14,7 +14,7 @@ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: -denormal-fp-math-f32=preserve-sign -mattr=+ptx60 \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16-FTZ %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 && ptxas-isa-6.0 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: -denormal-fp-math-f32=preserve-sign -mattr=+ptx60 \ @@ -25,7 +25,7 @@ ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ ; RUN: -verify-machineinstrs -mattr=+ptx60 \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ ; RUN: | %ptxas-verify -arch=sm_53 \ @@ -34,7 +34,7 @@ ; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ ; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_52 %{ \ ; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ ; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_52 \ diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 7b2126870e31..e9143d540b04 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-F16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ @@ -13,7 +13,7 @@ ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ ; RUN: -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ ; RUN: -verify-machineinstrs \ @@ -23,7 +23,7 @@ ; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \ ; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_52 %{ \ ; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \ ; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_52 \ diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll index fd92375eb7b7..796d80d3c2c3 100644 --- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx-nvidia-cuda" declare float @llvm.nvvm.ex2.approx.f(float) diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll index 29dede097610..4f9e37044a64 100644 --- a/llvm/test/CodeGen/NVPTX/f32-lg2.ll +++ b/llvm/test/CodeGen/NVPTX/f32-lg2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 -mattr=+ptx32 | %ptxas-verify %} +; RUN: %if ptxas-isa-3.2 %{ llc < %s -mcpu=sm_20 -mattr=+ptx32 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" declare float @llvm.nvvm.lg2.approx.f(float) diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index 7ca16f702d8f..217bb483682f 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -2,13 +2,13 @@ ; ## Full FP32x2 support enabled by default. ; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-NOF32X2 %s -; RUN: %if ptxas-12.7 %{ \ +; RUN: %if ptxas-sm_80 %{ \ ; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_80 \ ; RUN: %} ; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-F32X2 %s -; RUN: %if ptxas-12.7 %{ \ +; RUN: %if ptxas-sm_100 %{ \ ; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ ; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ ; RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll index 30f9dcc27edb..18b535185e3f 100644 --- a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} target triple = "nvptx-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/fence-cluster.ll b/llvm/test/CodeGen/NVPTX/fence-cluster.ll index 1683ec138818..edaf8de3133c 100644 --- a/llvm/test/CodeGen/NVPTX/fence-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/fence-cluster.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} define void @fence_acquire_cluster() { ; SM90-LABEL: fence_acquire_cluster( diff --git a/llvm/test/CodeGen/NVPTX/fence-nocluster.ll b/llvm/test/CodeGen/NVPTX/fence-nocluster.ll index 1c6c1744b537..20f1df4d368e 100644 --- a/llvm/test/CodeGen/NVPTX/fence-nocluster.ll +++ b/llvm/test/CodeGen/NVPTX/fence-nocluster.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -mattr=+ptx50 | FileCheck %s --check-prefix=SM30 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_35 -mattr=+ptx50 | %ptxas-verify -arch=sm_35 %} +; RUN: %if ptxas-sm_35 && ptxas-isa-5.0 %{ llc < %s -march=nvptx64 -mcpu=sm_35 -mattr=+ptx50 | %ptxas-verify -arch=sm_35 %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.0 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} define void @fence_acquire_sys() { ; SM30-LABEL: fence_acquire_sys( diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll index dde983d3712f..636280da07ab 100644 --- a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %} ; CHECK-LABEL: test_fence_proxy_tensormap_generic_release define void @test_fence_proxy_tensormap_generic_release() { diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll index 391aa453f075..d9e82cc372e2 100644 --- a/llvm/test/CodeGen/NVPTX/fexp2.ll +++ b/llvm/test/CodeGen/NVPTX/fexp2.ll @@ -2,9 +2,9 @@ ; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s ; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-BF16 %s -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" ; --- f32 --- diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll index acac5a8da4e1..4aafc986db1d 100644 --- a/llvm/test/CodeGen/NVPTX/flog2.ll +++ b/llvm/test/CodeGen/NVPTX/flog2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx64-nvidia-cuda" ; CHECK-LABEL: log2_test diff --git a/llvm/test/CodeGen/NVPTX/fma-disable.ll b/llvm/test/CodeGen/NVPTX/fma-disable.ll index 0038b4b65e0f..e94192b2e5d5 100644 --- a/llvm/test/CodeGen/NVPTX/fma-disable.ll +++ b/llvm/test/CodeGen/NVPTX/fma-disable.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %} -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll index a18215221fb4..96cdb7651a5c 100644 --- a/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/NVPTX/fminimum-fmaximum.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s --check-prefixes=CHECK ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %} -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} ; ---- minimum ---- diff --git a/llvm/test/CodeGen/NVPTX/fns.ll b/llvm/test/CodeGen/NVPTX/fns.ll index b153e298bbff..f003bc1a95f2 100644 --- a/llvm/test/CodeGen/NVPTX/fns.ll +++ b/llvm/test/CodeGen/NVPTX/fns.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare i32 @llvm.nvvm.fns(i32, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/fold-movs.ll b/llvm/test/CodeGen/NVPTX/fold-movs.ll index 6ee0fb2eeed2..10e31f5d97ef 100644 --- a/llvm/test/CodeGen/NVPTX/fold-movs.ll +++ b/llvm/test/CodeGen/NVPTX/fold-movs.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \ ; RUN: -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck %s --check-prefixes=CHECK-F32X2 -; RUN: %if ptxas-12.7 %{ \ +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ \ ; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \ ; RUN: -frame-pointer=all -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ ; RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll index dc0ec0ff7bb0..c4d4dfcc618d 100644 --- a/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll +++ b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefixes=CHECK,DEFAULT -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %} +; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %} +; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %} target triple = "nvptx64-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/global-addrspace.ll b/llvm/test/CodeGen/NVPTX/global-addrspace.ll index 3f9d321ab440..23f874781b7b 100644 --- a/llvm/test/CodeGen/NVPTX/global-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/global-addrspace.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; PTX32: .visible .global .align 4 .u32 i; diff --git a/llvm/test/CodeGen/NVPTX/global-ordering.ll b/llvm/test/CodeGen/NVPTX/global-ordering.ll index 2815cff7d7b4..5f598287234e 100644 --- a/llvm/test/CodeGen/NVPTX/global-ordering.ll +++ b/llvm/test/CodeGen/NVPTX/global-ordering.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Make sure we emit these globals in def-use order diff --git a/llvm/test/CodeGen/NVPTX/griddepcontrol.ll b/llvm/test/CodeGen/NVPTX/griddepcontrol.ll index 0bf9196aa290..5b28d42b9f10 100644 --- a/llvm/test/CodeGen/NVPTX/griddepcontrol.ll +++ b/llvm/test/CodeGen/NVPTX/griddepcontrol.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_90 -march=nvptx64 | FileCheck %s -; RUN: %if ptxas-11.8 %{ llc < %s -mcpu=sm_90 -march=nvptx64 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 %{ llc < %s -mcpu=sm_90 -march=nvptx64 | %ptxas-verify -arch=sm_90 %} define void @griddepcontrol() { ; CHECK-LABEL: griddepcontrol( diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 7f48245af4a2..5d40192fa153 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,I16x2 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_90 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_90 \ @@ -12,7 +12,7 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,NO-I16x2 %s -; RUN: %if ptxas %{ \ +; RUN: %if ptxas-sm_53 %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll index f4053d84593a..db19495b5a4b 100644 --- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll @@ -132,5 +132,120 @@ define <2 x float> @test_uitofp_2xi8(<2 x i8> %a) { %1 = uitofp <2 x i8> %a to <2 x float> ret <2 x float> %1 } + +define void @test_store_i8x2_unaligned(ptr %ptr, <2 x i8> %a) { +; O0-LABEL: test_store_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_param_0]; +; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_store_i8x2_unaligned_param_1]; +; O0-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; O0-NEXT: st.b8 [%rd1+1], %rs2; +; O0-NEXT: st.b8 [%rd1], %rs1; +; O0-NEXT: ret; +; +; O3-LABEL: test_store_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_param_0]; +; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_store_i8x2_unaligned_param_1]; +; O3-NEXT: st.b8 [%rd1+1], %rs2; +; O3-NEXT: st.b8 [%rd1], %rs1; +; O3-NEXT: ret; + store <2 x i8> %a, ptr %ptr, align 1 + ret void +} + +define void @test_store_i8x2_unaligned_immediate(ptr %ptr) { +; O0-LABEL: test_store_i8x2_unaligned_immediate( +; O0: { +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_immediate_param_0]; +; O0-NEXT: st.b8 [%rd1+1], 2; +; O0-NEXT: st.b8 [%rd1], 1; +; O0-NEXT: ret; +; +; O3-LABEL: test_store_i8x2_unaligned_immediate( +; O3: { +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_immediate_param_0]; +; O3-NEXT: st.b8 [%rd1+1], 2; +; O3-NEXT: st.b8 [%rd1], 1; +; O3-NEXT: ret; + store <2 x i8> <i8 1, i8 2>, ptr %ptr, align 1 + ret void +} + +define i32 @test_zext_load_i8x2_unaligned(ptr %ptr) { +; O0-LABEL: test_zext_load_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_zext_load_i8x2_unaligned_param_0]; +; O0-NEXT: ld.b8 %rs1, [%rd1+1]; +; O0-NEXT: ld.b8 %rs2, [%rd1]; +; O0-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O0-NEXT: ret; +; +; O3-LABEL: test_zext_load_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_zext_load_i8x2_unaligned_param_0]; +; O3-NEXT: ld.b8 %rs1, [%rd1+1]; +; O3-NEXT: ld.b8 %rs2, [%rd1]; +; O3-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O3-NEXT: ret; + %a = load <2 x i8>, ptr %ptr, align 1 + %b = zext <2 x i8> %a to <2 x i16> + %c = bitcast <2 x i16> %b to i32 + ret i32 %c +} + +define i32 @test_sext_load_i8x2_unaligned(ptr %ptr) { +; O0-LABEL: test_sext_load_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_sext_load_i8x2_unaligned_param_0]; +; O0-NEXT: ld.s8 %rs1, [%rd1+1]; +; O0-NEXT: ld.s8 %rs2, [%rd1]; +; O0-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O0-NEXT: ret; +; +; O3-LABEL: test_sext_load_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_sext_load_i8x2_unaligned_param_0]; +; O3-NEXT: ld.s8 %rs1, [%rd1+1]; +; O3-NEXT: ld.s8 %rs2, [%rd1]; +; O3-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O3-NEXT: ret; + %a = load <2 x i8>, ptr %ptr, align 1 + %b = sext <2 x i8> %a to <2 x i16> + %c = bitcast <2 x i16> %b to i32 + ret i32 %c +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; COMMON: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll index a3bf8922a98f..87c5ab27ecf9 100644 --- a/llvm/test/CodeGen/NVPTX/idioms.ll +++ b/llvm/test/CodeGen/NVPTX/idioms.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} %struct.S16 = type { i16, i16 } diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll index 673fb7394826..e1fecdb76bd4 100644 --- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll +++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll index 307e2c855091..fd8aeff70c1f 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll index 52bd51b3ef7f..e4ca0cb71e7b 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index bf0dd58e27a3..02a75d516811 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s -; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll index f595df837f91..01cdacb6ca15 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_device i32 @test_tid_x() { diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll index a7ab358dc07f..e2a01dc4e0b0 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} ; CHECK-LABEL: test_isspacep define i1 @test_isspacep_shared_cluster(ptr %p) { diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll index 4ed50632251c..00eb8e293e0f 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify %} +; RUN: %if ptxas-sm_60 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} define float @test_fabsf(float %f) { ; CHECK-LABEL: test_fabsf( @@ -267,6 +267,23 @@ define i64 @test_globaltimer() { ret i64 %ret } +define i32 @test_globaltimer_lo(){ +; CHECK-LABEL: test_globaltimer_lo( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u32 %r1, %globaltimer_lo; +; CHECK-NEXT: mov.u32 %r2, %globaltimer_lo; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %a = tail call i32 @llvm.nvvm.read.ptx.sreg.globaltimer.lo() + %b = tail call i32 @llvm.nvvm.read.ptx.sreg.globaltimer.lo() + %ret = add i32 %a, %b + ret i32 %ret +} + define i64 @test_cyclecounter() { ; CHECK-LABEL: test_cyclecounter( ; CHECK: { diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll index a56b85de8014..b66b843f4b83 100644 --- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %} %struct.Large = type { [16 x double] } diff --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll index 24071b48143f..c3fd2887d71f 100644 --- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll index ee304ca1601f..628fb499441f 100644 --- a/llvm/test/CodeGen/NVPTX/ld-generic.ll +++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py index 2fa4c89f4d71..4b566b2b52a0 100644 --- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py +++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py @@ -4,7 +4,7 @@ # RUN: %python %s > %t.ll # RUN: llc < %t.ll -mtriple=nvptx -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P32 %t.ll # RUN: llc < %t.ll -mtriple=nvptx64 -mcpu=sm_30 | FileCheck -check-prefixes=CHECK,CHECK_P64 %t.ll -# RUN: %if ptxas && !ptxas-12.0 %{ llc < %t.ll -mtriple=nvptx -mcpu=sm_30 | %ptxas-verify %} +# RUN: %if ptxas-ptr32 %{ llc < %t.ll -mtriple=nvptx -mcpu=sm_30 | %ptxas-verify %} # RUN: %if ptxas %{ llc < %t.ll -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %} from __future__ import print_function diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll index 6e42e0006af3..d219493d2b31 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 -verify-machineinstrs | FileCheck %s -check-prefixes=SM90 -; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 -verify-machineinstrs | FileCheck %s -check-prefixes=SM100 -; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; For 256-bit vectors, check that invariant loads from the ; global addrspace are lowered to ld.global.nc. diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index 187ccc9cd89f..12e3287e73f0 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX -; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; In this test, we check that all the addressing modes are lowered correctly ; for 256-bit invariant loads, which get lowered to ld.global.nc diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll index a17df1ee3988..b7fa1dd5f2c4 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX -; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; In this test, we check that all the addressing modes are lowered correctly, ; addr can be any of the following: diff --git a/llvm/test/CodeGen/NVPTX/load-store-atomic.err.ll b/llvm/test/CodeGen/NVPTX/load-store-atomic.err.ll new file mode 100644 index 000000000000..31889e25142a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-atomic.err.ll @@ -0,0 +1,10 @@ +; RUN: not llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 2>&1 | FileCheck %s + +; CHECK: error: unsupported atomic store +; CHECK: error: unsupported atomic load + +define void @test_i256_global_atomic(ptr addrspace(1) %a, ptr addrspace(1) %b) { + %a.load = load atomic i256, ptr addrspace(1) %a seq_cst, align 32 + store atomic i256 %a.load, ptr addrspace(1) %b seq_cst, align 32 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll index bac59be5158e..09c18b627fac 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70 -; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} ; TODO: generate PTX that preserves Concurrent Forward Progress ; for atomic operations to local statespace diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 2ffefd0cf461..7373b50477d2 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-8.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} ; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;" ; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;" diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll index ed170e92917f..5e85e989a2fd 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} ; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;" ; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;" diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index a846607d816c..e8b43ad28ad2 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck -check-prefixes=CHECK,SM90 %s -; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100 -; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; This test is based on load-store-vectors.ll, ; and contains testing for lowering 256-bit vector loads/stores @@ -1506,3 +1506,98 @@ define void @local_volatile_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) { store volatile <4 x double> %a.load, ptr addrspace(5) %b ret void } + +define void @test_i256_global(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: test_i256_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [test_i256_global_param_0]; +; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd6, [test_i256_global_param_1]; +; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ret; +; +; SM100-LABEL: test_i256_global( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [test_i256_global_param_0]; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [test_i256_global_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; SM100-NEXT: ret; + %a.load = load i256, ptr addrspace(1) %a, align 32 + store i256 %a.load, ptr addrspace(1) %b, align 32 + ret void +} + + +define void @test_i256_global_unaligned(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; CHECK-LABEL: test_i256_global_unaligned( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_i256_global_unaligned_param_0]; +; CHECK-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [test_i256_global_unaligned_param_1]; +; CHECK-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ret; + %a.load = load i256, ptr addrspace(1) %a, align 16 + store i256 %a.load, ptr addrspace(1) %b, align 16 + ret void +} + +define void @test_i256_generic(ptr %a, ptr %b) { +; CHECK-LABEL: test_i256_generic( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_i256_generic_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [test_i256_generic_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ret; + %a.load = load i256, ptr %a, align 32 + store i256 %a.load, ptr %b, align 32 + ret void +} + +define void @test_i256_global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: test_i256_global_volatile( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [test_i256_global_volatile_param_0]; +; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd6, [test_i256_global_volatile_param_1]; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ret; +; +; SM100-LABEL: test_i256_global_volatile( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [test_i256_global_volatile_param_0]; +; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [test_i256_global_volatile_param_1]; +; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; SM100-NEXT: ret; + %a.load = load volatile i256, ptr addrspace(1) %a, align 32 + store volatile i256 %a.load, ptr addrspace(1) %b, align 32 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll index f7137e05a5e4..9dac46cb4900 100644 --- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll +++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; Ensure we access the local stack properly @@ -114,14 +114,15 @@ define void @foo3(i32 %a) { ; PTX64-NEXT: .reg .b64 %SP; ; PTX64-NEXT: .reg .b64 %SPL; ; PTX64-NEXT: .reg .b32 %r<2>; -; PTX64-NEXT: .reg .b64 %rd<3>; +; PTX64-NEXT: .reg .b64 %rd<4>; ; PTX64-EMPTY: ; PTX64-NEXT: // %bb.0: ; PTX64-NEXT: mov.b64 %SPL, __local_depot2; ; PTX64-NEXT: ld.param.b32 %r1, [foo3_param_0]; ; PTX64-NEXT: add.u64 %rd1, %SPL, 0; -; PTX64-NEXT: mad.wide.s32 %rd2, %r1, 4, %rd1; -; PTX64-NEXT: st.local.b32 [%rd2], %r1; +; PTX64-NEXT: mul.wide.s32 %rd2, %r1, 4; +; PTX64-NEXT: add.s64 %rd3, %rd1, %rd2; +; PTX64-NEXT: st.local.b32 [%rd3], %r1; ; PTX64-NEXT: ret; %local = alloca [3 x i32], align 4 %1 = getelementptr inbounds i32, ptr %local, i32 %a diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index 8adde4ceefbf..01ab47145940 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -49,14 +49,14 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly ; PTX-NEXT: st.param.b32 [func_retval0], %r10; ; PTX-NEXT: ret; entry: - %a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr), !dbg !17 - %idx.ext = sext i32 %c to i64, !dbg !18 - %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext, !dbg !18 - %0 = load i32, ptr %add.ptr, align 1, !dbg !19 - ret i32 %0, !dbg !23 + %a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr) + %idx.ext = sext i32 %c to i64 + %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext + %0 = load i32, ptr %add.ptr, align 1 + ret i32 %0 } -define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) { +define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 "nvvm.grid_constant" %input1, i32 %input2, ptr %out, i32 %n) { ; PTX-LABEL: grid_const_int( ; PTX: { ; PTX-NEXT: .reg .b32 %r<4>; @@ -71,7 +71,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu ; PTX-NEXT: st.global.b32 [%rd2], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_int( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[INPUT11:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]] @@ -85,7 +85,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu %struct.s = type { i32, i32 } -define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){ +define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %out){ ; PTX-LABEL: grid_const_struct( ; PTX: { ; PTX-NEXT: .reg .b32 %r<4>; @@ -100,7 +100,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p ; PTX-NEXT: st.global.b32 [%rd2], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_struct( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[INPUT1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 ; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 @@ -118,7 +118,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p ret void } -define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { +define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input) { ; PTX-LABEL: grid_const_escape( ; PTX: { ; PTX-NEXT: .reg .b64 %rd<4>; @@ -136,7 +136,7 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-NEXT: } // callseq 0 ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) @@ -145,7 +145,7 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ret void } -define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) { +define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, i32 %a, ptr byval(i32) align 4 "nvvm.grid_constant" %b) { ; PTX-LABEL: multiple_grid_const_escape( ; PTX: { ; PTX-NEXT: .local .align 4 .b8 __local_depot4[4]; @@ -179,7 +179,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: } // callseq 1 ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 "nvvm.grid_constant" [[B:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]]) ; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) @@ -194,7 +194,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ret void } -define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) { +define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %addr) { ; PTX-LABEL: grid_const_memory_escape( ; PTX: { ; PTX-NEXT: .reg .b64 %rd<5>; @@ -207,7 +207,7 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i ; PTX-NEXT: st.global.b64 [%rd3], %rd4; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8 @@ -216,7 +216,7 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i ret void } -define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) { +define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input, ptr %result) { ; PTX-LABEL: grid_const_inlineasm_escape( ; PTX: { ; PTX-NEXT: .reg .b64 %rd<7>; @@ -234,7 +234,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: ret; ; PTX-NOT .local ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 @@ -249,7 +249,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 ret void } -define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { +define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) "nvvm.grid_constant" %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escape( ; PTX: { ; PTX-NEXT: .reg .b32 %r<3>; @@ -273,7 +273,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou ; PTX-NEXT: } // callseq 2 ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape( -; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval(i32) "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT1_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4 @@ -288,7 +288,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou ret void } -define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) { +define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) "nvvm.grid_constant" %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escapemem( ; PTX: { ; PTX-NEXT: .reg .b32 %r<4>; @@ -314,7 +314,7 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-NEXT: st.param.b32 [func_retval0], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) "nvvm.grid_constant" [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 @@ -335,7 +335,7 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ret i32 %add } -define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) { +define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input1, ptr %inout) { ; PTX-LABEL: grid_const_phi( ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; @@ -356,7 +356,7 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr ; PTX-NEXT: st.global.b32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 @@ -391,7 +391,7 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { +define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 "nvvm.grid_constant" %input1, ptr byval(%struct.s) %input2, ptr %inout) { ; PTX-LABEL: grid_const_phi_ngc( ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; @@ -413,7 +413,7 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ; PTX-NEXT: st.global.b32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) @@ -449,7 +449,7 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) { +define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 "nvvm.grid_constant" %input1, ptr byval(i32) %input2, ptr %inout) { ; PTX-LABEL: grid_const_select( ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; @@ -468,7 +468,7 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by ; PTX-NEXT: st.global.b32 [%rd3], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_select( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) @@ -487,7 +487,7 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by ret void } -define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) { +define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) "nvvm.grid_constant" %input) { ; PTX-LABEL: grid_const_ptrtoint( ; PTX: { ; PTX-NEXT: .reg .b32 %r<4>; @@ -502,7 +502,7 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) { ; PTX-NEXT: st.param.b32 [func_retval0], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4 ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[INPUT2]] to ptr @@ -517,9 +517,9 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) { declare void @device_func(ptr byval(i32) align 4) -define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { +define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 "nvvm.grid_constant" %input) { ; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-SAME: ptr byval(i32) align 4 "nvvm.grid_constant" [[INPUT:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[INPUT_PARAM:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr ; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]]) @@ -545,45 +545,3 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { declare dso_local void @dummy() local_unnamed_addr declare dso_local ptr @escape(ptr) local_unnamed_addr declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr - -!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24} - -!0 = !{ptr @grid_const_int, !"grid_constant", !1} -!1 = !{i32 1} - -!2 = !{ptr @grid_const_struct, !"grid_constant", !3} -!3 = !{i32 1} - -!4 = !{ptr @grid_const_escape, !"grid_constant", !5} -!5 = !{i32 1} - -!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7} -!7 = !{i32 1, i32 3} - -!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9} -!9 = !{i32 1} - -!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11} -!11 = !{i32 1} - -!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13} -!13 = !{i32 1} - -!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15} -!15 = !{i32 1} - -!16 = !{ptr @grid_const_phi, !"grid_constant", !17} -!17 = !{i32 1} - -!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19} -!19 = !{i32 1} - -!20 = !{ptr @grid_const_select, !"grid_constant", !21} -!21 = !{i32 1} - -!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23} -!23 = !{i32 1} - -!24 = !{ptr @test_forward_byval_arg, !"grid_constant", !25} -!25 = !{i32 1} - diff --git a/llvm/test/CodeGen/NVPTX/managed.ll b/llvm/test/CodeGen/NVPTX/managed.ll index 0b94843c76ea..931c17d5ba80 100644 --- a/llvm/test/CodeGen/NVPTX/managed.ll +++ b/llvm/test/CodeGen/NVPTX/managed.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify %} +; RUN: %if ptxas-isa-4.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx40 | %ptxas-verify %} ; RUN: not --crash llc < %s -mtriple=nvptx64 -mcpu=sm_20 2>&1 | FileCheck %s --check-prefix ERROR ; ERROR: LLVM ERROR: .attribute(.managed) requires PTX version >= 4.0 and sm_30 diff --git a/llvm/test/CodeGen/NVPTX/match.ll b/llvm/test/CodeGen/NVPTX/match.ll index ae01b0d3cc7e..0b459a169aa4 100644 --- a/llvm/test/CodeGen/NVPTX/match.ll +++ b/llvm/test/CodeGen/NVPTX/match.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32) declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll index 236bf67f8182..ff0cf3eaafac 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm53-ptx42.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify -arch=sm_53 %} +; RUN: %if ptxas-sm_53 && ptxas-isa-4.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify -arch=sm_53 %} declare half @llvm.nvvm.fma.rn.f16(half, half, half) declare half @llvm.nvvm.fma.rn.ftz.f16(half, half, half) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll index c04fd07ec5da..7b5bfed98515 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare bfloat @llvm.nvvm.abs.bf16(bfloat) declare <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat>) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll index 79b7f429f52b..fe2cb16a9413 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare bfloat @llvm.nvvm.abs.bf16(bfloat) declare <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat>) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll index 5d9b8fe3dc46..0ebbd13fbb00 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | FileCheck %s -; RUN: %if ptxas-11.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} +; RUN: %if ptxas-sm_86 && ptxas-isa-7.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} ; CHECK-LABEL: fmin_xorsign_abs_f16 define half @fmin_xorsign_abs_f16(half %0, half %1) { diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll index 2ca9d070737d..0e3ac828e5d4 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | FileCheck %s -; RUN: %if ptxas-11.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} +; RUN: %if ptxas-sm_86 && ptxas-isa-7.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | %ptxas-verify -arch=sm_86 %} declare half @llvm.nvvm.fmin.xorsign.abs.f16(half, half) declare half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half, half) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll index e9635e939398..5a55fa97033b 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 | FileCheck %s --check-prefixes=CHECK,CHECK-F16 ; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-SM80-NOF16 ; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} -; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %} target triple = "nvptx64-nvidia-cuda" @@ -42,6 +42,14 @@ declare half @llvm.maximum.f16(half, half) #0 declare float @llvm.maximum.f32(float, float) #0 declare double @llvm.maximum.f64(double, double) #0 declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) #0 +declare half @llvm.minimumnum.f16(half, half) #0 +declare float @llvm.minimumnum.f32(float, float) #0 +declare double @llvm.minimumnum.f64(double, double) #0 +declare <2 x half> @llvm.minimumnum.v2f16(<2 x half>, <2 x half>) #0 +declare half @llvm.maximumnum.f16(half, half) #0 +declare float @llvm.maximumnum.f32(float, float) #0 +declare double @llvm.maximumnum.f64(double, double) #0 +declare <2 x half> @llvm.maximumnum.v2f16(<2 x half>, <2 x half>) #0 declare float @llvm.fma.f32(float, float, float) #0 declare double @llvm.fma.f64(double, double, double) #0 @@ -1486,6 +1494,410 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ret <2 x half> %x } +; ---- minimumnum ---- + +define half @minimumnum_half(half %a, half %b) { +; CHECK-NOF16-LABEL: minimumnum_half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: minimumnum_half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-F16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-F16-NEXT: min.f16 %rs3, %rs1, %rs2; +; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: minimumnum_half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimumnum_half_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minimumnum_half_param_1]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call half @llvm.minimumnum.f16(half %a, half %b) + ret half %x +} + +define float @minimumnum_float(float %a, float %b) { +; CHECK-LABEL: minimumnum_float( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_param_1]; +; CHECK-NEXT: min.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.minimumnum.f32(float %a, float %b) + ret float %x +} + +define float @minimumnum_float_ftz(float %a, float %b) #1 { +; CHECK-LABEL: minimumnum_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [minimumnum_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [minimumnum_float_ftz_param_1]; +; CHECK-NEXT: min.ftz.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.minimumnum.f32(float %a, float %b) + ret float %x +} + +define double @minimumnum_double(double %a, double %b) { +; CHECK-LABEL: minimumnum_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [minimumnum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [minimumnum_double_param_1]; +; CHECK-NEXT: min.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %x = call double @llvm.minimumnum.f64(double %a, double %b) + ret double %x +} + +; TODO Improve the "Expand" path for minimumnum vectors on targets where +; f16 is not supported. Ideally it should use two f32 minimumnums first instead of +; fully expanding the minimumnum instruction into compare/select instructions. +define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) { +; CHECK-NOF16-LABEL: minimumnum_v2half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; +; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: minimumnum_v2half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [minimumnum_v2half_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [minimumnum_v2half_param_1]; +; CHECK-F16-NEXT: min.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: minimumnum_v2half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %x +} + +; ---- maximumnum ---- + +define half @maximumnum_half(half %a, half %b) { +; CHECK-NOF16-LABEL: maximumnum_half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: maximumnum_half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-F16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-F16-NEXT: max.f16 %rs3, %rs1, %rs2; +; CHECK-F16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: maximumnum_half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maximumnum_half_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [maximumnum_half_param_1]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; +; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call half @llvm.maximumnum.f16(half %a, half %b) + ret half %x +} + +define float @maximumnum_float(float %a, float %b) { +; CHECK-LABEL: maximumnum_float( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_param_1]; +; CHECK-NEXT: max.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.maximumnum.f32(float %a, float %b) + ret float %x +} + +define float @maximumnum_float_ftz(float %a, float %b) #1 { +; CHECK-LABEL: maximumnum_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [maximumnum_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [maximumnum_float_ftz_param_1]; +; CHECK-NEXT: max.ftz.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %x = call float @llvm.maximumnum.f32(float %a, float %b) + ret float %x +} + +define double @maximumnum_double(double %a, double %b) { +; CHECK-LABEL: maximumnum_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [maximumnum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [maximumnum_double_param_1]; +; CHECK-NEXT: max.f64 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %x = call double @llvm.maximumnum.f64(double %a, double %b) + ret double %x +} + +; TODO Improve the "Expand" path for maximumnum vectors on targets where +; f16 is not supported. Ideally it should use two f32 maximumnums first instead of +; fully expanding the maximumnum instruction into compare/select instructions. +define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) { +; CHECK-NOF16-LABEL: maximumnum_v2half( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-LABEL: maximumnum_v2half( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [maximumnum_v2half_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [maximumnum_v2half_param_1]; +; CHECK-F16-NEXT: max.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-SM80-NOF16-LABEL: maximumnum_v2half( +; CHECK-SM80-NOF16: { +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-EMPTY: +; CHECK-SM80-NOF16-NEXT: // %bb.0: +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; +; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; +; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; +; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; +; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: ret; + %x = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %x +} + ; ---- fma ---- define float @fma_float(float %a, float %b, float %c) { diff --git a/llvm/test/CodeGen/NVPTX/mbarrier.ll b/llvm/test/CodeGen/NVPTX/mbarrier.ll index 87a73aa4d4e2..78edc0aa2db5 100644 --- a/llvm/test/CodeGen/NVPTX/mbarrier.ll +++ b/llvm/test/CodeGen/NVPTX/mbarrier.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64 -; RUN: %if ptxas-11.0 && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.mbarrier.init(ptr %a, i32 %b) declare void @llvm.nvvm.mbarrier.init.shared(ptr addrspace(3) %a, i32 %b) diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll index 0039370e6dcf..be6d1581bc46 100644 --- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -185,44 +185,40 @@ define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) { define void @s1(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s1( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<18>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s1_param_1]; -; CHECK-NEXT: cvt.u64.u32 %rd2, %r4; -; CHECK-NEXT: st.b8 [%rd1+12], %rd2; -; CHECK-NEXT: cvt.u64.u32 %rd3, %r3; -; CHECK-NEXT: st.b8 [%rd1+8], %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd4, %r2; -; CHECK-NEXT: st.b8 [%rd1+4], %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd5, %r1; -; CHECK-NEXT: st.b8 [%rd1], %rd5; -; CHECK-NEXT: shr.u64 %rd6, %rd2, 24; -; CHECK-NEXT: st.b8 [%rd1+15], %rd6; -; CHECK-NEXT: shr.u64 %rd7, %rd2, 16; -; CHECK-NEXT: st.b8 [%rd1+14], %rd7; -; CHECK-NEXT: shr.u64 %rd8, %rd2, 8; -; CHECK-NEXT: st.b8 [%rd1+13], %rd8; -; CHECK-NEXT: shr.u64 %rd9, %rd3, 24; -; CHECK-NEXT: st.b8 [%rd1+11], %rd9; -; CHECK-NEXT: shr.u64 %rd10, %rd3, 16; -; CHECK-NEXT: st.b8 [%rd1+10], %rd10; -; CHECK-NEXT: shr.u64 %rd11, %rd3, 8; -; CHECK-NEXT: st.b8 [%rd1+9], %rd11; -; CHECK-NEXT: shr.u64 %rd12, %rd4, 24; -; CHECK-NEXT: st.b8 [%rd1+7], %rd12; -; CHECK-NEXT: shr.u64 %rd13, %rd4, 16; -; CHECK-NEXT: st.b8 [%rd1+6], %rd13; -; CHECK-NEXT: shr.u64 %rd14, %rd4, 8; -; CHECK-NEXT: st.b8 [%rd1+5], %rd14; -; CHECK-NEXT: shr.u64 %rd15, %rd5, 24; -; CHECK-NEXT: st.b8 [%rd1+3], %rd15; -; CHECK-NEXT: shr.u64 %rd16, %rd5, 16; -; CHECK-NEXT: st.b8 [%rd1+2], %rd16; -; CHECK-NEXT: shr.u64 %rd17, %rd5, 8; -; CHECK-NEXT: st.b8 [%rd1+1], %rd17; +; CHECK-NEXT: st.b8 [%rd1+12], %r4; +; CHECK-NEXT: st.b8 [%rd1+8], %r3; +; CHECK-NEXT: st.b8 [%rd1+4], %r2; +; CHECK-NEXT: st.b8 [%rd1], %r1; +; CHECK-NEXT: shr.u32 %r5, %r4, 24; +; CHECK-NEXT: st.b8 [%rd1+15], %r5; +; CHECK-NEXT: shr.u32 %r6, %r4, 16; +; CHECK-NEXT: st.b8 [%rd1+14], %r6; +; CHECK-NEXT: shr.u32 %r7, %r4, 8; +; CHECK-NEXT: st.b8 [%rd1+13], %r7; +; CHECK-NEXT: shr.u32 %r8, %r3, 24; +; CHECK-NEXT: st.b8 [%rd1+11], %r8; +; CHECK-NEXT: shr.u32 %r9, %r3, 16; +; CHECK-NEXT: st.b8 [%rd1+10], %r9; +; CHECK-NEXT: shr.u32 %r10, %r3, 8; +; CHECK-NEXT: st.b8 [%rd1+9], %r10; +; CHECK-NEXT: shr.u32 %r11, %r2, 24; +; CHECK-NEXT: st.b8 [%rd1+7], %r11; +; CHECK-NEXT: shr.u32 %r12, %r2, 16; +; CHECK-NEXT: st.b8 [%rd1+6], %r12; +; CHECK-NEXT: shr.u32 %r13, %r2, 8; +; CHECK-NEXT: st.b8 [%rd1+5], %r13; +; CHECK-NEXT: shr.u32 %r14, %r1, 24; +; CHECK-NEXT: st.b8 [%rd1+3], %r14; +; CHECK-NEXT: shr.u32 %r15, %r1, 16; +; CHECK-NEXT: st.b8 [%rd1+2], %r15; +; CHECK-NEXT: shr.u32 %r16, %r1, 8; +; CHECK-NEXT: st.b8 [%rd1+1], %r16; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 1 ret void diff --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll index 17220340d4b0..bde57fb7b95b 100644 --- a/llvm/test/CodeGen/NVPTX/mulwide.ll +++ b/llvm/test/CodeGen/NVPTX/mulwide.ll @@ -118,17 +118,15 @@ define i32 @mulwideu8(i8 %a, i8 %b) { ; NOOPT-LABEL: mulwideu8( ; NOOPT: { ; NOOPT-NEXT: .reg .b16 %rs<3>; -; NOOPT-NEXT: .reg .b32 %r<6>; +; NOOPT-NEXT: .reg .b32 %r<4>; ; NOOPT-EMPTY: ; NOOPT-NEXT: // %bb.0: ; NOOPT-NEXT: ld.param.b8 %rs2, [mulwideu8_param_1]; ; NOOPT-NEXT: ld.param.b8 %rs1, [mulwideu8_param_0]; ; NOOPT-NEXT: cvt.u32.u16 %r1, %rs1; -; NOOPT-NEXT: and.b32 %r2, %r1, 255; -; NOOPT-NEXT: cvt.u32.u16 %r3, %rs2; -; NOOPT-NEXT: and.b32 %r4, %r3, 255; -; NOOPT-NEXT: mul.lo.s32 %r5, %r2, %r4; -; NOOPT-NEXT: st.param.b32 [func_retval0], %r5; +; NOOPT-NEXT: cvt.u32.u16 %r2, %rs2; +; NOOPT-NEXT: mul.lo.s32 %r3, %r1, %r2; +; NOOPT-NEXT: st.param.b32 [func_retval0], %r3; ; NOOPT-NEXT: ret; %val0 = zext i8 %a to i32 %val1 = zext i8 %b to i32 diff --git a/llvm/test/CodeGen/NVPTX/nanosleep.ll b/llvm/test/CodeGen/NVPTX/nanosleep.ll index de08c9fbdf41..48bf8bc464e8 100644 --- a/llvm/test/CodeGen/NVPTX/nanosleep.ll +++ b/llvm/test/CodeGen/NVPTX/nanosleep.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -O2 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} declare void @llvm.nvvm.nanosleep(i32) diff --git a/llvm/test/CodeGen/NVPTX/nofunc.ll b/llvm/test/CodeGen/NVPTX/nofunc.ll index a8ce20ed91dc..d07d22290c8c 100644 --- a/llvm/test/CodeGen/NVPTX/nofunc.ll +++ b/llvm/test/CodeGen/NVPTX/nofunc.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; Test that we don't crash if we're compiling a module with function references, diff --git a/llvm/test/CodeGen/NVPTX/noreturn.ll b/llvm/test/CodeGen/NVPTX/noreturn.ll index 6c11d0a9376a..0062e62756d3 100644 --- a/llvm/test/CodeGen/NVPTX/noreturn.ll +++ b/llvm/test/CodeGen/NVPTX/noreturn.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx64 -mcpu=sm_30 | FileCheck %s -; RUN: %if ptxas %{llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} @function_pointer = addrspace(1) global ptr null diff --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll index 9a78d31302e1..8527d3d014f5 100644 --- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_60 | %ptxas-verify %} +; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %} target triple = "nvptx-unknown-nvcl" diff --git a/llvm/test/CodeGen/NVPTX/packed-aggr.ll b/llvm/test/CodeGen/NVPTX/packed-aggr.ll index 602bef299bb2..353f1cba74eb 100644 --- a/llvm/test/CodeGen/NVPTX/packed-aggr.ll +++ b/llvm/test/CodeGen/NVPTX/packed-aggr.ll @@ -5,8 +5,8 @@ ; RUN: FileCheck %s --check-prefixes=CHECK,CHECK32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | \ ; RUN: FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: %if ptxas-11.1 && !ptxas-12.0%{ llc < %s -mtriple=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} -; RUN: %if ptxas-11.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.1 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} +; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %} ;; Test that packed structs with symbol references are represented using the ;; mask() operator. diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll index 8899709d1cf1..2ee749fb3b0c 100644 --- a/llvm/test/CodeGen/NVPTX/param-overalign.ll +++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/pr126337.ll b/llvm/test/CodeGen/NVPTX/pr126337.ll index f56b8eb98077..525da1fde9eb 100644 --- a/llvm/test/CodeGen/NVPTX/pr126337.ll +++ b/llvm/test/CodeGen/NVPTX/pr126337.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas -arch=sm_70 -c - %} +; RUN: %if ptxas-sm_70 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas-verify -arch=sm_70 %} ; This IR should compile without triggering assertions in LICM ; when the CopyToReg from %0 in the first BB gets eliminated diff --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll index cd2505c20d39..5120550161ea 100644 --- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_kernel void @t1(ptr %a) { diff --git a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll index 3efe9be898cc..bc67471209bf 100644 --- a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll +++ b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix=INFER
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s --check-prefix=PTX
-; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll index 862e26d70467..a1c5ec8f50a6 100644 --- a/llvm/test/CodeGen/NVPTX/prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/prefetch.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
-; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index f286928da448..f871e4039a55 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -2,13 +2,13 @@ ; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM80 %s -; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_80 %} ; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s -; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll index 7c9487b33854..38c9234c78fe 100644 --- a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll +++ b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} declare float @llvm.nvvm.redux.sync.fmin(float, i32) define float @redux_sync_fmin(float %src, i32 %mask) { diff --git a/llvm/test/CodeGen/NVPTX/redux-sync.ll b/llvm/test/CodeGen/NVPTX/redux-sync.ll index bd1c7f5c12e9..90b230850bd3 100644 --- a/llvm/test/CodeGen/NVPTX/redux-sync.ll +++ b/llvm/test/CodeGen/NVPTX/redux-sync.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare i32 @llvm.nvvm.redux.sync.umin(i32, i32) ; CHECK-LABEL: .func{{.*}}redux_sync_min_u32 diff --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll index ea45bfdc5e19..f9b4f6b10fca 100644 --- a/llvm/test/CodeGen/NVPTX/reg-types.ll +++ b/llvm/test/CodeGen/NVPTX/reg-types.ll @@ -3,7 +3,7 @@ ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT ; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=NO8BIT -; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: .visible .func func( diff --git a/llvm/test/CodeGen/NVPTX/setmaxnreg-sm100a.ll b/llvm/test/CodeGen/NVPTX/setmaxnreg-sm100a.ll index fecc286c7a2f..cb623142563a 100644 --- a/llvm/test/CodeGen/NVPTX/setmaxnreg-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/setmaxnreg-sm100a.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; CHECK-LABEL: test_set_maxn_reg_sm100a define void @test_set_maxn_reg_sm100a() { diff --git a/llvm/test/CodeGen/NVPTX/setmaxnreg.ll b/llvm/test/CodeGen/NVPTX/setmaxnreg.ll index 5b266e8a6584..cca603aa91d9 100644 --- a/llvm/test/CodeGen/NVPTX/setmaxnreg.ll +++ b/llvm/test/CodeGen/NVPTX/setmaxnreg.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx80| %ptxas-verify -arch=sm_90a %} +; RUN: %if ptxas-sm_90a && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx80| %ptxas-verify -arch=sm_90a %} declare void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 %reg_count) declare void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 %reg_count) diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index 97918a6f26cd..9c028c259a21 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} define <2 x i16> @sext_setcc_v2i1_to_v2i16(ptr %p) { ; CHECK-LABEL: sext_setcc_v2i1_to_v2i16( diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll index 9cf3a1dc107c..dfc6e9680b10 100644 --- a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32) declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync.ll b/llvm/test/CodeGen/NVPTX/shfl-sync.ll index 0c826d221d05..139c1e6ecbba 100644 --- a/llvm/test/CodeGen/NVPTX/shfl-sync.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-sync.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) declare float @llvm.nvvm.shfl.sync.down.f32(float, i32, i32, i32) diff --git a/llvm/test/CodeGen/NVPTX/short-ptr.ll b/llvm/test/CodeGen/NVPTX/short-ptr.ll index eb058955e0aa..7cf7ff74ba73 100644 --- a/llvm/test/CodeGen/NVPTX/short-ptr.ll +++ b/llvm/test/CodeGen/NVPTX/short-ptr.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix CHECK-DEFAULT-32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-short-ptr | FileCheck %s --check-prefixes CHECK-SHORT-SHARED,CHECK-SHORT-CONST,CHECK-SHORT-LOCAL -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-short-ptr | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll index 991ae04b91b6..ddc430ee6f8f 100644 --- a/llvm/test/CodeGen/NVPTX/simple-call.ll +++ b/llvm/test/CodeGen/NVPTX/simple-call.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; CHECK: .func ({{.*}}) device_func diff --git a/llvm/test/CodeGen/NVPTX/sm-version.ll b/llvm/test/CodeGen/NVPTX/sm-version.ll index 3a154a1b9ac9..c90c086e8b96 100644 --- a/llvm/test/CodeGen/NVPTX/sm-version.ll +++ b/llvm/test/CodeGen/NVPTX/sm-version.ll @@ -14,6 +14,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_75 | FileCheck %s --check-prefix=SM75 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_80 | FileCheck %s --check-prefix=SM80 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 | FileCheck %s --check-prefix=SM86 +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_88 | FileCheck %s --check-prefix=SM88 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 | FileCheck %s --check-prefix=SM90 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100 | FileCheck %s --check-prefix=SM100 @@ -25,6 +26,9 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103 | FileCheck %s --check-prefix=SM103 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_110 | FileCheck %s --check-prefix=SM110 +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_110a | FileCheck %s --check-prefix=SM110a +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_110f | FileCheck %s --check-prefix=SM110f ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120 | FileCheck %s --check-prefix=SM120 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f @@ -48,6 +52,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=SM75 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefix=SM80 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_86 | FileCheck %s --check-prefix=SM86 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_88 | FileCheck %s --check-prefix=SM88 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck %s --check-prefix=SM90 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefix=SM100 @@ -59,6 +64,9 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103 | FileCheck %s --check-prefix=SM103 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110 | FileCheck %s --check-prefix=SM110 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110a | FileCheck %s --check-prefix=SM110a +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f | FileCheck %s --check-prefix=SM110f ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120 | FileCheck %s --check-prefix=SM120 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f @@ -82,6 +90,7 @@ ; SM75: .version 6.3 ; SM80: .version 7.0 ; SM86: .version 7.1 +; SM88: .version 9.0 ; SM90: .version 7.8 ; SM90a: .version 8.0 ; SM100: .version 8.6 @@ -93,6 +102,9 @@ ; SM103: .version 8.8 ; SM103a: .version 8.8 ; SM103f: .version 8.8 +; SM110: .version 9.0 +; SM110a: .version 9.0 +; SM110f: .version 9.0 ; SM120: .version 8.7 ; SM120a: .version 8.7 ; SM120f: .version 8.8 @@ -116,6 +128,7 @@ ; SM75: .target sm_75 ; SM80: .target sm_80 ; SM86: .target sm_86 +; SM88: .target sm_88 ; SM90: .target sm_90 ; SM90a: .target sm_90a ; SM100: .target sm_100 @@ -127,6 +140,9 @@ ; SM103: .target sm_103 ; SM103a: .target sm_103a ; SM103f: .target sm_103f +; SM110: .target sm_110 +; SM110a: .target sm_110a +; SM110f: .target sm_110f ; SM120: .target sm_120 ; SM120a: .target sm_120a ; SM120f: .target sm_120f diff --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll index 1e0e75a041c1..a229389fd272 100644 --- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G32,LS32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=ALL,G64,LS64 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s --check-prefixes=G64,LS32 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} diff --git a/llvm/test/CodeGen/NVPTX/st-generic.ll b/llvm/test/CodeGen/NVPTX/st-generic.ll index 950da93f9521..a7aa092992b2 100644 --- a/llvm/test/CodeGen/NVPTX/st-generic.ll +++ b/llvm/test/CodeGen/NVPTX/st-generic.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ;; i8 diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll index f90435abefbb..a07e1d550785 100644 --- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll +++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -verify-machineinstrs | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/st_bulk.ll b/llvm/test/CodeGen/NVPTX/st_bulk.ll index 944f221fb1af..5c4b5ba62849 100644 --- a/llvm/test/CodeGen/NVPTX/st_bulk.ll +++ b/llvm/test/CodeGen/NVPTX/st_bulk.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} -; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-sm_100 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %} declare void @llvm.nvvm.st.bulk(ptr, i64, i64) define void @st_bulk(ptr %dest_addr, i64 %size) { diff --git a/llvm/test/CodeGen/NVPTX/stacksaverestore.ll b/llvm/test/CodeGen/NVPTX/stacksaverestore.ll index 802ae26da41a..a32f88cd016f 100644 --- a/llvm/test/CodeGen/NVPTX/stacksaverestore.ll +++ b/llvm/test/CodeGen/NVPTX/stacksaverestore.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_60 -mattr=+ptx73 | FileCheck %s --check-prefix=CHECK-32 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 -mattr=+ptx73 | FileCheck %s --check-prefix=CHECK-64 ; RUN: llc < %s -mtriple=nvptx64 -nvptx-short-ptr -mcpu=sm_60 -mattr=+ptx73 | FileCheck %s --check-prefix=CHECK-MIXED -; RUN: %if ptxas && ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 -mattr=+ptx73 | %ptxas-verify %} +; RUN: %if ptxas-sm_60 && ptxas-isa-7.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 -mattr=+ptx73 | %ptxas-verify -arch=sm_60 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py index 15b220ca2175..799ef8c56417 100644 --- a/llvm/test/CodeGen/NVPTX/surf-tex.py +++ b/llvm/test/CodeGen/NVPTX/surf-tex.py @@ -1,6 +1,6 @@ # RUN: %python %s --target=cuda --tests=suld,sust,tex,tld4 --gen-list=%t.list > %t-cuda.ll # RUN: llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | FileCheck %t-cuda.ll -# RUN: %if ptxas %{ llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify %} +# RUN: %if ptxas-sm_60 && ptxas-isa-4.3 %{ llc -mcpu=sm_60 -mattr=+ptx43 %t-cuda.ll -verify-machineinstrs -o - | %ptxas-verify -arch=sm_60 %} # We only need to run this second time for texture tests, because # there is a difference between unified and non-unified intrinsics. diff --git a/llvm/test/CodeGen/NVPTX/symbol-naming.ll b/llvm/test/CodeGen/NVPTX/symbol-naming.ll index 941378f120c3..8053b22284fd 100644 --- a/llvm/test/CodeGen/NVPTX/symbol-naming.ll +++ b/llvm/test/CodeGen/NVPTX/symbol-naming.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} ; Verify that the NVPTX target removes invalid symbol names prior to emitting ; PTX. diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll index 5a4fe4ed7fc0..a245279ab5ce 100644 --- a/llvm/test/CodeGen/NVPTX/szext.ll +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} +; RUN: %if ptxas-sm_70 && ptxas-isa-7.6 %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.ll index 6f4eb222e0b3..94ed44c7361c 100644 --- a/llvm/test/CodeGen/NVPTX/tanhf.ll +++ b/llvm/test/CodeGen/NVPTX/tanhf.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll index 9c60af914faf..41a0e81b5a6e 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll @@ -1,8 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} + declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols) declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll index cc3b359d0624..7981feb934c8 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr) declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll index 780116c42380..c540f78c294f 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} ; CHECK-LABEL: test_tcgen05_cp_64x128_v1 define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) { diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll index 07c62671d2fb..cbf647f85717 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} declare void @llvm.nvvm.tcgen05.fence.before.thread.sync() declare void @llvm.nvvm.tcgen05.fence.after.thread.sync() diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll index 7e65338c4525..a37b1a95aa80 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} +; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_103a | %ptxas-verify -arch=sm_103a %} ; CHECK-LABEL: nvvm_tcgen05_ld_16x64b define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) { diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll index 590d75533bb8..8ca6a2a07143 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll index c323a54d75d7..0636a06bc9ea 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} -; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} ; CHECK-LABEL: nvvm_tcgen05_st_16x64b define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32> %stv2, <4 x i32> %stv4, <8 x i32> %stv8, <16 x i32> %stv16, <32 x i32> %stv32, <64 x i32> %stv64, <128 x i32> %stv128) { diff --git a/llvm/test/CodeGen/NVPTX/trunc-setcc.ll b/llvm/test/CodeGen/NVPTX/trunc-setcc.ll index f22e37e20396..f6a1c6bb60d6 100644 --- a/llvm/test/CodeGen/NVPTX/trunc-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/trunc-setcc.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_50 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-sm_50 %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll index 12502b6f2989..99a1e8a0630a 100644 --- a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll +++ b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=sm_50 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-sm_50 %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/unreachable.ll b/llvm/test/CodeGen/NVPTX/unreachable.ll index 618c7ed0c499..0b65ef8f275d 100644 --- a/llvm/test/CodeGen/NVPTX/unreachable.ll +++ b/llvm/test/CodeGen/NVPTX/unreachable.ll @@ -13,7 +13,7 @@ ; RUN: | FileCheck %s --check-prefixes=CHECK,TRAP ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -mattr=+ptx83 \ ; RUN: | FileCheck %s --check-prefixes=BUG-FIXED -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} target triple = "nvptx-unknown-cuda" diff --git a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll index 84c7a124a6f3..80fd47f85795 100644 --- a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll +++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll @@ -96,7 +96,15 @@ define void @test_cluster_dim() { ret void } -!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12} +define void @test_grid_constant(ptr byval(i32) %input1, i32 %input2, ptr byval(i32) %input3) { +; CHECK-LABEL: define void @test_grid_constant( +; CHECK-SAME: ptr byval(i32) "nvvm.grid_constant" [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr byval(i32) "nvvm.grid_constant" [[INPUT3:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} + +!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13} !0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010} !1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008} @@ -111,7 +119,8 @@ define void @test_cluster_dim() { !10 = !{ptr @test_maxntid_4, !"maxntidz", i32 100} !11 = !{ptr @test_reqntid, !"reqntidx", i32 31, !"reqntidy", i32 32, !"reqntidz", i32 33} !12 = !{ptr @test_cluster_dim, !"cluster_dim_x", i32 101, !"cluster_dim_y", i32 102, !"cluster_dim_z", i32 103} - +!13 = !{ptr @test_grid_constant, !"grid_constant", !14} +!14 = !{i32 1, i32 3} ;. ; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="2" } ; CHECK: attributes #[[ATTR1]] = { "nvvm.maxclusterrank"="3" } diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll index 9e312a2fec60..a6b1bdda22e3 100644 --- a/llvm/test/CodeGen/NVPTX/vaargs.ll +++ b/llvm/test/CodeGen/NVPTX/vaargs.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,CHECK32 ; RUN: llc < %s -O0 -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 && ptxas-ptr32 %{ llc < %s -O0 -march=nvptx -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -O0 -march=nvptx64 -mattr=+ptx60 -mcpu=sm_30 | %ptxas-verify %} ; CHECK: .address_size [[BITS:32|64]] diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 890753b6ac5a..61ff80632c78 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 < %s | FileCheck %s --check-prefix=CHECK-PTX -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.4 %{ llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %} %struct.S1 = type { i32, i8, i64 } %struct.S2 = type { i64, i64 } diff --git a/llvm/test/CodeGen/NVPTX/vector-compare.ll b/llvm/test/CodeGen/NVPTX/vector-compare.ll index 0e63ee96932d..d5569b55c337 100644 --- a/llvm/test/CodeGen/NVPTX/vector-compare.ll +++ b/llvm/test/CodeGen/NVPTX/vector-compare.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify -m32 %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify -m32 %} ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; This test makes sure that the result of vector compares are properly diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll index ccac7ff8e647..1ae6f6bcd748 100644 --- a/llvm/test/CodeGen/NVPTX/vector-loads.ll +++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll @@ -154,7 +154,7 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177 ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_complex_param_0]; @@ -166,11 +166,12 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177 ; CHECK-NEXT: shl.b32 %r6, %r1, 1; ; CHECK-NEXT: or.b32 %r7, %r5, %r6; ; CHECK-NEXT: cvt.u64.u32 %rd2, %r7; -; CHECK-NEXT: mad.wide.u32 %rd3, %r3, 131072, %rd1; -; CHECK-NEXT: add.s64 %rd4, %rd3, %rd2; -; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd4+128]; +; CHECK-NEXT: mul.wide.u32 %rd3, %r3, 131072; +; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3; +; CHECK-NEXT: add.s64 %rd5, %rd4, %rd2; +; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd5+128]; ; CHECK-NEXT: max.u16 %rs3, %rs1, %rs2; -; CHECK-NEXT: st.b8 [%rd4+129], %rs3; +; CHECK-NEXT: st.b8 [%rd5+129], %rs3; ; CHECK-NEXT: ret; %t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1 %t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() diff --git a/llvm/test/CodeGen/NVPTX/vector-select.ll b/llvm/test/CodeGen/NVPTX/vector-select.ll index 569da5e6628b..96b2a0cd35d4 100644 --- a/llvm/test/CodeGen/NVPTX/vector-select.ll +++ b/llvm/test/CodeGen/NVPTX/vector-select.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} +; RUN: %if ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; This test makes sure that vector selects are scalarized by the type legalizer. diff --git a/llvm/test/CodeGen/NVPTX/vote.ll b/llvm/test/CodeGen/NVPTX/vote.ll index 6e760cee2a11..d8aa0b1bdf12 100644 --- a/llvm/test/CodeGen/NVPTX/vote.ll +++ b/llvm/test/CodeGen/NVPTX/vote.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} +; RUN: %if ptxas-isa-6.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %} declare i1 @llvm.nvvm.vote.all(i1) ; CHECK-LABEL: .func{{.*}}vote_all diff --git a/llvm/test/CodeGen/NVPTX/weak-global.ll b/llvm/test/CodeGen/NVPTX/weak-global.ll index 43fc9b0ebfe8..06c2cd86ee8d 100644 --- a/llvm/test/CodeGen/NVPTX/weak-global.ll +++ b/llvm/test/CodeGen/NVPTX/weak-global.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | FileCheck %s --check-prefix PTX43 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | FileCheck %s --check-prefix PTX50 -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %} +; RUN: %if ptxas-isa-4.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %} +; RUN: %if ptxas-isa-5.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %} ; PTX43: .weak .global .align 4 .u32 g ; PTX50: .common .global .align 4 .u32 g diff --git a/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll b/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll index 59fe57b9b2c8..531a2042cd2f 100644 --- a/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll +++ b/llvm/test/CodeGen/NVPTX/wgmma-sm90a-fence.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | FileCheck %s -; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | %ptxas-verify -arch=sm_90a %} +; RUN: %if ptxas-sm_90a && ptxas-isa-8.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | %ptxas-verify -arch=sm_90a %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx60-sm70.py b/llvm/test/CodeGen/NVPTX/wmma-ptx60-sm70.py index bc441bfa8180..ca6f78844523 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx60-sm70.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx60-sm70.py @@ -6,7 +6,7 @@ # RUN: --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx60-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \ # RUN: | FileCheck %t-ptx60-sm_70.ll -# RUN: %if ptxas %{ \ +# RUN: %if ptxas-sm_70 && ptxas-isa-6.0 %{ \ # RUN: llc < %t-ptx60-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \ # RUN: | %ptxas-verify -arch=sm_70 \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx61-sm70.py b/llvm/test/CodeGen/NVPTX/wmma-ptx61-sm70.py index 7cfee46ea4c3..25b24217aa51 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx61-sm70.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx61-sm70.py @@ -6,7 +6,7 @@ # RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx61-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \ # RUN: | FileCheck %t-ptx61-sm_70.ll -# RUN: %if ptxas-9.1 %{ \ +# RUN: %if ptxas-sm_70 && ptxas-isa-6.1 %{ \ # RUN: llc < %t-ptx61-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \ # RUN: | %ptxas-verify -arch=sm_70 \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm72.py b/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm72.py index 6168df26b906..4c0fd48efad3 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm72.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm72.py @@ -6,7 +6,7 @@ # RUN: --check-prefixes=INTRINSICS,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx63-sm_72.ll -mtriple=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \ # RUN: | FileCheck %t-ptx63-sm_72.ll -# RUN: %if ptxas-10.0 %{ \ +# RUN: %if ptxas-sm_72 && ptxas-isa-6.3 %{ \ # RUN: llc < %t-ptx63-sm_72.ll -mtriple=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \ # RUN: | %ptxas-verify -arch=sm_72 \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm75.py b/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm75.py index 507760e7b61f..944d284b96b5 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm75.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx63-sm75.py @@ -6,7 +6,7 @@ # RUN: --check-prefixes=INTRINSICS,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx63-sm_75.ll -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \ # RUN: | FileCheck %t-ptx63-sm_75.ll -# RUN: %if ptxas-10.0 %{ \ +# RUN: %if ptxas-sm_75 && ptxas-isa-6.3 %{ \ # RUN: llc < %t-ptx63-sm_75.ll -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \ # RUN: | %ptxas-verify -arch=sm_75 \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx64-sm70.py b/llvm/test/CodeGen/NVPTX/wmma-ptx64-sm70.py index 0f0d1c90fe00..a79604548351 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx64-sm70.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx64-sm70.py @@ -6,7 +6,7 @@ # RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NODOUBLE,NOALTFLOAT,NOLDMATRIX # RUN: llc < %t-ptx64-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \ # RUN: | FileCheck %t-ptx64-sm_70.ll -# RUN: %if ptxas-10.1 %{ \ +# RUN: %if ptxas-sm_70 && ptxas-isa-6.4 %{ \ # RUN: llc < %t-ptx64-sm_70.ll -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \ # RUN: | %ptxas-verify -arch=sm_70 \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx65-sm75.py b/llvm/test/CodeGen/NVPTX/wmma-ptx65-sm75.py index 2b919dbdcf3d..ea9d0babac13 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx65-sm75.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx65-sm75.py @@ -6,7 +6,7 @@ # RUN: --check-prefixes=INTRINSICS # RUN: llc < %t-ptx65-sm_75.ll -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \ # RUN: | FileCheck %t-ptx65-sm_75.ll -# RUN: %if ptxas-10.2 %{ \ +# RUN: %if ptxas-sm_75 && ptxas-isa-6.5 %{ \ # RUN: llc < %t-ptx65-sm_75.ll -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \ # RUN: | %ptxas-verify -arch=sm_75 \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx71-sm80.py b/llvm/test/CodeGen/NVPTX/wmma-ptx71-sm80.py index 2985c1b96ab6..03d46b8f0b30 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx71-sm80.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx71-sm80.py @@ -6,7 +6,7 @@ # RUN: --check-prefixes=INTRINSICS # RUN: llc < %t-ptx71-sm_80.ll -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \ # RUN: | FileCheck %t-ptx71-sm_80.ll -# RUN: %if ptxas-11.1 %{ \ +# RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ \ # RUN: llc < %t-ptx71-sm_80.ll -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \ # RUN: | %ptxas-verify -arch=sm_80 \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py index 8f502065345c..8a5ae22abdb3 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py @@ -4,7 +4,7 @@ # RUN: --check-prefixes=PTX78STMATRIX-DAG # RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \ # RUN: | FileCheck %t-ptx78-sm_90.ll -# RUN: %if ptxas-12.7 %{ \ +# RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ \ # RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \ # RUN: | %ptxas-verify -arch=sm_90 \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py index 5c14a54601ed..12b1980de5e4 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py @@ -4,7 +4,7 @@ # RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_100a.ll -# RUN: %if ptxas-12.7 %{ \ +# RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ \ # RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \ # RUN: | %ptxas-verify -arch=sm_100a \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py index a77f9adddff9..f0e972308118 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py @@ -4,7 +4,7 @@ # RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_101a.ll -# RUN: %if ptxas-12.7 %{ \ +# RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ \ # RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \ # RUN: | %ptxas-verify -arch=sm_101a \ # RUN: %} diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py index 8126e64d6cc8..570372c42e8e 100644 --- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py +++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py @@ -4,7 +4,7 @@ # RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG # RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \ # RUN: | FileCheck %t-ptx86-sm_120a.ll -# RUN: %if ptxas-12.7 %{ \ +# RUN: %if ptxas-sm_120a && ptxas-isa-8.6 %{ \ # RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \ # RUN: | %ptxas-verify -arch=sm_120a \ # RUN: %} |
