diff options
Diffstat (limited to 'clang/test/CodeGenOpenCL')
9 files changed, 934 insertions, 0 deletions
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features-illegal.cl b/clang/test/CodeGenOpenCL/amdgpu-features-illegal.cl index 4e2f7f86e840..04de5dca3f6c 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features-illegal.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features-illegal.cl @@ -1,8 +1,10 @@ // RUN: not %clang_cc1 -triple amdgcn -target-feature +wavefrontsize32 -target-feature +wavefrontsize64 -o /dev/null %s 2>&1 | FileCheck %s // RUN: not %clang_cc1 -triple amdgcn -target-cpu gfx1103 -target-feature +wavefrontsize32 -target-feature +wavefrontsize64 -o /dev/null %s 2>&1 | FileCheck %s // RUN: not %clang_cc1 -triple amdgcn -target-cpu gfx900 -target-feature +wavefrontsize32 -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=GFX9 +// RUN: not %clang_cc1 -triple amdgcn -target-cpu gfx1250 -target-feature +wavefrontsize64 -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=GFX1250 // CHECK: error: invalid feature combination: 'wavefrontsize32' and 'wavefrontsize64' are mutually exclusive // GFX9: error: option 'wavefrontsize32' cannot be specified on this target +// GFX1250: error: option 'wavefrontsize64' cannot be specified on this target kernel void test() {} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl index f300b05fe798..cdfe9fcd8909 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl @@ -1,6 +1,7 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl index ccc05f0aa5af..c645d52cc7e3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl @@ -5,6 +5,46 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_cluster_load_async_to_lds_b8(global char* gaddr, local char* laddr, int mask) +{ + __builtin_amdgcn_cluster_load_async_to_lds_b8(gaddr, laddr, 16, 0, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_cluster_load_async_to_lds_b32(global int* gaddr, local int* laddr, int mask) +{ + __builtin_amdgcn_cluster_load_async_to_lds_b32(gaddr, laddr, 16, 0, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b64( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_cluster_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr, int mask) +{ + __builtin_amdgcn_cluster_load_async_to_lds_b64(gaddr, laddr, 16, 0, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_async_to_lds_b128( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_cluster_load_async_to_lds_b128(global v4i* gaddr, local v4i* laddr, int mask) +{ + __builtin_amdgcn_cluster_load_async_to_lds_b128(gaddr, laddr, 16, 0, mask); +} + // CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b8( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cluster-load.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cluster-load.cl new file mode 100644 index 000000000000..4c6e8badf1bc --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cluster-load.cl @@ -0,0 +1,36 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cluster.load.b32.i32(ptr addrspace(1) [[INPTR:%.*]], i32 10, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret i32 [[TMP0]] +// +int test_amdgcn_cluster_load_b32(global int* inptr, int mask) +{ + return __builtin_amdgcn_cluster_load_b32(inptr, 10, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b64( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]], i32 22, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_cluster_load_b64(global v2i* inptr, int mask) +{ + return __builtin_amdgcn_cluster_load_b64(inptr, 22, mask); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b128( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32(ptr addrspace(1) [[INPTR:%.*]], i32 27, i32 [[MASK:%.*]]) +// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]] +// +v4i test_amdgcn_cluster_load_b128(global v4i* inptr, int mask) +{ + return __builtin_amdgcn_cluster_load_b128(inptr, 27, mask); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl new file mode 100644 index 000000000000..8768f2f36765 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl @@ -0,0 +1,104 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[VAL]], i32 0, metadata [[META4:![0-9]+]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_cooperative_atomic_store_32x4B(global int* gaddr, int val) +{ + __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent"); +} + +// CHECK-LABEL: define dso_local i32 @test_amdgcn_cooperative_atomic_load_32x4B( +// CHECK-SAME: ptr noundef readonly captures(none) [[ADDR:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr [[ADDR]], i32 0, metadata [[META5:![0-9]+]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int test_amdgcn_cooperative_atomic_load_32x4B(int* addr) +{ + return __builtin_amdgcn_cooperative_atomic_load_32x4B(addr, __ATOMIC_RELAXED, ""); +} + +// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_16x8B( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], <2 x i32> noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p1(ptr addrspace(1) [[GADDR]], <2 x i32> [[VAL]], i32 0, metadata [[META5]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_cooperative_atomic_store_16x8B(global v2i* gaddr, v2i val) +{ + __builtin_amdgcn_cooperative_atomic_store_16x8B(gaddr, val, __ATOMIC_RELAXED, ""); +} + +// CHECK-LABEL: define dso_local <2 x i32> @test_amdgcn_cooperative_atomic_load_16x8B( +// CHECK-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p1(ptr addrspace(1) [[GADDR]], i32 0, metadata [[META6:![0-9]+]]) +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_cooperative_atomic_load_16x8B(global v2i* gaddr) +{ + return __builtin_amdgcn_cooperative_atomic_load_16x8B(gaddr, __ATOMIC_RELAXED, "workgroup"); +} + +// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_8x16B( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], <4 x i32> noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p1(ptr addrspace(1) [[GADDR]], <4 x i32> [[VAL]], i32 0, metadata [[META7:![0-9]+]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_cooperative_atomic_store_8x16B(global v4i* gaddr, v4i val) +{ + __builtin_amdgcn_cooperative_atomic_store_8x16B(gaddr, val, __ATOMIC_RELAXED, "singlethread"); +} + +// CHECK-LABEL: define dso_local <4 x i32> @test_amdgcn_cooperative_atomic_load_8x16B( +// CHECK-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p1(ptr addrspace(1) [[GADDR]], i32 0, metadata [[META4]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4i test_amdgcn_cooperative_atomic_load_8x16B(global v4i* gaddr) +{ + return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_RELAXED, "agent"); +} + +// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B_truncated( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i64 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[VAL]] to i32 +// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[CONV]], i32 0, metadata [[META4]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_cooperative_atomic_store_32x4B_truncated(global int* gaddr, long val) +{ + __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent"); +} + +// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B_extended( +// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i8 noundef signext [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[VAL]] to i32 +// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[CONV]], i32 0, metadata [[META4]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_cooperative_atomic_store_32x4B_extended(global int* gaddr, char val) +{ + __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent"); +} + +//. +// CHECK: [[META4]] = !{!"agent"} +// CHECK: [[META5]] = !{!""} +// CHECK: [[META6]] = !{!"workgroup"} +// CHECK: [[META7]] = !{!"singlethread"} +//. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 4ff0571239e7..c35715965dae 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -58,6 +58,58 @@ void test_s_wait_tensorcnt() { __builtin_amdgcn_s_wait_tensorcnt(0); } +// CHECK-LABEL: @test_bitop3_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_bitop3_b32(global uint* out, uint a, uint b, uint c) { + *out = __builtin_amdgcn_bitop3_b32(a, b, c, 1); +} + +// CHECK-LABEL: @test_bitop3_b16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i16, align 2, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i16, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i16 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i16 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[B_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2 +// CHECK-NEXT: ret void +// +void test_bitop3_b16(global ushort* out, ushort a, ushort b, ushort c) { + *out = __builtin_amdgcn_bitop3_b16(a, b, c, 1); +} + // CHECK-LABEL: @test_prng_b32( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -1012,6 +1064,174 @@ void test_sat_pk4_i4_i8(ushort *out, uint src) *out = __builtin_amdgcn_sat_pk4_u4_u8(src); } +// CHECK-LABEL: @test_get_cluster_id( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4 +// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 0, label [[SW_BB:%.*]] +// CHECK-NEXT: i32 1, label [[SW_BB1:%.*]] +// CHECK-NEXT: i32 2, label [[SW_BB2:%.*]] +// CHECK-NEXT: ] +// CHECK: sw.bb: +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.id.x() +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG:%.*]] +// CHECK: sw.bb1: +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.id.y() +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.bb2: +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.id.z() +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.default: +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 0, ptr addrspace(1) [[TMP7]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.epilog: +// CHECK-NEXT: ret void +// +void test_get_cluster_id(int d, global int *out) +{ + switch (d) { + case 0: *out = __builtin_amdgcn_cluster_id_x(); break; + case 1: *out = __builtin_amdgcn_cluster_id_y(); break; + case 2: *out = __builtin_amdgcn_cluster_id_z(); break; + default: *out = 0; + } +} + +// CHECK-LABEL: @test_get_cluster_group_id( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4 +// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 0, label [[SW_BB:%.*]] +// CHECK-NEXT: i32 1, label [[SW_BB1:%.*]] +// CHECK-NEXT: i32 2, label [[SW_BB2:%.*]] +// CHECK-NEXT: ] +// CHECK: sw.bb: +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.x() +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG:%.*]] +// CHECK: sw.bb1: +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.y() +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.bb2: +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.z() +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.default: +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 0, ptr addrspace(1) [[TMP7]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.epilog: +// CHECK-NEXT: ret void +// +void test_get_cluster_group_id(int d, global int *out) +{ + switch (d) { + case 0: *out = __builtin_amdgcn_cluster_workgroup_id_x(); break; + case 1: *out = __builtin_amdgcn_cluster_workgroup_id_y(); break; + case 2: *out = __builtin_amdgcn_cluster_workgroup_id_z(); break; + default: *out = 0; + } +} + +// CHECK-LABEL: @test_cluster_workgroup_flat_id( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.flat.id() +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[TMP1]], align 4 +// CHECK-NEXT: ret void +// +void test_cluster_workgroup_flat_id(global uint *out) +{ + *out = __builtin_amdgcn_cluster_workgroup_flat_id(); +} + +// CHECK-LABEL: @test_get_cluster_workgroups_max_id( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4 +// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 0, label [[SW_BB:%.*]] +// CHECK-NEXT: i32 1, label [[SW_BB1:%.*]] +// CHECK-NEXT: i32 2, label [[SW_BB2:%.*]] +// CHECK-NEXT: ] +// CHECK: sw.bb: +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.x() +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG:%.*]] +// CHECK: sw.bb1: +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.y() +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.bb2: +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.z() +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.default: +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 0, ptr addrspace(1) [[TMP7]], align 4 +// CHECK-NEXT: br label [[SW_EPILOG]] +// CHECK: sw.epilog: +// CHECK-NEXT: ret void +// +void test_get_cluster_workgroups_max_id(int d, global int *out) +{ + switch (d) { + case 0: *out = __builtin_amdgcn_cluster_workgroup_max_id_x(); break; + case 1: *out = __builtin_amdgcn_cluster_workgroup_max_id_y(); break; + case 2: *out = __builtin_amdgcn_cluster_workgroup_max_id_z(); break; + default: *out = 0; + } +} + +// CHECK-LABEL: @test_get_cluster_workgroup_max_flat_id( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.flat.id() +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[TMP1]], align 4 +// CHECK-NEXT: ret void +// +void test_get_cluster_workgroup_max_flat_id(global int *out) +{ + *out = __builtin_amdgcn_cluster_workgroup_max_flat_id(); +} + // CHECK-LABEL: @test_permlane16_swap( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -1258,6 +1478,145 @@ void test_prefetch(generic void *fptr, global void *gptr) { __builtin_amdgcn_global_prefetch(gptr, 8); } +// CHECK-LABEL: @test_global_add_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]] +// CHECK-NEXT: ret float [[TMP2]] +// +float test_global_add_f32(global float *addr, float x) { + return __builtin_amdgcn_global_atomic_fadd_f32(addr, x); +} + +// CHECK-LABEL: @test_global_add_half2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: ret <2 x half> [[TMP2]] +// +half2 test_global_add_half2(global half2 *addr, half2 x) { + return __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: @test_flat_add_2f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: ret <2 x half> [[TMP2]] +// +half2 test_flat_add_2f16(generic half2 *addr, half2 x) { + return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: @test_flat_add_2bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> +// CHECK-NEXT: ret <2 x i16> [[TMP4]] +// +short2 test_flat_add_2bf16(generic short2 *addr, short2 x) { + return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: @test_global_add_2bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> +// CHECK-NEXT: ret <2 x i16> [[TMP4]] +// +short2 test_global_add_2bf16(global short2 *addr, short2 x) { + return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: @test_local_add_2f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(3) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[ADDR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> +// CHECK-NEXT: ret <2 x i16> [[TMP4]] +// +short2 test_local_add_2f16(local short2 *addr, short2 x) { + return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: @test_local_add_2bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(3) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[ADDR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4 +// CHECK-NEXT: ret <2 x half> [[TMP2]] +// +half2 test_local_add_2bf16(local half2 *addr, half2 x) { + return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); +} + // CHECK-LABEL: @test_cvt_pk_fp8_f32_e5m3( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl index 5e587cb87e07..d39041852369 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl @@ -24,6 +24,13 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b) *out = __builtin_amdgcn_ballot_w32(a == b); } +// CHECK-LABEL: @test_inverse_ballot_wave32( +// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %{{.+}}) +void test_inverse_ballot_wave32(global bool* out, int a) +{ + *out = __builtin_amdgcn_inverse_ballot_w32(a); +} + // CHECK-LABEL: @test_read_exec( // CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true) void test_read_exec(global uint* out) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl index 1fc2ac0d3141..d851ec7e6734 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl @@ -23,6 +23,13 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b) *out = __builtin_amdgcn_ballot_w64(a == b); } +// CHECK-LABEL: @test_inverse_ballot_wave64( +// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %{{.+}}) +void test_inverse_ballot_wave64(global bool* out, ulong a) +{ + *out = __builtin_amdgcn_inverse_ballot_w64(a); +} + // CHECK-LABEL: @test_read_exec( // CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) void test_read_exec(global ulong* out) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index bf022bc6eb44..039d03237b53 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -398,6 +398,384 @@ void test_s_sendmsghalt_var(int in) __builtin_amdgcn_s_sendmsghalt(1, in); } +// CHECK-LABEL: @test_wave_reduce_add_u32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_u32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_add_u64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64( +void test_wave_reduce_add_u64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_add_u32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_u32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_add_u64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64( +void test_wave_reduce_add_u64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_add_u32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_u32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_add_u64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64( +void test_wave_reduce_add_u64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_u32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64( +void test_wave_reduce_sub_u64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_u32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64( +void test_wave_reduce_sub_u64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_u32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64( +void test_wave_reduce_sub_u64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_and_b32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_b32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_and_b64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64( +void test_wave_reduce_and_b64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_and_b32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_b32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_and_b64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64( +void test_wave_reduce_and_b64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_and_b32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_b32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_and_b64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64( +void test_wave_reduce_and_b64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_or_b32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_b32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_or_b64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64( +void test_wave_reduce_or_b64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_or_b32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_b32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_or_b64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64( +void test_wave_reduce_or_b64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_or_b32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_b32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_or_b64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64( +void test_wave_reduce_or_b64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_b32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64( +void test_wave_reduce_xor_b64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_b32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64( +void test_wave_reduce_xor_b64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_b32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64( +void test_wave_reduce_xor_b64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_i64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64( +void test_wave_reduce_min_i64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_i64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64( +void test_wave_reduce_min_i64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_i64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64( +void test_wave_reduce_min_i64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_u32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_min_u32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_u64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64( +void test_wave_reduce_min_u64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_u32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_min_u32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_u64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64( +void test_wave_reduce_min_u64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_u32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_min_u32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_u64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64( +void test_wave_reduce_min_u64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_i64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64( +void test_wave_reduce_max_i64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_i64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64( +void test_wave_reduce_max_i64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_i64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64( +void test_wave_reduce_max_i64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_u32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_max_u32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_u64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64( +void test_wave_reduce_max_u64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_u32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_max_u32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_u64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64( +void test_wave_reduce_max_u64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_u32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_max_u32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_u64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64( +void test_wave_reduce_max_u64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u64(in, 2); +} + // CHECK-LABEL: @test_s_barrier // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier( void test_s_barrier() |
