[AMDGPU] Simplify dpp builtin handlingusers/rampitec/11-04-_amdgpu_simplify_dpp_builtin_handling

DPP intrinsics can handle any type now, so no need to cast to integer. The caveat is that intrinsics only handle backend legal types, but it does not work with i8 for example.
author: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2024-11-04 12:28:07 -0800
committer: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2025-03-03 10:41:39 -0800
commit: 1fe2797a92cd52a34f76fe320ef796553a291cf8 (patch)
tree: 6204efe393c52f7d52d90cd7ef07efa46ff6ac5f
parent: 6cc8a46f262fe26d70d59b63fb7cdd2e1dfa6b3f (diff)
3 files changed, 38 insertions, 75 deletions
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 03b8d16b76e0..bff48f2e1652 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20003,37 +20003,24 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
     assert(Error == ASTContext::GE_None && "Should not codegen an error");
     llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
-    unsigned Size = DataTy->getPrimitiveSizeInBits();
-    llvm::Type *IntTy =
-        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
     Function *F =
         CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
                              ? Intrinsic::amdgcn_mov_dpp8
                              : Intrinsic::amdgcn_update_dpp,
-                         IntTy);
+                         DataTy);
     assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
            E->getNumArgs() == 2);
     bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
     if (InsertOld)
-      Args.push_back(llvm::PoisonValue::get(IntTy));
-    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
+      Args.push_back(llvm::PoisonValue::get(DataTy));
+    Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, 0, E));
+    for (unsigned I = 1; I != E->getNumArgs(); ++I) {
       llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
-      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
-          Size < 32) {
-        if (!DataTy->isIntegerTy())
-          V = Builder.CreateBitCast(
-              V, llvm::IntegerType::get(Builder.getContext(), Size));
-        V = Builder.CreateZExtOrBitCast(V, IntTy);
-      }
       llvm::Type *ExpTy =
           F->getFunctionType()->getFunctionParamType(I + InsertOld);
       Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
     }
-    Value *V = Builder.CreateCall(F, Args);
-    if (Size < 32 && !DataTy->isIntegerTy())
-      V = Builder.CreateTrunc(
-          V, llvm::IntegerType::get(Builder.getContext(), Size));
-    return Builder.CreateTruncOrBitCast(V, DataTy);
+    return Builder.CreateCall(F, Args);
   }
   case AMDGPU::BI__builtin_amdgcn_permlane16:
   case AMDGPU::BI__builtin_amdgcn_permlanex16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index a4054cba236d..7e4ee6f4a942 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -36,45 +36,37 @@ void test_mov_dpp8_long(global long* out, long a) {
 }
 
 // CHECK-LABEL: @test_mov_dpp8_float(
-// CHECK:      %0 = bitcast float %a to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %0 = tail call{{.*}} float @llvm.amdgcn.mov.dpp8.f32(float %a, i32 1)
+// CHECK-NEXT: store float %0,
 void test_mov_dpp8_float(global float* out, float a) {
   *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %0 = tail call{{.*}} double @llvm.amdgcn.mov.dpp8.f64(double %x, i32 1)
+// CHECK-NEXT: store double %0,
 void test_mov_dpp8_double(double x, global double *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %0 = tail call{{.*}} i16 @llvm.amdgcn.mov.dpp8.i16(i16 %x, i32 1)
+// CHECK-NEXT: store i16 %0,
 void test_mov_dpp8_short(short x, global short *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %0 = tail call{{.*}} i8 @llvm.amdgcn.mov.dpp8.i8(i8 %x, i32 1)
+// CHECK-NEXT: store i8 %0,
 void test_mov_dpp8_char(char x, global char *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.mov.dpp8.f16(half %0, i32 1)
+// CHECK-NEXT: store half %1,
 void test_mov_dpp8_half(half *x, global half *p) {
   *p = __builtin_amdgcn_mov_dpp8(*x, 1);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 269f20e2f53f..0c5995be5e09 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -117,45 +117,37 @@ void test_mov_dpp_long(long x, global long *p) {
 }
 
 // CHECK-LABEL: @test_mov_dpp_float
-// CHECK:      %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float poison, float %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store float %0,
 void test_mov_dpp_float(float x, global float *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double poison, double %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store double %0,
 void test_mov_dpp_double(double x, global double *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 poison, i16 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i16 %0,
 void test_mov_dpp_short(short x, global short *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 poison, i8 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i8 %0,
 void test_mov_dpp_char(char x, global char *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half poison, half %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store half %1,
 void test_mov_dpp_half(half *x, global half *p) {
   *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
 }
@@ -175,45 +167,37 @@ void test_update_dpp_long(long x, global long *p) {
 }
 
 // CHECK-LABEL: @test_update_dpp_float
-// CHECK:      %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float %x, float %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store float %0,
 void test_update_dpp_float(float x, global float *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double %x, double %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store double %0,
 void test_update_dpp_double(double x, global double *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 %x, i16 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i16 %0,
 void test_update_dpp_short(short x, global short *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 %x, i8 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i8 %0,
 void test_update_dpp_char(char x, global char *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half %0, half %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store half %1,
 void test_update_dpp_half(half *x, global half *p) {
   *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
 }
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2024-11-04 12:28:07 -0800
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2025-03-03 10:41:39 -0800
commit	1fe2797a92cd52a34f76fe320ef796553a291cf8 (patch)
tree	6204efe393c52f7d52d90cd7ef07efa46ff6ac5f
parent	6cc8a46f262fe26d70d59b63fb7cdd2e1dfa6b3f (diff)