diff options
Diffstat (limited to 'clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp')
| -rw-r--r-- | clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 129 |
1 files changed, 129 insertions, 0 deletions
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index dad1f95ac710..07cf08c54985 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -295,11 +295,69 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); } +static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { + switch (BuiltinID) { + default: + llvm_unreachable("Unknown BuiltinID for wave reduction"); + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: + return Intrinsic::amdgcn_wave_reduce_add; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: + return Intrinsic::amdgcn_wave_reduce_sub; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: + return Intrinsic::amdgcn_wave_reduce_min; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: + return Intrinsic::amdgcn_wave_reduce_umin; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: + return Intrinsic::amdgcn_wave_reduce_max; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: + return Intrinsic::amdgcn_wave_reduce_umax; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64: + return Intrinsic::amdgcn_wave_reduce_and; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64: + return Intrinsic::amdgcn_wave_reduce_or; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: + return Intrinsic::amdgcn_wave_reduce_xor; + } +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; llvm::SyncScope::ID SSID; switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: { + Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID); + llvm::Value *Value = EmitScalarExpr(E->getArg(0)); + llvm::Value *Strategy = EmitScalarExpr(E->getArg(1)); + llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()}); + return Builder.CreateCall(F, {Value, Strategy}); + } case AMDGPU::BI__builtin_amdgcn_div_scale: case AMDGPU::BI__builtin_amdgcn_div_scalef: { // Translate from the intrinsics's struct return to the builtin's out @@ -504,6 +562,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); return Builder.CreateCall(F, { Src }); } + case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32: + case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: { + llvm::Value *Src = EmitScalarExpr(E->getArg(0)); + Function *F = + CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()}); + return Builder.CreateCall(F, {Src}); + } case AMDGPU::BI__builtin_amdgcn_tanhf: case AMDGPU::BI__builtin_amdgcn_tanhh: case AMDGPU::BI__builtin_amdgcn_tanh_bf16: @@ -668,11 +733,75 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); return Builder.CreateCall(F, {Addr, Val}); } + case AMDGPU::BI__builtin_amdgcn_cluster_load_b32: + case AMDGPU::BI__builtin_amdgcn_cluster_load_b64: + case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_cluster_load_b32: + IID = Intrinsic::amdgcn_cluster_load_b32; + break; + case AMDGPU::BI__builtin_amdgcn_cluster_load_b64: + IID = Intrinsic::amdgcn_cluster_load_b64; + break; + case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: + IID = Intrinsic::amdgcn_cluster_load_b128; + break; + } + SmallVector<Value *, 3> Args; + for (int i = 0, e = E->getNumArgs(); i != e; ++i) + Args.push_back(EmitScalarExpr(E->getArg(i))); + llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())}); + return Builder.CreateCall(F, {Args}); + } case AMDGPU::BI__builtin_amdgcn_load_to_lds: { // Should this have asan instrumentation? return emitBuiltinWithOneOverloadedType<5>(*this, E, Intrinsic::amdgcn_load_to_lds); } + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B: + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B: + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B: + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B: + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B: + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B: + IID = Intrinsic::amdgcn_cooperative_atomic_load_32x4B; + break; + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B: + IID = Intrinsic::amdgcn_cooperative_atomic_store_32x4B; + break; + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B: + IID = Intrinsic::amdgcn_cooperative_atomic_load_16x8B; + break; + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B: + IID = Intrinsic::amdgcn_cooperative_atomic_store_16x8B; + break; + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B: + IID = Intrinsic::amdgcn_cooperative_atomic_load_8x16B; + break; + case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: + IID = Intrinsic::amdgcn_cooperative_atomic_store_8x16B; + break; + } + + LLVMContext &Ctx = CGM.getLLVMContext(); + SmallVector<Value *, 5> Args; + // last argument is a MD string + const unsigned ScopeArg = E->getNumArgs() - 1; + for (unsigned i = 0; i != ScopeArg; ++i) + Args.push_back(EmitScalarExpr(E->getArg(i))); + StringRef Arg = cast<StringLiteral>(E->getArg(ScopeArg)->IgnoreParenCasts()) + ->getString(); + llvm::MDNode *MD = llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, Arg)}); + Args.push_back(llvm::MetadataAsValue::get(Ctx, MD)); + // Intrinsic is typed based on the pointer AS. Pointer is always the first + // argument. + llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()}); + return Builder.CreateCall(F, {Args}); + } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); |
