diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 042a86368559..f5901e6f8f3b 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -37,6 +37,10 @@ BUILTIN(__builtin_amdgcn_workgroup_size_x, "Us", "nc") BUILTIN(__builtin_amdgcn_workgroup_size_y, "Us", "nc") BUILTIN(__builtin_amdgcn_workgroup_size_z, "Us", "nc") +BUILTIN(__builtin_amdgcn_grid_size_x, "Ui", "nc") +BUILTIN(__builtin_amdgcn_grid_size_y, "Ui", "nc") +BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") + BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 6f7505b7b5c2..f933113fa883 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14750,6 +14750,22 @@ Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) { llvm::MDNode::get(CGF.getLLVMContext(), None)); return LD; } + +// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively. +Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) { + const unsigned XOffset = 12; + auto *DP = EmitAMDGPUDispatchPtr(CGF); + // Indexing the HSA kernel_dispatch_packet struct. + auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4); + auto *GEP = CGF.Builder.CreateGEP(DP, Offset); + auto *DstTy = + CGF.Int32Ty->getPointerTo(GEP->getType()->getPointerAddressSpace()); + auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy); + auto *LD = CGF.Builder.CreateLoad(Address(Cast, CharUnits::fromQuantity(4))); + LD->setMetadata(llvm::LLVMContext::MD_invariant_load, + llvm::MDNode::get(CGF.getLLVMContext(), None)); + return LD; +} } // namespace // For processing memory ordering and memory scope arguments of various @@ -15010,6 +15026,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_workgroup_size_z: return EmitAMDGPUWorkGroupSize(*this, 2); + // amdgcn grid size + case AMDGPU::BI__builtin_amdgcn_grid_size_x: + return EmitAMDGPUGridSize(*this, 0); + case AMDGPU::BI__builtin_amdgcn_grid_size_y: + return EmitAMDGPUGridSize(*this, 1); + case AMDGPU::BI__builtin_amdgcn_grid_size_z: + return EmitAMDGPUGridSize(*this, 2); + // r600 intrinsics case AMDGPU::BI__builtin_r600_recipsqrt_ieee: case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 56c83df6b6b4..20edaf2aae3f 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -559,6 +559,24 @@ void test_get_workgroup_size(int d, global int *out) } } +// CHECK-LABEL: @test_get_grid_size( +// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 12 +// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 16 +// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 20 +// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load +void test_get_grid_size(int d, global int *out) +{ + switch (d) { + case 0: *out = __builtin_amdgcn_grid_size_x(); break; + case 1: *out = __builtin_amdgcn_grid_size_y(); break; + case 2: *out = __builtin_amdgcn_grid_size_z(); break; + default: *out = 0; + } +} + // CHECK-LABEL: @test_fmed3_f32 // CHECK: call float @llvm.amdgcn.fmed3.f32( void test_fmed3_f32(global float* out, float a, float b, float c) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index 8c53d99b9fb6..9fbdc67b56ab 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -119,12 +119,6 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) { } namespace { -DEVICE uint32_t grid_size_x() { - size_t grid_size_x_offset = 96; // In bits, from AQL kernel dispatch format - return *(uint32_t *)((char *)__builtin_amdgcn_dispatch_ptr() + - grid_size_x_offset / 8); -} - DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) { uint32_t q = n / d; return q + (n > q * d); @@ -137,11 +131,11 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size, } // namespace DEVICE int GetNumberOfBlocksInKernel() { - return get_grid_dim(grid_size_x(), __builtin_amdgcn_workgroup_size_x()); + return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); } DEVICE int GetNumberOfThreadsInBlock() { - return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), grid_size_x(), + return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); }