forked from OSchip/llvm-project
[AMDGPU] Add __builtin_amdgcn_grid_size
[AMDGPU] Add __builtin_amdgcn_grid_size Similar to D76772, loads the data from the dispatch pointer. Marked invariant. Patch also updates the openmp devicertl to use this builtin. Reviewed By: yaxunl Differential Revision: https://reviews.llvm.org/D90251
This commit is contained in:
parent
d5a75e7738
commit
dee7704829
|
@ -37,6 +37,10 @@ BUILTIN(__builtin_amdgcn_workgroup_size_x, "Us", "nc")
|
|||
BUILTIN(__builtin_amdgcn_workgroup_size_y, "Us", "nc")
|
||||
BUILTIN(__builtin_amdgcn_workgroup_size_z, "Us", "nc")
|
||||
|
||||
BUILTIN(__builtin_amdgcn_grid_size_x, "Ui", "nc")
|
||||
BUILTIN(__builtin_amdgcn_grid_size_y, "Ui", "nc")
|
||||
BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
|
||||
|
||||
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
|
||||
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
|
||||
|
||||
|
|
|
@ -14750,6 +14750,22 @@ Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
|
|||
llvm::MDNode::get(CGF.getLLVMContext(), None));
|
||||
return LD;
|
||||
}
|
||||
|
||||
// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
|
||||
Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
|
||||
const unsigned XOffset = 12;
|
||||
auto *DP = EmitAMDGPUDispatchPtr(CGF);
|
||||
// Indexing the HSA kernel_dispatch_packet struct.
|
||||
auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
|
||||
auto *GEP = CGF.Builder.CreateGEP(DP, Offset);
|
||||
auto *DstTy =
|
||||
CGF.Int32Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
|
||||
auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy);
|
||||
auto *LD = CGF.Builder.CreateLoad(Address(Cast, CharUnits::fromQuantity(4)));
|
||||
LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
|
||||
llvm::MDNode::get(CGF.getLLVMContext(), None));
|
||||
return LD;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// For processing memory ordering and memory scope arguments of various
|
||||
|
@ -15010,6 +15026,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
|||
case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
|
||||
return EmitAMDGPUWorkGroupSize(*this, 2);
|
||||
|
||||
// amdgcn grid size
|
||||
case AMDGPU::BI__builtin_amdgcn_grid_size_x:
|
||||
return EmitAMDGPUGridSize(*this, 0);
|
||||
case AMDGPU::BI__builtin_amdgcn_grid_size_y:
|
||||
return EmitAMDGPUGridSize(*this, 1);
|
||||
case AMDGPU::BI__builtin_amdgcn_grid_size_z:
|
||||
return EmitAMDGPUGridSize(*this, 2);
|
||||
|
||||
// r600 intrinsics
|
||||
case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
|
||||
case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
|
||||
|
|
|
@ -559,6 +559,24 @@ void test_get_workgroup_size(int d, global int *out)
|
|||
}
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_get_grid_size(
|
||||
// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
|
||||
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 12
|
||||
// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
|
||||
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 16
|
||||
// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
|
||||
// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 20
|
||||
// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
|
||||
void test_get_grid_size(int d, global int *out)
|
||||
{
|
||||
switch (d) {
|
||||
case 0: *out = __builtin_amdgcn_grid_size_x(); break;
|
||||
case 1: *out = __builtin_amdgcn_grid_size_y(); break;
|
||||
case 2: *out = __builtin_amdgcn_grid_size_z(); break;
|
||||
default: *out = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_fmed3_f32
|
||||
// CHECK: call float @llvm.amdgcn.fmed3.f32(
|
||||
void test_fmed3_f32(global float* out, float a, float b, float c)
|
||||
|
|
|
@ -119,12 +119,6 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
|
|||
}
|
||||
|
||||
namespace {
|
||||
DEVICE uint32_t grid_size_x() {
|
||||
size_t grid_size_x_offset = 96; // In bits, from AQL kernel dispatch format
|
||||
return *(uint32_t *)((char *)__builtin_amdgcn_dispatch_ptr() +
|
||||
grid_size_x_offset / 8);
|
||||
}
|
||||
|
||||
DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {
|
||||
uint32_t q = n / d;
|
||||
return q + (n > q * d);
|
||||
|
@ -137,11 +131,11 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
|
|||
} // namespace
|
||||
|
||||
DEVICE int GetNumberOfBlocksInKernel() {
|
||||
return get_grid_dim(grid_size_x(), __builtin_amdgcn_workgroup_size_x());
|
||||
return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());
|
||||
}
|
||||
|
||||
DEVICE int GetNumberOfThreadsInBlock() {
|
||||
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), grid_size_x(),
|
||||
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),
|
||||
__builtin_amdgcn_workgroup_size_x());
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue