[libomptarget][amdgpu] Call into deviceRTL instead of ockl

[libomptarget][amdgpu] Call into deviceRTL instead of ockl Amdgpu codegen presently emits a call into ockl. The same functionality is already present in the deviceRTL. Adds an amdgpu specific entry point to avoid the dependency. This lets simple openmp code (specifically, that which doesn't use libm) run without rocm device libraries installed. Reviewed By: ronlieb Differential Revision: https://reviews.llvm.org/D93356
2021-01-04 16:48:46 +00:00 · 2021-01-04 16:48:46 +00:00 · 76bfbb74d3
parent c55b609b77
commit 76bfbb74d3
4 changed files with 14 additions and 11 deletions
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@ -49,13 +49,12 @@ llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Module *M = &CGF.CGM.getModule();
-  const char *LocSize = "__ockl_get_local_size";
+  const char *LocSize = "__kmpc_amdgcn_gpu_num_threads";
  llvm::Function *F = M->getFunction(LocSize);
  if (!F) {
    F = llvm::Function::Create(
-        llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
+        llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
        llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
  }
-  return Bld.CreateTrunc(
-      Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
+  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@ -13,9 +13,8 @@ int test_amdgcn_target_tid_threads() {

  int arr[N];

-// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
-// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
-// CHECK-NEXT: sub nuw i32 [[VAR]], 64
+// CHECK: [[NUM_THREADS:%.+]] = call i32 @__kmpc_amdgcn_gpu_num_threads()
+// CHECK: sub nuw i32 [[NUM_THREADS]], 64
 // CHECK: call i32 @llvm.amdgcn.workitem.id.x()
 #pragma omp target
  for (int i = 0; i < N; i++) {
@ -30,9 +29,8 @@ int test_amdgcn_target_tid_threads_simd() {

  int arr[N];

-// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
-// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
-// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0)
+// CHECK: [[NUM_THREADS:%.+]] = call i32 @__kmpc_amdgcn_gpu_num_threads()
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[NUM_THREADS]], i16 0)
 #pragma omp target simd
  for (int i = 0; i < N; i++) {
    arr[i] = 1;
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
@ -15,4 +15,6 @@
 typedef uint64_t __kmpc_impl_lanemask_t;
 typedef uint32_t omp_lock_t; /* arbitrary type of the right length */

+EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads();
+
 #endif
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@ -144,6 +144,10 @@ DEVICE unsigned GetLaneId() {
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }

+EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
+  return GetNumberOfThreadsInBlock();
+}
+
 // Stub implementations
-DEVICE void *__kmpc_impl_malloc(size_t ) { return nullptr }
+DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
 DEVICE void __kmpc_impl_free(void *) {}