[OpenMP][AMDGPU] Use DS_Max_Warp_Number instead of WARPSIZE

The size of worker_rootS should have been DS_Max_Warp_Number rather than WARPSIZE. This reduces memory usage by deviceRTL on AMDGPU from around 2.3GB to around 770MB.

Reviewed By: JonChesterfield, jdoerfert

Differential Revision: https://reviews.llvm.org/D87084
2020-09-03 07:57:46 -04:00 · 2020-09-03 07:57:46 -04:00 · 7634c64b61
parent 05147d3309
commit 7634c64b61
2 changed files with 2 additions and 2 deletions
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -252,7 +252,7 @@ private:
      workDescrForActiveParallel; // one, ONLY for the active par

  ALIGN(16)
-  __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
+  __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
  ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
 };

--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -26,7 +26,7 @@ INLINE static void data_sharing_init_stack_common() {
  omptarget_nvptx_TeamDescr *teamDescr =
      &omptarget_nvptx_threadPrivateContext->TeamContext();

-  for (int WID = 0; WID < WARPSIZE; WID++) {
+  for (int WID = 0; WID < DS_Max_Warp_Number; WID++) {
    __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
    DataSharingState.SlotPtr[WID] = RootS;
    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];