From f3de222b0d3c169985ba9d61f479f3e8a3675100 Mon Sep 17 00:00:00 2001 From: Gheorghe-Teodor Bercea Date: Thu, 15 Mar 2018 15:56:04 +0000 Subject: [PATCH] [OpenMP][libomptarget] Enable multiple frames per global memory slot Summary: To save on calls to malloc, this patch enables the re-use of pre-allocated global memory slots. Reviewers: ABataev, grokos, carlo.bertolli, caomhin Reviewed By: grokos Subscribers: guansong, openmp-commits Differential Revision: https://reviews.llvm.org/D44470 llvm-svn: 327637 --- .../deviceRTLs/nvptx/src/data_sharing.cu | 169 ++++++++++++------ .../deviceRTLs/nvptx/src/interface.h | 1 + .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 4 + 3 files changed, 124 insertions(+), 50 deletions(-) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu index cd73a6b4108f..41976f6536bf 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -341,7 +341,17 @@ EXTERN void __kmpc_data_sharing_init_stack() { __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID); DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + DataSharingState.TailPtr[WID] = RootS; + + // Initialize the stack pointer to be equal to the end of + // the shared memory slot. This way we ensure that the global + // version of the stack will be used. + // TODO: remove this: + DataSharingState.StackPtr[WID] = RootS->DataEnd; + + // TODO: When the use of shared memory is enabled we will have to + // initialize this with the start of the Data region like so: + // DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; // We initialize the list of references to arguments here. omptarget_nvptx_globalArgs.Init(); @@ -355,12 +365,8 @@ EXTERN void __kmpc_data_sharing_init_stack() { // UseSharedMemory is set to true, the runtime will attempt to use shared memory // as long as the size requested fits the pre-allocated size. // -// TODO: allow more than one push per slot to save on calls to malloc. -// Currently there is only one slot for each push so the data size in the slot -// is the same size as the size being requested. -// // Called by: master, TODO: call by workers -EXTERN void* __kmpc_data_sharing_push_stack(size_t size, +EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, int16_t UseSharedMemory) { // TODO: Add shared memory support. For now, use global memory only for // storing the data sharing slots so ignore the pre-allocated @@ -374,39 +380,85 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t size, // global memory slot. __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + void *FrameP = 0; - // The slot for holding the data we are pushing. - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = size; + // Check if we have room for the data in the current slot. + const uintptr_t StartAddress = (uintptr_t)StackP; + const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)DataSize; - // Check if there is a next slot. - if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { - // Attempt to re-use an existing slot provided the data fits in the slot. - // The leftover data space will not be used. - ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - - (uintptr_t)(&ExistingSlot->Data[0]); - if (ExistingSlotSize >= NewSize) - NewSlot = ExistingSlot; - else - free(ExistingSlot); + // If we requested more data than there is room for in the rest + // of the slot then we need to either re-use the next slot, if one exists, + // or create a new slot. + if (EndAddress < RequestedEndAddress) { + size_t NewSize = DataSize; + + // The new or reused slot for holding the data being pushed. + __kmpc_data_sharing_slot *NewSlot = 0; + + // Check if there is a next slot. + if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { + // Attempt to reuse an existing slot provided the data fits in the slot. + // The leftover data space will not be used. + ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - + (uintptr_t)(&ExistingSlot->Data[0]); + + // Try to add the data in the next available slot. Search for a slot + // with enough space. + while (ExistingSlotSize < NewSize) { + SlotP->Next = ExistingSlot->Next; + SlotP->Next->Prev = ExistingSlot->Prev; + free(ExistingSlot); + ExistingSlot = SlotP->Next; + if (!ExistingSlot) + break; + ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - + (uintptr_t)(&ExistingSlot->Data[0]); + } + + // Check if a slot has been found. + if (ExistingSlotSize >= NewSize) { + NewSlot = ExistingSlot; + NewSlot->PrevSlotStackPtr = StackP; + } + } + + if (!NewSlot) { + // Allocate at least the default size. + // TODO: generalize this for workers which need a larger data slot + // i.e. using DS_Worker_Warp_Slot_Size. + if (DS_Slot_Size > DataSize) + NewSize = DS_Slot_Size; + NewSlot = (__kmpc_data_sharing_slot *)malloc( + sizeof(__kmpc_data_sharing_slot) + NewSize); + NewSlot->Next = 0; + NewSlot->Prev = SlotP; + NewSlot->PrevSlotStackPtr = StackP; + NewSlot->DataEnd = &NewSlot->Data[NewSize]; + + // Newly allocated slots are also tail slots. + TailSlotP = NewSlot; + + // Make previous slot point to the newly allocated slot. + SlotP->Next = NewSlot; + } + + // The current slot becomes the new slot. + SlotP = NewSlot; + // The stack pointer always points to the next free stack frame. + StackP = &NewSlot->Data[DataSize]; + // The frame pointer always points to the beginning of the frame. + FrameP = &NewSlot->Data[0]; + } else { + // Add the data chunk to the current slot. The frame pointer is set to + // point to the start of the new frame held in StackP. + FrameP = StackP; + // Reset stack pointer to the requested address. + StackP = (void *)RequestedEndAddress; } - if (!NewSlot) { - NewSlot = (__kmpc_data_sharing_slot *)malloc( - sizeof(__kmpc_data_sharing_slot) + NewSize); - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - - // This is the last slot, save it. - TailSlotP = NewSlot; - } - - NewSlot->DataEnd = &NewSlot->Data[NewSize]; - - SlotP->Next = NewSlot; - SlotP = NewSlot; - - return (void*)&SlotP->Data[0]; + return FrameP; } // TODO: add memory fence here when this function can be called by @@ -422,26 +474,43 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t size, // When the pop operation removes the last global memory slot, // reclaim all outstanding global memory slots since it is // likely we have reached the end of the kernel. -EXTERN void __kmpc_data_sharing_pop_stack(void *a) { +EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { if (IsMasterThread()) { unsigned WID = getWarpId(); - __kmpc_data_sharing_slot *S = DataSharingState.SlotPtr[WID]; + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; - if (S->Prev) - S = S->Prev; - - // If this will "pop" the last global memory node then it is likely - // that we are at the end of the data sharing region and we can - // de-allocate any existing global memory slots. - if (!S->Prev) { - __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID]; - - while(Tail && Tail->Prev) { - Tail = Tail->Prev; - free(Tail->Next); - Tail->Next=0; + // If we try to pop the last frame of the current slot we need to + // move to the previous slot if there is one. + const uintptr_t StartAddress = (uintptr_t)FrameStart; + if (StartAddress == (uintptr_t)&SlotP->Data[0]) { + if (SlotP->Prev) { + // The new stack pointer is the end of the data field of the + // previous slot. This will allow the stack pointer to be + // used in the computation of the remaining data space in + // the current slot. + StackP = SlotP->PrevSlotStackPtr; + // Reset SlotP to previous slot. + SlotP = SlotP->Prev; } + + // If this will "pop" the last global memory node then it is likely + // that we are at the end of the data sharing region and we can + // de-allocate any existing global memory slots. + if (!SlotP->Prev) { + __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID]; + + while(Tail && Tail->Prev) { + Tail = Tail->Prev; + free(Tail->Next); + Tail->Next=0; + } + } + } else { + // This is not the last frame popped from this slot. + // Reset StackP + StackP = FrameStart; } return; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h index 34e33d104b51..84f6ec60859b 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -497,6 +497,7 @@ EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); struct __kmpc_data_sharing_slot { __kmpc_data_sharing_slot *Next; __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; void *DataEnd; char Data[]; }; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 4276f02215d7..8f4f1cdd0366 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -129,6 +129,7 @@ struct DataSharingStateTy { struct __kmpc_data_sharing_worker_slot_static { __kmpc_data_sharing_slot *Next; __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; void *DataEnd; char Data[DS_Worker_Warp_Slot_Size]; }; @@ -137,6 +138,7 @@ struct __kmpc_data_sharing_worker_slot_static { struct __kmpc_data_sharing_master_slot_static { __kmpc_data_sharing_slot *Next; __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; void *DataEnd; char Data[DS_Slot_Size]; }; @@ -267,6 +269,7 @@ public: // We currently do not have a next slot. master_rootS[0].Next = 0; master_rootS[0].Prev = 0; + master_rootS[0].PrevSlotStackPtr = 0; return (__kmpc_data_sharing_slot *)&master_rootS[0]; } // Initialize the pointer to the end of the slot given the size of the data @@ -276,6 +279,7 @@ public: // We currently do not have a next slot. worker_rootS[wid].Next = 0; worker_rootS[wid].Prev = 0; + worker_rootS[wid].PrevSlotStackPtr = 0; return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; }