From 2468fdd9af361cb46d02d00a52e87067e7078127 Mon Sep 17 00:00:00 2001 From: George Rokos Date: Wed, 3 Mar 2021 11:48:32 -0800 Subject: [PATCH] [libomptarget] Add allocator support for target memory This patch adds the infrastructure for allocator support for target memory. Three allocators are introduced for device, host and shared memory. The corresponding API functions have the llvm_ prefix temporarily, until they become part of the OpenMP standard. Differential Revision: https://reviews.llvm.org/D97883 --- openmp/libomptarget/include/omptarget.h | 13 ++++++++ openmp/libomptarget/include/omptargetplugin.h | 6 ++-- .../libomptarget/plugins/amdgpu/src/rtl.cpp | 9 +++++- openmp/libomptarget/plugins/cuda/src/rtl.cpp | 9 +++++- .../plugins/generic-elf-64bit/src/rtl.cpp | 19 +++++++++-- .../libomptarget/plugins/remote/src/rtl.cpp | 9 +++++- openmp/libomptarget/plugins/ve/src/rtl.cpp | 9 +++++- openmp/libomptarget/src/api.cpp | 32 ++++++------------- openmp/libomptarget/src/device.cpp | 4 +-- openmp/libomptarget/src/device.h | 9 ++++-- openmp/libomptarget/src/exports | 3 ++ openmp/libomptarget/src/omptarget.cpp | 29 +++++++++++++++++ openmp/libomptarget/src/private.h | 2 ++ openmp/libomptarget/src/rtl.h | 2 +- 14 files changed, 119 insertions(+), 36 deletions(-) diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index 7b317c45c4cd..9e3667ee2b87 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -86,6 +86,13 @@ enum OpenMPOffloadingRequiresDirFlags { OMP_REQ_DYNAMIC_ALLOCATORS = 0x010 }; +enum TargetAllocTy : int32_t { + TARGET_ALLOC_DEVICE = 0, + TARGET_ALLOC_HOST, + TARGET_ALLOC_SHARED, + TARGET_ALLOC_DEFAULT +}; + /// This struct is a record of an entry point or global. 
For a function /// entry point the size is expected to be zero struct __tgt_offload_entry { @@ -190,6 +197,12 @@ int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, size_t device_offset, int device_num); int omp_target_disassociate_ptr(void *host_ptr, int device_num); +/// Explicit target memory allocators +/// Using the llvm_ prefix until they become part of the OpenMP standard. +void *llvm_omp_target_alloc_device(size_t size, int device_num); +void *llvm_omp_target_alloc_host(size_t size, int device_num); +void *llvm_omp_target_alloc_shared(size_t size, int device_num); + /// add the clauses of the requires directives in a given file void __tgt_register_requires(int64_t flags); diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h index a315cddabe20..721b9d5cd478 100644 --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -65,8 +65,10 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t ID, // initialize the target data mapping structures. These addresses are // used to generate a table of target variables to pass to // __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in -// case an error occurred on the target device. -void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); +// case an error occurred on the target device. Kind dictates what allocator +// to use (e.g. shared, host, device). +void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr, + int32_t Kind); // Pass the data content to the target device using the target address. In case // of success, return zero. Otherwise, return an error code. 
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 68d791305178..0e8df9e9ca60 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1488,9 +1488,16 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, return DeviceInfo.getOffloadEntriesTable(device_id); } -void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *) { +void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { void *ptr = NULL; assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + + if (kind != TARGET_ALLOC_DEFAULT) { + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet\n"); + return NULL; + } + atmi_status_t err = atmi_malloc(&ptr, size, get_gpu_mem_place(device_id)); DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, (long long unsigned)(Elf64_Addr)ptr); diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp index 564709900892..3d0424f16037 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -1095,9 +1095,16 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, return DeviceRTL.loadBinary(device_id, image); } -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, + int32_t kind) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + if (kind != TARGET_ALLOC_DEFAULT) { + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet\n"); + return NULL; + } + return DeviceRTL.dataAlloc(device_id, size); } diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp index eb6ebc1c35c1..27cb39c5dcf6 100644 --- 
a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -250,8 +250,23 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, return DeviceInfo.getOffloadEntriesTable(device_id); } -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { - void *ptr = malloc(size); +// Sample implementation of explicit memory allocator. For this plugin all kinds +// are equivalent to each other. +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr, + int32_t kind) { + void *ptr = NULL; + + switch (kind) { + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + case TARGET_ALLOC_DEFAULT: + ptr = malloc(size); + break; + default: + REPORT("Invalid target data allocation kind\n"); + } + return ptr; } diff --git a/openmp/libomptarget/plugins/remote/src/rtl.cpp b/openmp/libomptarget/plugins/remote/src/rtl.cpp index 20c415be82a4..26f172a1fdcf 100644 --- a/openmp/libomptarget/plugins/remote/src/rtl.cpp +++ b/openmp/libomptarget/plugins/remote/src/rtl.cpp @@ -84,7 +84,14 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId) { return Manager->isDataExchangeable(SrcDevId, DstDevId); } -void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr) { +void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr, + int32_t kind) { + if (kind != TARGET_ALLOC_DEFAULT) { + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet\n"); + return NULL; + } + return Manager->dataAlloc(DeviceId, Size, HstPtr); } diff --git a/openmp/libomptarget/plugins/ve/src/rtl.cpp b/openmp/libomptarget/plugins/ve/src/rtl.cpp index a77cd31e4867..2b9c17e368a2 100644 --- a/openmp/libomptarget/plugins/ve/src/rtl.cpp +++ b/openmp/libomptarget/plugins/ve/src/rtl.cpp @@ -330,10 +330,17 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t ID, // used to generate a table of target variables to pass to 
// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in // case an error occurred on the target device. -void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) { +void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr, + int32_t kind) { int ret; uint64_t addr; + if (kind != TARGET_ALLOC_DEFAULT) { + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet\n"); + return NULL; + } + if (DeviceInfo.ProcHandles[ID] == NULL) { struct veo_proc_handle *proc_handle; proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]); diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index adacc5ae4c01..3c6142a36904 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -38,31 +38,19 @@ EXTERN int omp_get_initial_device(void) { } EXTERN void *omp_target_alloc(size_t size, int device_num) { - TIMESCOPE(); - DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", - device_num, size); + return targetAllocExplicit(size, device_num, TARGET_ALLOC_DEFAULT, __func__); +} - if (size <= 0) { - DP("Call to omp_target_alloc with non-positive length\n"); - return NULL; - } +EXTERN void *llvm_omp_target_alloc_device(size_t size, int device_num) { + return targetAllocExplicit(size, device_num, TARGET_ALLOC_DEVICE, __func__); +} - void *rc = NULL; +EXTERN void *llvm_omp_target_alloc_host(size_t size, int device_num) { + return targetAllocExplicit(size, device_num, TARGET_ALLOC_HOST, __func__); +} - if (device_num == omp_get_initial_device()) { - rc = malloc(size); - DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc)); - return rc; - } - - if (!device_is_ready(device_num)) { - DP("omp_target_alloc returns NULL ptr\n"); - return NULL; - } - - rc = PM->Devices[device_num].allocData(size); - DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); - return rc; +EXTERN void *llvm_omp_target_alloc_shared(size_t size, int device_num) { + 
return targetAllocExplicit(size, device_num, TARGET_ALLOC_SHARED, __func__); } EXTERN void omp_target_free(void *device_ptr, int device_num) { diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 50017ac1e906..8f605a0a2185 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -405,8 +405,8 @@ __tgt_target_table *DeviceTy::load_binary(void *Img) { return rc; } -void *DeviceTy::allocData(int64_t Size, void *HstPtr) { - return RTL->data_alloc(RTLDeviceID, Size, HstPtr); +void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) { + return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind); } int32_t DeviceTy::deleteData(void *TgtPtrBegin) { diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h index 7ecdec767272..32d1e9681ac7 100644 --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -185,13 +185,16 @@ struct DeviceTy { __tgt_target_table *load_binary(void *Img); // device memory allocation/deallocation routines - /// Allocates \p Size bytes on the device and returns the address/nullptr when + /// Allocates \p Size bytes on the device, host or shared memory space + /// (depending on \p Kind) and returns the address/nullptr when /// succeeds/fails. \p HstPtr is an address of the host data which the /// allocated target data will be associated with. If it is unknown, the /// default value of \p HstPtr is nullptr. Note: this function doesn't do /// pointer association. Actually, all the __tgt_rtl_data_alloc - /// implementations ignore \p HstPtr. - void *allocData(int64_t Size, void *HstPtr = nullptr); + /// implementations ignore \p HstPtr. \p Kind dictates what allocator should + /// be used (host, shared, device). + void *allocData(int64_t Size, void *HstPtr = nullptr, + int32_t Kind = TARGET_ALLOC_DEFAULT); /// Deallocates memory which \p TgtPtrBegin points at and returns /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. 
int32_t deleteData(void *TgtPtrBegin); diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports index b7fc1c8c3c86..7992daa825e5 100644 --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -36,6 +36,9 @@ VERS1.0 { omp_target_memcpy_rect; omp_target_associate_ptr; omp_target_disassociate_ptr; + llvm_omp_target_alloc_host; + llvm_omp_target_alloc_shared; + llvm_omp_target_alloc_device; local: *; }; diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 64a529274422..2c6af57efa9f 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -328,6 +328,35 @@ static int32_t getParentIndex(int64_t type) { return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; } +void *targetAllocExplicit(size_t size, int device_num, int kind, + const char *name) { + TIMESCOPE(); + DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size); + + if (size <= 0) { + DP("Call to %s with non-positive length\n", name); + return NULL; + } + + void *rc = NULL; + + if (device_num == omp_get_initial_device()) { + rc = malloc(size); + DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc)); + return rc; + } + + if (!device_is_ready(device_num)) { + DP("%s returns NULL ptr\n", name); + return NULL; + } + + DeviceTy &Device = PM->Devices[device_num]; + rc = Device.allocData(size, nullptr, kind); + DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc)); + return rc; +} + /// Call the user-defined mapper function followed by the appropriate // targetData* function (targetData{Begin,End,Update}). 
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg, diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index 8e4db656afcb..a97d7012a9d7 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -46,6 +46,8 @@ extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, extern void handleTargetOutcome(bool Success, ident_t *Loc); extern int checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc); +extern void *targetAllocExplicit(size_t size, int device_num, int kind, + const char *name); // This structure stores information of a mapped memory region. struct MapComponentInfoTy { diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h index a67b8682a0f4..ae11eee580aa 100644 --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -30,7 +30,7 @@ struct RTLInfoTy { typedef int32_t(number_of_devices_ty)(); typedef int32_t(init_device_ty)(int32_t); typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); - typedef void *(data_alloc_ty)(int32_t, int64_t, void *); + typedef void *(data_alloc_ty)(int32_t, int64_t, void *, int32_t); typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, __tgt_async_info *);