[Libomptarget] Change device free routines to accept the allocation kind

Previous support for device memory allocators used a single free
routine and did not provide the original kind of the allocation. This is
problematic as some of these memory types required different handling.
Previously this was worked around using a map in runtime to record the
original kind of each pointer. Instead, this patch introduces new free
routines similar to the existing allocation routines. This allows us to
avoid a map traversal every time we free a device pointer.

The only interfaces defined by the standard are `omp_target_alloc` and
`omp_target_free`, these do not take a kind as `omp_alloc` does. The
standard dictates the following:

"The omp_target_alloc routine returns a device pointer that references
the device address of a storage location of size bytes. The storage
location is dynamically allocated in the device data environment of the
device specified by device_num."

Which suggests that these routines only allocate the default device
memory for the kind. So this has been changed to reflect this. This
change is somewhat breaking if users were using `omp_target_free` as
previously shown in the tests.

Reviewed By: JonChesterfield, tianshilei1992

Differential Revision: https://reviews.llvm.org/D133053
This commit is contained in:
Joseph Huber 2022-08-31 15:55:14 -05:00
parent 5631d20bfc
commit 23bc343855
18 changed files with 117 additions and 68 deletions

View File

@ -411,8 +411,9 @@ struct DeviceTy {
void *allocData(int64_t Size, void *HstPtr = nullptr,
int32_t Kind = TARGET_ALLOC_DEFAULT);
/// Deallocates memory which \p TgtPtrBegin points at and returns
/// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
int32_t deleteData(void *TgtPtrBegin);
/// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. p Kind dictates what
/// allocator should be used (host, shared, device).
int32_t deleteData(void *TgtPtrBegin, int32_t Kind = TARGET_ALLOC_DEFAULT);
// Data transfer. When AsyncInfo is nullptr, the transfer will be
// synchronous.

View File

@ -247,6 +247,12 @@ void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
/// Explicit target memory deallocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
/// Dummy target so we have a symbol for generating host fallback.
void *llvm_omp_target_dynamic_shared_alloc();

View File

@ -117,8 +117,9 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr,
__tgt_async_info *AsyncInfo);
// De-allocate the data referenced by target ptr on the device. In case of
// success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);
// success, return zero. Otherwise, return an error code. Kind dictates what
// allocator to use (e.g. shared, host, device).
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr, int32_t Kind);
// Transfer control to the offloaded entry Entry on the target device.
// Args and Offsets are arrays of NumArgs size of target addresses and

View File

@ -49,7 +49,7 @@ struct RTLInfoTy {
typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t);
typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
int64_t, __tgt_async_info *);
typedef int32_t(data_delete_ty)(int32_t, void *);
typedef int32_t(data_delete_ty)(int32_t, void *, int32_t);
typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
int32_t);
typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *,

View File

@ -2606,7 +2606,7 @@ int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
}
int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) {
assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
// HSA can free pointers allocated from different types of memory pool.
hsa_status_t Err;

View File

@ -31,10 +31,11 @@ public:
/// Allocate a memory of size \p Size . \p HstPtr is used to assist the
/// allocation.
virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) = 0;
virtual void *allocate(size_t Size, void *HstPtr,
TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
/// Delete the pointer \p TgtPtr on the device
virtual int free(void *TgtPtr) = 0;
virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
};
/// Class of memory manager. The memory manager is per-device by using

View File

@ -20,7 +20,6 @@
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include "Debug.h"
@ -366,8 +365,6 @@ class DeviceRTLTy {
/// A class responsible for interacting with device native runtime library to
/// allocate and free memory.
class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
public:
void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
if (Size == 0)
@ -390,7 +387,6 @@ class DeviceRTLTy {
MemAlloc = HostPtr;
if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
return nullptr;
HostPinnedAllocs[MemAlloc] = Kind;
break;
case TARGET_ALLOC_SHARED:
CUdeviceptr SharedPtr;
@ -404,13 +400,9 @@ class DeviceRTLTy {
return MemAlloc;
}
int free(void *TgtPtr) override {
int free(void *TgtPtr, TargetAllocTy Kind) override {
CUresult Err;
// Host pinned memory must be freed differently.
TargetAllocTy Kind =
(HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
? TARGET_ALLOC_DEFAULT
: TARGET_ALLOC_HOST;
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
@ -1102,11 +1094,23 @@ public:
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}
int dataDelete(const int DeviceId, void *TgtPtr) {
if (UseMemoryManager)
return MemoryManagers[DeviceId]->free(TgtPtr);
int dataDelete(const int DeviceId, void *TgtPtr, TargetAllocTy Kind) {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
if (UseMemoryManager)
return MemoryManagers[DeviceId]->free(TgtPtr);
else
return DeviceAllocators[DeviceId].free(TgtPtr, Kind);
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
return DeviceAllocators[DeviceId].free(TgtPtr, Kind);
}
return DeviceAllocators[DeviceId].free(TgtPtr);
REPORT("Invalid target data allocation kind or requested allocator not "
"implemented yet\n");
return OFFLOAD_FAIL;
}
int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
@ -1699,13 +1703,13 @@ int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
}
int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
return DeviceRTL.dataDelete(DeviceId, TgtPtr);
return DeviceRTL.dataDelete(DeviceId, TgtPtr, (TargetAllocTy)Kind);
}
int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,

View File

@ -232,7 +232,7 @@ int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
return OFFLOAD_SUCCESS;
}
int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) {
free(TgtPtr);
return OFFLOAD_SUCCESS;
}

View File

@ -93,7 +93,7 @@ int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
return Manager->dataRetrieve(DeviceId, HstPtr, TgtPtr, Size);
}
int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) {
return Manager->dataDelete(DeviceId, TgtPtr);
}

View File

@ -392,7 +392,7 @@ int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
// De-allocate the data referenced by target ptr on the device. In case of
// success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) {
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr, int32_t) {
int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);
if (ret != 0) {

View File

@ -62,34 +62,25 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_SHARED, __func__);
}
EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
EXTERN void llvm_omp_target_free_device(void *Ptr, int DeviceNum) {
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEVICE, __func__);
}
EXTERN void llvm_omp_target_free_host(void *Ptr, int DeviceNum) {
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_HOST, __func__);
}
EXTERN void llvm_omp_target_free_shared(void *Ptre, int DeviceNum) {
return targetFreeExplicit(Ptre, DeviceNum, TARGET_ALLOC_SHARED, __func__);
}
EXTERN void *llvm_omp_target_dynamic_shared_alloc() { return nullptr; }
EXTERN void *llvm_omp_get_dynamic_shared() { return nullptr; }
EXTERN void omp_target_free(void *DevicePtr, int DeviceNum) {
TIMESCOPE();
DP("Call to omp_target_free for device %d and address " DPxMOD "\n",
DeviceNum, DPxPTR(DevicePtr));
if (!DevicePtr) {
DP("Call to omp_target_free with NULL ptr\n");
return;
}
if (DeviceNum == omp_get_initial_device()) {
free(DevicePtr);
DP("omp_target_free deallocated host ptr\n");
return;
}
if (!deviceIsReady(DeviceNum)) {
DP("omp_target_free returns, nothing to do\n");
return;
}
PM->Devices[DeviceNum]->deleteData(DevicePtr);
DP("omp_target_free deallocated device ptr\n");
}
EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
TIMESCOPE();
DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",

View File

@ -530,8 +530,8 @@ void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
}
int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
return RTL->data_delete(RTLDeviceID, TgtPtrBegin);
int32_t DeviceTy::deleteData(void *TgtPtrBegin, int32_t Kind) {
return RTL->data_delete(RTLDeviceID, TgtPtrBegin, Kind);
}
// Submit data to device

View File

@ -43,6 +43,9 @@ VERS1.0 {
llvm_omp_target_alloc_host;
llvm_omp_target_alloc_shared;
llvm_omp_target_alloc_device;
llvm_omp_target_free_host;
llvm_omp_target_free_shared;
llvm_omp_target_free_device;
llvm_omp_target_dynamic_shared_alloc;
__tgt_set_info_flag;
__tgt_print_device_info;

View File

@ -368,6 +368,32 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
return Rc;
}
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
TIMESCOPE();
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));
if (!DevicePtr) {
DP("Call to %s with NULL ptr\n", Name);
return;
}
if (DeviceNum == omp_get_initial_device()) {
free(DevicePtr);
DP("%s deallocated host ptr\n", Name);
return;
}
if (!deviceIsReady(DeviceNum)) {
DP("%s returns, nothing to do\n", Name);
return;
}
PM->Devices[DeviceNum]->deleteData(DevicePtr, Kind);
DP("omp_target_free deallocated device ptr\n");
}
/// Call the user-defined mapper function followed by the appropriate
// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,

View File

@ -49,6 +49,8 @@ extern void handleTargetOutcome(bool Success, ident_t *Loc);
extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc);
extern void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name);
extern void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name);
// This structure stores information of a mapped memory region.
struct MapComponentInfoTy {

View File

@ -5,6 +5,7 @@
#include <stdio.h>
void *llvm_omp_target_alloc_shared(size_t, int);
void llvm_omp_target_free_shared(void *, int);
int main() {
const int N = 64;
@ -12,8 +13,8 @@ int main() {
int *shared_ptr = llvm_omp_target_alloc_shared(N * sizeof(int), device);
#pragma omp target teams distribute parallel for device(device) \
is_device_ptr(shared_ptr)
#pragma omp target teams distribute parallel for device(device) \
is_device_ptr(shared_ptr)
for (int i = 0; i < N; ++i) {
shared_ptr[i] = 1;
}
@ -22,8 +23,8 @@ int main() {
for (int i = 0; i < N; ++i)
sum += shared_ptr[i];
omp_target_free(shared_ptr, device);
llvm_omp_target_free_shared(shared_ptr, device);
// CHECK: PASS
if (sum == N)
printf ("PASS\n");
printf("PASS\n");
}

View File

@ -5,6 +5,7 @@
// Allocate pinned memory on the host
void *llvm_omp_target_alloc_host(size_t, int);
void llvm_omp_target_free_host(void *, int);
int main() {
const int N = 64;
@ -25,7 +26,7 @@ int main() {
for (int i = 0; i < N; ++i)
sum += hst_ptr[i];
omp_target_free(hst_ptr, device);
llvm_omp_target_free_host(hst_ptr, device);
// CHECK: PASS
if (sum == N)
printf ("PASS\n");

View File

@ -1245,7 +1245,9 @@ static void **mk_dax_kmem_preferred;
static void *(*kmp_target_alloc_host)(size_t size, int device);
static void *(*kmp_target_alloc_shared)(size_t size, int device);
static void *(*kmp_target_alloc_device)(size_t size, int device);
static void *(*kmp_target_free)(void *ptr, int device);
static void *(*kmp_target_free_host)(void *ptr, int device);
static void *(*kmp_target_free_shared)(void *ptr, int device);
static void *(*kmp_target_free_device)(void *ptr, int device);
static bool __kmp_target_mem_available;
#define KMP_IS_TARGET_MEM_SPACE(MS) \
(MS == llvm_omp_target_host_mem_space || \
@ -1358,10 +1360,15 @@ void __kmp_init_target_mem() {
KMP_DLSYM("llvm_omp_target_alloc_shared");
*(void **)(&kmp_target_alloc_device) =
KMP_DLSYM("llvm_omp_target_alloc_device");
*(void **)(&kmp_target_free) = KMP_DLSYM("omp_target_free");
__kmp_target_mem_available = kmp_target_alloc_host &&
kmp_target_alloc_shared &&
kmp_target_alloc_device && kmp_target_free;
*(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host");
*(void **)(&kmp_target_free_shared) =
KMP_DLSYM("llvm_omp_target_free_shared");
*(void **)(&kmp_target_free_device) =
KMP_DLSYM("llvm_omp_target_free_device");
__kmp_target_mem_available =
kmp_target_alloc_host && kmp_target_alloc_shared &&
kmp_target_alloc_device && kmp_target_free_host &&
kmp_target_free_shared && kmp_target_free_device;
}
omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
@ -1774,13 +1781,18 @@ void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
kmp_mem_desc_t desc;
kmp_uintptr_t addr_align; // address to return to caller
kmp_uintptr_t addr_descr; // address of memory block descriptor
if (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
(allocator > kmp_max_mem_alloc &&
KMP_IS_TARGET_MEM_SPACE(al->memspace))) {
KMP_DEBUG_ASSERT(kmp_target_free);
if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
(allocator > kmp_max_mem_alloc &&
KMP_IS_TARGET_MEM_SPACE(al->memspace)))) {
kmp_int32 device =
__kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
kmp_target_free(ptr, device);
if (allocator == llvm_omp_target_host_mem_alloc) {
kmp_target_free_host(ptr, device);
} else if (allocator == llvm_omp_target_shared_mem_alloc) {
kmp_target_free_shared(ptr, device);
} else if (allocator == llvm_omp_target_device_mem_alloc) {
kmp_target_free_device(ptr, device);
}
return;
}