[libomptarget][AMDGPU][NFC] Split atmi_memcpy for h2d and d2h

The calls to atmi_memcpy presently determine the direction of copy (host to
device or device to host) by storing pointers in a map during malloc and
looking up the pointers during memcpy. As each call site already knows the
direction, this stash+lookup can be eliminated.

This NFC will be followed by a functional one that deletes those map lookups.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D89776

Change-Id: I1d9089bc1e56b3a9a30e334735fa07dee1f84990
This commit is contained in:
Pushpinder Singh 2020-10-20 05:09:29 -04:00
parent be2afbd019
commit aa616efbb3
3 changed files with 45 additions and 55 deletions

View File

@ -35,9 +35,28 @@ atmi_status_t atmi_module_register_from_memory_to_place(
*/
atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
size_t size) {
hsa_status_t rc = hsa_memory_copy(dest, src, size);
// hsa_memory_copy sometimes fails in situations where
// allocate + copy succeeds. Looks like it might be related to
// locking part of a read only segment. Fall back for now.
if (rc == HSA_STATUS_SUCCESS) {
return ATMI_STATUS_SUCCESS;
}
return core::Runtime::Memcpy(sig, dest, src, size);
}
atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
const void *host_src, size_t size) {
return atmi_memcpy(sig, device_dest, host_src, size);
}
atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
const void *device_src, size_t size) {
return atmi_memcpy(sig, host_dest, device_src, size);
}
atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }
atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) {

View File

@ -155,53 +155,11 @@ atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place);
*/
atmi_status_t atmi_free(void *ptr);
/**
* @brief Syncrhonously copy memory from the source to destination memory
* locations.
*
* @detail This function assumes that the source and destination regions are
* non-overlapping. The runtime determines the memory place of the source and
* the
* destination and executes the appropriate optimized data movement methodology.
*
* @param[in] dest The destination pointer previously allocated by a system
* allocator or @p atmi_malloc.
*
* @param[in] src The source pointer previously allocated by a system
* allocator or @p atmi_malloc.
*
* @param[in] size The size of the data to be copied in bytes.
*
* @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
*
* @retval ::ATMI_STATUS_ERROR The function encountered errors.
*
* @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
*
*/
atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
size_t size);
atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
const void *host_src, size_t size);
static inline atmi_status_t atmi_memcpy_no_signal(void *dest, const void *src,
size_t size) {
hsa_signal_t sig;
hsa_status_t err = hsa_signal_create(0, 0, NULL, &sig);
if (err != HSA_STATUS_SUCCESS) {
return ATMI_STATUS_ERROR;
}
atmi_status_t r = atmi_memcpy(sig, dest, src, size);
hsa_status_t rc = hsa_signal_destroy(sig);
if (r != ATMI_STATUS_SUCCESS) {
return r;
}
if (rc != HSA_STATUS_SUCCESS) {
return ATMI_STATUS_ERROR;
}
return ATMI_STATUS_SUCCESS;
}
atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
const void *device_src, size_t size);
/** @} */

View File

@ -335,17 +335,29 @@ public:
static const int Default_WG_Size =
llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
atmi_status_t freesignalpool_memcpy(void *dest, const void *src,
size_t size) {
using MemcpyFunc = atmi_status_t(hsa_signal_t, void *, const void *,
size_t size);
atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
MemcpyFunc Func) {
hsa_signal_t s = FreeSignalPool.pop();
if (s.handle == 0) {
return ATMI_STATUS_ERROR;
}
atmi_status_t r = atmi_memcpy(s, dest, src, size);
atmi_status_t r = Func(s, dest, src, size);
FreeSignalPool.push(s);
return r;
}
atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
size_t size) {
return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h);
}
atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
size_t size) {
return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d);
}
// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
assert(device_id < (int32_t)FuncGblEntries.size() &&
@ -538,7 +550,7 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
(long long unsigned)(Elf64_Addr)TgtPtr,
(long long unsigned)(Elf64_Addr)HstPtr);
err = DeviceInfo.freesignalpool_memcpy(HstPtr, TgtPtr, (size_t)Size);
err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size);
if (err != ATMI_STATUS_SUCCESS) {
DP("Error when copying data from device to host. Pointers: "
@ -564,7 +576,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
(long long unsigned)(Elf64_Addr)HstPtr,
(long long unsigned)(Elf64_Addr)TgtPtr);
err = DeviceInfo.freesignalpool_memcpy(TgtPtr, HstPtr, (size_t)Size);
err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size);
if (err != ATMI_STATUS_SUCCESS) {
DP("Error when copying data from host to device. Pointers: "
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@ -1021,7 +1033,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
}
// write ptr to device memory so it can be used by later kernels
err = DeviceInfo.freesignalpool_memcpy(state_ptr, &ptr, sizeof(void *));
err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *));
if (err != ATMI_STATUS_SUCCESS) {
fprintf(stderr, "memcpy install of state_ptr failed\n");
return NULL;
@ -1090,7 +1102,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
// If unified memory is present any target link variables
// can access host addresses directly. There is no longer a
// need for device copies.
err = DeviceInfo.freesignalpool_memcpy(varptr, e->addr, sizeof(void *));
err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
sizeof(void *));
if (err != ATMI_STATUS_SUCCESS)
DP("Error when copying USM\n");
DP("Copy linked variable host address (" DPxMOD ")"
@ -1518,8 +1531,8 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
void *TgtPtr = NULL;
atmi_status_t err =
atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id));
err =
DeviceInfo.freesignalpool_memcpy(CallStackAddr, &TgtPtr, sizeof(void *));
err = DeviceInfo.freesignalpool_memcpy_h2d(CallStackAddr, &TgtPtr,
sizeof(void *));
if (print_kernel_trace > 2)
fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n",
(long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);