forked from OSchip/llvm-project
[libomptarget][AMDGPU][NFC] Split atmi_memcpy for h2d and d2h
The calls to atmi_memcpy presently determine the direction of copy (host to device or device to host) by storing pointers in a map during malloc and looking up the pointers during memcpy. As each call site already knows the direction, this stash+lookup can be eliminated. This NFC will be followed by a functional one that deletes those map lookups. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D89776 Change-Id: I1d9089bc1e56b3a9a30e334735fa07dee1f84990
This commit is contained in:
parent
be2afbd019
commit
aa616efbb3
|
@ -35,9 +35,28 @@ atmi_status_t atmi_module_register_from_memory_to_place(
|
|||
*/
|
||||
atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
|
||||
size_t size) {
|
||||
hsa_status_t rc = hsa_memory_copy(dest, src, size);
|
||||
|
||||
// hsa_memory_copy sometimes fails in situations where
|
||||
// allocate + copy succeeds. Looks like it might be related to
|
||||
// locking part of a read only segment. Fall back for now.
|
||||
if (rc == HSA_STATUS_SUCCESS) {
|
||||
return ATMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
return core::Runtime::Memcpy(sig, dest, src, size);
|
||||
}
|
||||
|
||||
atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
|
||||
const void *host_src, size_t size) {
|
||||
return atmi_memcpy(sig, device_dest, host_src, size);
|
||||
}
|
||||
|
||||
atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
|
||||
const void *device_src, size_t size) {
|
||||
return atmi_memcpy(sig, host_dest, device_src, size);
|
||||
}
|
||||
|
||||
atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }
|
||||
|
||||
atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) {
|
||||
|
|
|
@ -155,53 +155,11 @@ atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place);
|
|||
*/
|
||||
atmi_status_t atmi_free(void *ptr);
|
||||
|
||||
/**
|
||||
* @brief Syncrhonously copy memory from the source to destination memory
|
||||
* locations.
|
||||
*
|
||||
* @detail This function assumes that the source and destination regions are
|
||||
* non-overlapping. The runtime determines the memory place of the source and
|
||||
* the
|
||||
* destination and executes the appropriate optimized data movement methodology.
|
||||
*
|
||||
* @param[in] dest The destination pointer previously allocated by a system
|
||||
* allocator or @p atmi_malloc.
|
||||
*
|
||||
* @param[in] src The source pointer previously allocated by a system
|
||||
* allocator or @p atmi_malloc.
|
||||
*
|
||||
* @param[in] size The size of the data to be copied in bytes.
|
||||
*
|
||||
* @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
|
||||
*
|
||||
* @retval ::ATMI_STATUS_ERROR The function encountered errors.
|
||||
*
|
||||
* @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
|
||||
*
|
||||
*/
|
||||
atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
|
||||
size_t size);
|
||||
atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
|
||||
const void *host_src, size_t size);
|
||||
|
||||
static inline atmi_status_t atmi_memcpy_no_signal(void *dest, const void *src,
|
||||
size_t size) {
|
||||
hsa_signal_t sig;
|
||||
hsa_status_t err = hsa_signal_create(0, 0, NULL, &sig);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
|
||||
atmi_status_t r = atmi_memcpy(sig, dest, src, size);
|
||||
hsa_status_t rc = hsa_signal_destroy(sig);
|
||||
|
||||
if (r != ATMI_STATUS_SUCCESS) {
|
||||
return r;
|
||||
}
|
||||
if (rc != HSA_STATUS_SUCCESS) {
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return ATMI_STATUS_SUCCESS;
|
||||
}
|
||||
atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
|
||||
const void *device_src, size_t size);
|
||||
|
||||
/** @} */
|
||||
|
||||
|
|
|
@ -335,17 +335,29 @@ public:
|
|||
static const int Default_WG_Size =
|
||||
llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
|
||||
|
||||
atmi_status_t freesignalpool_memcpy(void *dest, const void *src,
|
||||
size_t size) {
|
||||
using MemcpyFunc = atmi_status_t(hsa_signal_t, void *, const void *,
|
||||
size_t size);
|
||||
atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
|
||||
MemcpyFunc Func) {
|
||||
hsa_signal_t s = FreeSignalPool.pop();
|
||||
if (s.handle == 0) {
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
atmi_status_t r = atmi_memcpy(s, dest, src, size);
|
||||
atmi_status_t r = Func(s, dest, src, size);
|
||||
FreeSignalPool.push(s);
|
||||
return r;
|
||||
}
|
||||
|
||||
atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
|
||||
size_t size) {
|
||||
return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h);
|
||||
}
|
||||
|
||||
atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
|
||||
size_t size) {
|
||||
return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d);
|
||||
}
|
||||
|
||||
// Record entry point associated with device
|
||||
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
|
||||
assert(device_id < (int32_t)FuncGblEntries.size() &&
|
||||
|
@ -538,7 +550,7 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
|
|||
(long long unsigned)(Elf64_Addr)TgtPtr,
|
||||
(long long unsigned)(Elf64_Addr)HstPtr);
|
||||
|
||||
err = DeviceInfo.freesignalpool_memcpy(HstPtr, TgtPtr, (size_t)Size);
|
||||
err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size);
|
||||
|
||||
if (err != ATMI_STATUS_SUCCESS) {
|
||||
DP("Error when copying data from device to host. Pointers: "
|
||||
|
@ -564,7 +576,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
|
|||
DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
|
||||
(long long unsigned)(Elf64_Addr)HstPtr,
|
||||
(long long unsigned)(Elf64_Addr)TgtPtr);
|
||||
err = DeviceInfo.freesignalpool_memcpy(TgtPtr, HstPtr, (size_t)Size);
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size);
|
||||
if (err != ATMI_STATUS_SUCCESS) {
|
||||
DP("Error when copying data from host to device. Pointers: "
|
||||
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
|
||||
|
@ -1021,7 +1033,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
|
|||
}
|
||||
|
||||
// write ptr to device memory so it can be used by later kernels
|
||||
err = DeviceInfo.freesignalpool_memcpy(state_ptr, &ptr, sizeof(void *));
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *));
|
||||
if (err != ATMI_STATUS_SUCCESS) {
|
||||
fprintf(stderr, "memcpy install of state_ptr failed\n");
|
||||
return NULL;
|
||||
|
@ -1090,7 +1102,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
|
|||
// If unified memory is present any target link variables
|
||||
// can access host addresses directly. There is no longer a
|
||||
// need for device copies.
|
||||
err = DeviceInfo.freesignalpool_memcpy(varptr, e->addr, sizeof(void *));
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
|
||||
sizeof(void *));
|
||||
if (err != ATMI_STATUS_SUCCESS)
|
||||
DP("Error when copying USM\n");
|
||||
DP("Copy linked variable host address (" DPxMOD ")"
|
||||
|
@ -1518,8 +1531,8 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
|
|||
void *TgtPtr = NULL;
|
||||
atmi_status_t err =
|
||||
atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id));
|
||||
err =
|
||||
DeviceInfo.freesignalpool_memcpy(CallStackAddr, &TgtPtr, sizeof(void *));
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(CallStackAddr, &TgtPtr,
|
||||
sizeof(void *));
|
||||
if (print_kernel_trace > 2)
|
||||
fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n",
|
||||
(long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);
|
||||
|
|
Loading…
Reference in New Issue