forked from OSchip/llvm-project
[OpenMP][AMDGPU] Switch host-device memory copy to asynchronous version
Prepare amdgpu plugin for asynchronous implementation. This patch switches to using HSA API for asynchronous memory copy. Moving away from hsa_memory_copy means that plugin is responsible for locking/unlocking host memory pointers. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D115279
This commit is contained in:
parent
c2acd45393
commit
6de698bf10
|
@ -5,9 +5,6 @@
|
|||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "hsa_api.h"
|
||||
#include "impl_runtime.h"
|
||||
#include "internal.h"
|
||||
#include "rt.h"
|
||||
#include <memory>
|
||||
|
||||
|
@ -15,29 +12,27 @@
|
|||
* Data
|
||||
*/
|
||||
|
||||
static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest,
|
||||
const void *src, size_t size,
|
||||
hsa_agent_t agent) {
|
||||
// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
|
||||
static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
|
||||
hsa_agent_t agent, const void *src,
|
||||
size_t size) {
|
||||
const hsa_signal_value_t init = 1;
|
||||
const hsa_signal_value_t success = 0;
|
||||
hsa_signal_store_screlease(sig, init);
|
||||
hsa_signal_store_screlease(signal, init);
|
||||
|
||||
hsa_status_t err =
|
||||
hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
|
||||
nullptr, signal);
|
||||
if (err != HSA_STATUS_SUCCESS)
|
||||
return err;
|
||||
}
|
||||
|
||||
// async_copy reports success by decrementing and failure by setting to < 0
|
||||
hsa_signal_value_t got = init;
|
||||
while (got == init) {
|
||||
got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init,
|
||||
while (got == init)
|
||||
got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
|
||||
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
}
|
||||
|
||||
if (got != success) {
|
||||
if (got != success)
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
@ -48,19 +43,58 @@ struct implFreePtrDeletor {
|
|||
}
|
||||
};
|
||||
|
||||
hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
|
||||
const void *hostSrc, size_t size,
|
||||
hsa_agent_t agent,
|
||||
hsa_amd_memory_pool_t MemoryPool) {
|
||||
hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size);
|
||||
enum CopyDirection { H2D, D2H };
|
||||
|
||||
// hsa_memory_copy sometimes fails in situations where
|
||||
// allocate + copy succeeds. Looks like it might be related to
|
||||
// locking part of a read only segment. Fall back for now.
|
||||
if (rc == HSA_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
|
||||
hsa_signal_t signal, void *dest,
|
||||
hsa_agent_t agent, void *src,
|
||||
void *lockingPtr, size_t size) {
|
||||
hsa_status_t err;
|
||||
|
||||
void *lockedPtr = nullptr;
|
||||
err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
|
||||
if (err != HSA_STATUS_SUCCESS)
|
||||
return err;
|
||||
|
||||
switch (direction) {
|
||||
case H2D:
|
||||
err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
|
||||
break;
|
||||
case D2H:
|
||||
err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
|
||||
break;
|
||||
default:
|
||||
err = HSA_STATUS_ERROR; // fall into unlock before returning
|
||||
}
|
||||
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
// do not leak locked host pointers, but discard potential error message
|
||||
hsa_amd_memory_unlock(lockingPtr);
|
||||
return err;
|
||||
}
|
||||
|
||||
err = hsa_amd_memory_unlock(lockingPtr);
|
||||
if (err != HSA_STATUS_SUCCESS)
|
||||
return err;
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
|
||||
void *hostSrc, size_t size,
|
||||
hsa_agent_t device_agent,
|
||||
hsa_amd_memory_pool_t MemoryPool) {
|
||||
hsa_status_t err;
|
||||
|
||||
err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
|
||||
device_agent, hostSrc, hostSrc, size);
|
||||
|
||||
if (err == HSA_STATUS_SUCCESS)
|
||||
return err;
|
||||
|
||||
// async memcpy sometimes fails in situations where
|
||||
// allocate + copy succeeds. Looks like it might be related to
|
||||
// locking part of a read only segment. Fall back for now.
|
||||
void *tempHostPtr;
|
||||
hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
|
||||
if (ret != HSA_STATUS_SUCCESS) {
|
||||
|
@ -70,26 +104,26 @@ hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
|
|||
std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
|
||||
memcpy(tempHostPtr, hostSrc, size);
|
||||
|
||||
if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
|
||||
device_agent, tempHostPtr, tempHostPtr, size);
|
||||
}
|
||||
|
||||
hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest,
|
||||
const void *deviceSrc, size_t size,
|
||||
hsa_agent_t agent,
|
||||
hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
|
||||
void *deviceSrc, size_t size,
|
||||
hsa_agent_t deviceAgent,
|
||||
hsa_amd_memory_pool_t MemoryPool) {
|
||||
hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size);
|
||||
hsa_status_t err;
|
||||
|
||||
// device has always visibility over both pointers, so use that
|
||||
err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
|
||||
deviceSrc, hostDest, size);
|
||||
|
||||
if (err == HSA_STATUS_SUCCESS)
|
||||
return err;
|
||||
|
||||
// hsa_memory_copy sometimes fails in situations where
|
||||
// allocate + copy succeeds. Looks like it might be related to
|
||||
// locking part of a read only segment. Fall back for now.
|
||||
if (rc == HSA_STATUS_SUCCESS) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void *tempHostPtr;
|
||||
hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
|
||||
if (ret != HSA_STATUS_SUCCESS) {
|
||||
|
@ -98,11 +132,11 @@ hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest,
|
|||
}
|
||||
std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
|
||||
|
||||
if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
|
||||
deviceAgent, deviceSrc, tempHostPtr, size);
|
||||
if (err != HSA_STATUS_SUCCESS)
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
memcpy(dest, tempHostPtr, size);
|
||||
memcpy(hostDest, tempHostPtr, size);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
|
|
@ -19,13 +19,12 @@ hsa_status_t impl_module_register_from_memory_to_place(
|
|||
void *cb_state);
|
||||
|
||||
hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
|
||||
const void *hostSrc, size_t size,
|
||||
hsa_agent_t agent,
|
||||
void *hostSrc, size_t size,
|
||||
hsa_agent_t device_agent,
|
||||
hsa_amd_memory_pool_t MemoryPool);
|
||||
|
||||
hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest,
|
||||
const void *deviceSrc, size_t size,
|
||||
hsa_agent_t agent,
|
||||
hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc,
|
||||
size_t size, hsa_agent_t device_agent,
|
||||
hsa_amd_memory_pool_t MemoryPool);
|
||||
}
|
||||
|
||||
|
|
|
@ -464,10 +464,9 @@ public:
|
|||
"");
|
||||
static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size;
|
||||
|
||||
using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
|
||||
size_t size, hsa_agent_t,
|
||||
hsa_amd_memory_pool_t);
|
||||
hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
|
||||
using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t size,
|
||||
hsa_agent_t, hsa_amd_memory_pool_t);
|
||||
hsa_status_t freesignalpool_memcpy(void *dest, void *src, size_t size,
|
||||
MemcpyFunc Func, int32_t deviceId) {
|
||||
hsa_agent_t agent = HSAAgents[deviceId];
|
||||
hsa_signal_t s = FreeSignalPool.pop();
|
||||
|
@ -479,13 +478,13 @@ public:
|
|||
return r;
|
||||
}
|
||||
|
||||
hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
|
||||
size_t size, int32_t deviceId) {
|
||||
hsa_status_t freesignalpool_memcpy_d2h(void *dest, void *src, size_t size,
|
||||
int32_t deviceId) {
|
||||
return freesignalpool_memcpy(dest, src, size, impl_memcpy_d2h, deviceId);
|
||||
}
|
||||
|
||||
hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
|
||||
size_t size, int32_t deviceId) {
|
||||
hsa_status_t freesignalpool_memcpy_h2d(void *dest, void *src, size_t size,
|
||||
int32_t deviceId) {
|
||||
return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue