From 55dc123555dbb3e7ca717c1ecc0ab6cd934bdee5 Mon Sep 17 00:00:00 2001 From: JonChesterfield Date: Wed, 21 Oct 2020 16:59:16 +0100 Subject: [PATCH] [libomptarget][amdgcn] Refactor memcpy to eliminate maps [libomptarget][amdgcn] Refactor memcpy to eliminate maps Builds on D89776 to remove now dead code. Reviewed By: pdhaliwal Differential Revision: https://reviews.llvm.org/D89888 --- .../libomptarget/plugins/amdgpu/impl/atmi.cpp | 99 +++++++++++-- .../plugins/amdgpu/impl/atmi_runtime.h | 10 +- .../libomptarget/plugins/amdgpu/impl/data.cpp | 131 +----------------- .../libomptarget/plugins/amdgpu/impl/data.h | 83 ----------- .../libomptarget/plugins/amdgpu/src/rtl.cpp | 30 ++-- 5 files changed, 112 insertions(+), 241 deletions(-) delete mode 100644 openmp/libomptarget/plugins/amdgpu/impl/data.h diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp index 0586cd358c25..285dc2dbe763 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp @@ -3,7 +3,13 @@ * * This file is distributed under the MIT License. See LICENSE.txt for details. *===------------------------------------------------------------------------*/ +#include "atmi_runtime.h" +#include "internal.h" #include "rt.h" +#include +#include +#include + /* * Initialize/Finalize */ @@ -33,9 +39,44 @@ atmi_status_t atmi_module_register_from_memory_to_place( /* * Data */ -atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src, - size_t size) { - hsa_status_t rc = hsa_memory_copy(dest, src, size); + +static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest, + const void *src, size_t size, + hsa_agent_t agent) { + const hsa_signal_value_t init = 1; + const hsa_signal_value_t success = 0; + hsa_signal_store_screlease(sig, init); + + hsa_status_t err = + hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig); + if (err != HSA_STATUS_SUCCESS) { + return err; + } + + // async_copy reports success by decrementing and failure by setting to < 0 + hsa_signal_value_t got = init; + while (got == init) { + got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init, + UINT64_MAX, ATMI_WAIT_STATE); + } + + if (got != success) { + return HSA_STATUS_ERROR; + } + + return err; +} + +struct atmiFreePtrDeletor { + void operator()(void *p) { + atmi_free(p); // ignore failure to free + } +}; + +atmi_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest, + const void *hostSrc, size_t size, + hsa_agent_t agent) { + hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size); // hsa_memory_copy sometimes fails in situations where // allocate + copy succeeds. Looks like it might be related to @@ -44,17 +85,53 @@ atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src, return ATMI_STATUS_SUCCESS; } - return core::Runtime::Memcpy(sig, dest, src, size); + void *tempHostPtr; + atmi_mem_place_t CPU = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0); + atmi_status_t ret = atmi_malloc(&tempHostPtr, size, CPU); + if (ret != ATMI_STATUS_SUCCESS) { + DEBUG_PRINT("atmi_malloc: Unable to alloc %d bytes for temp scratch\n", + size); + return ret; + } + std::unique_ptr del(tempHostPtr); + memcpy(tempHostPtr, hostSrc, size); + + if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) != + HSA_STATUS_SUCCESS) { + return ATMI_STATUS_ERROR; + } + return ATMI_STATUS_SUCCESS; } -atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest, - const void *host_src, size_t size) { - return atmi_memcpy(sig, device_dest, host_src, size); -} +atmi_status_t atmi_memcpy_d2h(hsa_signal_t signal, void *dest, + const void *deviceSrc, size_t size, + hsa_agent_t agent) { + hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size); -atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest, - const void *device_src, size_t size) { - return atmi_memcpy(sig, host_dest, device_src, size); + // hsa_memory_copy sometimes fails in situations where + // allocate + copy succeeds. Looks like it might be related to + // locking part of a read only segment. Fall back for now. + if (rc == HSA_STATUS_SUCCESS) { + return ATMI_STATUS_SUCCESS; + } + + void *tempHostPtr; + atmi_mem_place_t CPU = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0); + atmi_status_t ret = atmi_malloc(&tempHostPtr, size, CPU); + if (ret != ATMI_STATUS_SUCCESS) { + DEBUG_PRINT("atmi_malloc: Unable to alloc %d bytes for temp scratch\n", + size); + return ret; + } + std::unique_ptr del(tempHostPtr); + + if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) != + HSA_STATUS_SUCCESS) { + return ATMI_STATUS_ERROR; + } + + memcpy(dest, tempHostPtr, size); + return ATMI_STATUS_SUCCESS; } atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); } diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h index a935b6ad4b75..47022f7f5dea 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h @@ -155,11 +155,13 @@ atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place); */ atmi_status_t atmi_free(void *ptr); -atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest, - const void *host_src, size_t size); +atmi_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest, + const void *hostSrc, size_t size, + hsa_agent_t agent); -atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest, - const void *device_src, size_t size); +atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *hostDest, + const void *deviceSrc, size_t size, + hsa_agent_t agent); /** @} */ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp index 91627bd8d4b3..39546fbae4b3 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp @@ -3,7 +3,6 @@ * * This file is distributed under the MIT License. See LICENSE.txt for details. *===------------------------------------------------------------------------*/ -#include "data.h" #include "atmi_runtime.h" #include "internal.h" #include "machine.h" @@ -21,7 +20,6 @@ using core::TaskImpl; extern ATLMachine g_atl_machine; namespace core { -ATLPointerTracker g_data_map; // Track all am pointer allocations. void allow_access_to_all_gpu_agents(void *ptr); const char *getPlaceStr(atmi_devtype_t type) { @@ -35,37 +33,6 @@ const char *getPlaceStr(atmi_devtype_t type) { } } -std::ostream &operator<<(std::ostream &os, const ATLData *ap) { - atmi_mem_place_t place = ap->place(); - os << " devicePointer:" << ap->ptr() << " sizeBytes:" << ap->size() - << " place:(" << getPlaceStr(place.dev_type) << ", " << place.dev_id - << ", " << place.mem_id << ")"; - return os; -} - -void ATLPointerTracker::insert(void *pointer, ATLData *p) { - std::lock_guard l(mutex_); - - DEBUG_PRINT("insert: %p + %zu\n", pointer, p->size()); - tracker_.insert(std::make_pair(ATLMemoryRange(pointer, p->size()), p)); -} - -void ATLPointerTracker::remove(void *pointer) { - std::lock_guard l(mutex_); - DEBUG_PRINT("remove: %p\n", pointer); - tracker_.erase(ATLMemoryRange(pointer, 1)); -} - -ATLData *ATLPointerTracker::find(const void *pointer) { - std::lock_guard l(mutex_); - ATLData *ret = NULL; - auto iter = tracker_.find(ATLMemoryRange(pointer, 1)); - DEBUG_PRINT("find: %p\n", pointer); - if (iter != tracker_.end()) // found - ret = iter->second; - return ret; -} - ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) { int dev_id = place.dev_id; switch (place.dev_type) { @@ -76,18 +43,12 @@ ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) { } } -static hsa_agent_t get_mem_agent(atmi_mem_place_t place) { - return get_processor_by_mem_place(place).agent(); -} - hsa_amd_memory_pool_t get_memory_pool_by_mem_place(atmi_mem_place_t place) { ATLProcessor &proc = get_processor_by_mem_place(place); return get_memory_pool(proc, place.mem_id); } void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) { - ATLData *data = new ATLData(ptr, size, place); - g_data_map.insert(ptr, data); if (place.dev_type == ATMI_DEVTYPE_CPU) allow_access_to_all_gpu_agents(ptr); // TODO(ashwinma): what if one GPU wants to access another GPU? @@ -112,103 +73,13 @@ atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) { atmi_status_t Runtime::Memfree(void *ptr) { atmi_status_t ret = ATMI_STATUS_SUCCESS; hsa_status_t err; - ATLData *data = g_data_map.find(ptr); - if (!data) - ErrorCheck(Checking pointer info userData, - HSA_STATUS_ERROR_INVALID_ALLOCATION); - - g_data_map.remove(ptr); - delete data; - err = hsa_amd_memory_pool_free(ptr); ErrorCheck(atmi_free, err); DEBUG_PRINT("Freed %p\n", ptr); - if (err != HSA_STATUS_SUCCESS || !data) + if (err != HSA_STATUS_SUCCESS) ret = ATMI_STATUS_ERROR; return ret; } -static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest, - const void *src, size_t size, - hsa_agent_t agent) { - const hsa_signal_value_t init = 1; - const hsa_signal_value_t success = 0; - hsa_signal_store_screlease(sig, init); - - hsa_status_t err = - hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig); - if (err != HSA_STATUS_SUCCESS) { - return err; - } - - // async_copy reports success by decrementing and failure by setting to < 0 - hsa_signal_value_t got = init; - while (got == init) { - got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init, - UINT64_MAX, ATMI_WAIT_STATE); - } - - if (got != success) { - return HSA_STATUS_ERROR; - } - - return err; -} - -struct atmiFreePtrDeletor { - void operator()(void *p) { - atmi_free(p); // ignore failure to free - } -}; - -atmi_status_t Runtime::Memcpy(hsa_signal_t sig, void *dest, const void *src, - size_t size) { - ATLData *src_data = g_data_map.find(src); - ATLData *dest_data = g_data_map.find(dest); - atmi_mem_place_t cpu = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0); - - void *temp_host_ptr; - atmi_status_t ret = atmi_malloc(&temp_host_ptr, size, cpu); - if (ret != ATMI_STATUS_SUCCESS) { - return ret; - } - std::unique_ptr del(temp_host_ptr); - - if (src_data && !dest_data) { - // Copy from device to scratch to host - hsa_agent_t agent = get_mem_agent(src_data->place()); - DEBUG_PRINT("Memcpy D2H device agent: %lu\n", agent.handle); - - if (invoke_hsa_copy(sig, temp_host_ptr, src, size, agent) != - HSA_STATUS_SUCCESS) { - return ATMI_STATUS_ERROR; - } - - memcpy(dest, temp_host_ptr, size); - - } else if (!src_data && dest_data) { - // Copy from host to scratch to device - hsa_agent_t agent = get_mem_agent(dest_data->place()); - DEBUG_PRINT("Memcpy H2D device agent: %lu\n", agent.handle); - - memcpy(temp_host_ptr, src, size); - - if (invoke_hsa_copy(sig, dest, temp_host_ptr, size, agent) != - HSA_STATUS_SUCCESS) { - return ATMI_STATUS_ERROR; - } - - } else if (!src_data && !dest_data) { - // would be host to host, just call memcpy, or missing metadata - DEBUG_PRINT("atmi_memcpy invoked without metadata\n"); - return ATMI_STATUS_ERROR; - } else { - DEBUG_PRINT("atmi_memcpy unimplemented device to device copy\n"); - return ATMI_STATUS_ERROR; - } - - return ATMI_STATUS_SUCCESS; -} - } // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.h b/openmp/libomptarget/plugins/amdgpu/impl/data.h deleted file mode 100644 index fa9e7380bf67..000000000000 --- a/openmp/libomptarget/plugins/amdgpu/impl/data.h +++ /dev/null @@ -1,83 +0,0 @@ -/*===-------------------------------------------------------------------------- - * ATMI (Asynchronous Task and Memory Interface) - * - * This file is distributed under the MIT License. See LICENSE.txt for details. - *===------------------------------------------------------------------------*/ -#ifndef SRC_RUNTIME_INCLUDE_DATA_H_ -#define SRC_RUNTIME_INCLUDE_DATA_H_ -#include "atmi.h" -#include -#include -#include -#include -#include -// we maintain our own mapping of device addr to a user specified data object -// in order to work around a (possibly historic) bug in ROCr's -// hsa_amd_pointer_info_set_userdata for variable symbols -// this is expected to be temporary - -namespace core { -// Internal representation of any data that is created and managed by ATMI. -// Data can be located on any device memory or host memory. -class ATLData { -public: - ATLData(void *ptr, size_t size, atmi_mem_place_t place) - : ptr_(ptr), size_(size), place_(place) {} - - void *ptr() const { return ptr_; } - size_t size() const { return size_; } - atmi_mem_place_t place() const { return place_; } - -private: - void *ptr_; - size_t size_; - atmi_mem_place_t place_; -}; - -//--- -struct ATLMemoryRange { - const void *base_pointer; - const void *end_pointer; - ATLMemoryRange(const void *bp, size_t size_bytes) - : base_pointer(bp), - end_pointer(reinterpret_cast(bp) + size_bytes - - 1) {} -}; - -// Functor to compare ranges: -struct ATLMemoryRangeCompare { - // Return true is LHS range is less than RHS - used to order the ranges - bool operator()(const ATLMemoryRange &lhs, const ATLMemoryRange &rhs) const { - return lhs.end_pointer < rhs.base_pointer; - } -}; - -//------------------------------------------------------------------------------------------------- -// This structure tracks information for each pointer. -// Uses memory-range-based lookups - so pointers that exist anywhere in the -// range of hostPtr + size -// will find the associated ATLPointerInfo. -// The insertions and lookups use a self-balancing binary tree and should -// support O(logN) lookup speed. -// The structure is thread-safe - writers obtain a mutex before modifying the -// tree. Multiple simulatenous readers are supported. -class ATLPointerTracker { - typedef std::map - MapTrackerType; - -public: - void insert(void *pointer, ATLData *data); - void remove(void *pointer); - ATLData *find(const void *pointer); - -private: - MapTrackerType tracker_; - std::mutex mutex_; -}; - -extern ATLPointerTracker g_data_map; // Track all am pointer allocations. - -enum class Direction { ATMI_H2D, ATMI_D2H, ATMI_D2D, ATMI_H2H }; - -} // namespace core -#endif // SRC_RUNTIME_INCLUDE_DATA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 8fd7b7e34155..dc3a288903f0 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -335,27 +335,28 @@ public: static const int Default_WG_Size = llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size]; - using MemcpyFunc = atmi_status_t(hsa_signal_t, void *, const void *, - size_t size); + using MemcpyFunc = atmi_status_t (*)(hsa_signal_t, void *, const void *, + size_t size, hsa_agent_t); atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size, - MemcpyFunc Func) { + MemcpyFunc Func, int32_t deviceId) { + hsa_agent_t agent = HSAAgents[deviceId]; hsa_signal_t s = FreeSignalPool.pop(); if (s.handle == 0) { return ATMI_STATUS_ERROR; } - atmi_status_t r = Func(s, dest, src, size); + atmi_status_t r = Func(s, dest, src, size, agent); FreeSignalPool.push(s); return r; } atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src, - size_t size) { - return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h); + size_t size, int32_t deviceId) { + return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId); } atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src, - size_t size) { - return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d); + size_t size, int32_t deviceId) { + return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId); } // Record entry point associated with device @@ -550,7 +551,8 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, (long long unsigned)(Elf64_Addr)TgtPtr, (long long unsigned)(Elf64_Addr)HstPtr); - err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size); + err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size, + DeviceId); if (err != ATMI_STATUS_SUCCESS) { DP("Error when copying data from device to host. Pointers: " @@ -576,7 +578,8 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, (long long unsigned)(Elf64_Addr)HstPtr, (long long unsigned)(Elf64_Addr)TgtPtr); - err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size); + err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size, + DeviceId); if (err != ATMI_STATUS_SUCCESS) { DP("Error when copying data from host to device. Pointers: " "host = 0x%016lx, device = 0x%016lx, size = %lld\n", @@ -1033,7 +1036,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, } // write ptr to device memory so it can be used by later kernels - err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *)); + err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *), + device_id); if (err != ATMI_STATUS_SUCCESS) { fprintf(stderr, "memcpy install of state_ptr failed\n"); return NULL; @@ -1103,7 +1107,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, // can access host addresses directly. There is no longer a // need for device copies. err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, - sizeof(void *)); + sizeof(void *), device_id); if (err != ATMI_STATUS_SUCCESS) DP("Error when copying USM\n"); DP("Copy linked variable host address (" DPxMOD ")" @@ -1532,7 +1536,7 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups, atmi_status_t err = atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id)); err = DeviceInfo.freesignalpool_memcpy_h2d(CallStackAddr, &TgtPtr, - sizeof(void *)); + sizeof(void *), device_id); if (print_kernel_trace > 2) fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n", (long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);