forked from OSchip/llvm-project
[libomptarget][amdgcn] Refactor memcpy to eliminate maps
[libomptarget][amdgcn] Refactor memcpy to eliminate maps Builds on D89776 to remove now dead code. Reviewed By: pdhaliwal Differential Revision: https://reviews.llvm.org/D89888
This commit is contained in:
parent
d900b755ed
commit
55dc123555
|
@ -3,7 +3,13 @@
|
|||
*
|
||||
* This file is distributed under the MIT License. See LICENSE.txt for details.
|
||||
*===------------------------------------------------------------------------*/
|
||||
#include "atmi_runtime.h"
|
||||
#include "internal.h"
|
||||
#include "rt.h"
|
||||
#include <hsa.h>
|
||||
#include <hsa_ext_amd.h>
|
||||
#include <memory>
|
||||
|
||||
/*
|
||||
* Initialize/Finalize
|
||||
*/
|
||||
|
@ -33,9 +39,44 @@ atmi_status_t atmi_module_register_from_memory_to_place(
|
|||
/*
|
||||
* Data
|
||||
*/
|
||||
atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
|
||||
size_t size) {
|
||||
hsa_status_t rc = hsa_memory_copy(dest, src, size);
|
||||
|
||||
static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest,
|
||||
const void *src, size_t size,
|
||||
hsa_agent_t agent) {
|
||||
const hsa_signal_value_t init = 1;
|
||||
const hsa_signal_value_t success = 0;
|
||||
hsa_signal_store_screlease(sig, init);
|
||||
|
||||
hsa_status_t err =
|
||||
hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
return err;
|
||||
}
|
||||
|
||||
// async_copy reports success by decrementing and failure by setting to < 0
|
||||
hsa_signal_value_t got = init;
|
||||
while (got == init) {
|
||||
got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init,
|
||||
UINT64_MAX, ATMI_WAIT_STATE);
|
||||
}
|
||||
|
||||
if (got != success) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
struct atmiFreePtrDeletor {
|
||||
void operator()(void *p) {
|
||||
atmi_free(p); // ignore failure to free
|
||||
}
|
||||
};
|
||||
|
||||
atmi_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
|
||||
const void *hostSrc, size_t size,
|
||||
hsa_agent_t agent) {
|
||||
hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size);
|
||||
|
||||
// hsa_memory_copy sometimes fails in situations where
|
||||
// allocate + copy succeeds. Looks like it might be related to
|
||||
|
@ -44,17 +85,53 @@ atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
|
|||
return ATMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
return core::Runtime::Memcpy(sig, dest, src, size);
|
||||
void *tempHostPtr;
|
||||
atmi_mem_place_t CPU = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
|
||||
atmi_status_t ret = atmi_malloc(&tempHostPtr, size, CPU);
|
||||
if (ret != ATMI_STATUS_SUCCESS) {
|
||||
DEBUG_PRINT("atmi_malloc: Unable to alloc %d bytes for temp scratch\n",
|
||||
size);
|
||||
return ret;
|
||||
}
|
||||
std::unique_ptr<void, atmiFreePtrDeletor> del(tempHostPtr);
|
||||
memcpy(tempHostPtr, hostSrc, size);
|
||||
|
||||
if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
return ATMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
|
||||
const void *host_src, size_t size) {
|
||||
return atmi_memcpy(sig, device_dest, host_src, size);
|
||||
}
|
||||
atmi_status_t atmi_memcpy_d2h(hsa_signal_t signal, void *dest,
|
||||
const void *deviceSrc, size_t size,
|
||||
hsa_agent_t agent) {
|
||||
hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size);
|
||||
|
||||
atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
|
||||
const void *device_src, size_t size) {
|
||||
return atmi_memcpy(sig, host_dest, device_src, size);
|
||||
// hsa_memory_copy sometimes fails in situations where
|
||||
// allocate + copy succeeds. Looks like it might be related to
|
||||
// locking part of a read only segment. Fall back for now.
|
||||
if (rc == HSA_STATUS_SUCCESS) {
|
||||
return ATMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void *tempHostPtr;
|
||||
atmi_mem_place_t CPU = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
|
||||
atmi_status_t ret = atmi_malloc(&tempHostPtr, size, CPU);
|
||||
if (ret != ATMI_STATUS_SUCCESS) {
|
||||
DEBUG_PRINT("atmi_malloc: Unable to alloc %d bytes for temp scratch\n",
|
||||
size);
|
||||
return ret;
|
||||
}
|
||||
std::unique_ptr<void, atmiFreePtrDeletor> del(tempHostPtr);
|
||||
|
||||
if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
|
||||
memcpy(dest, tempHostPtr, size);
|
||||
return ATMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }
|
||||
|
|
|
@ -155,11 +155,13 @@ atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place);
|
|||
*/
|
||||
atmi_status_t atmi_free(void *ptr);
|
||||
|
||||
atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
|
||||
const void *host_src, size_t size);
|
||||
atmi_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
|
||||
const void *hostSrc, size_t size,
|
||||
hsa_agent_t agent);
|
||||
|
||||
atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
|
||||
const void *device_src, size_t size);
|
||||
atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *hostDest,
|
||||
const void *deviceSrc, size_t size,
|
||||
hsa_agent_t agent);
|
||||
|
||||
/** @} */
|
||||
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
*
|
||||
* This file is distributed under the MIT License. See LICENSE.txt for details.
|
||||
*===------------------------------------------------------------------------*/
|
||||
#include "data.h"
|
||||
#include "atmi_runtime.h"
|
||||
#include "internal.h"
|
||||
#include "machine.h"
|
||||
|
@ -21,7 +20,6 @@ using core::TaskImpl;
|
|||
extern ATLMachine g_atl_machine;
|
||||
|
||||
namespace core {
|
||||
ATLPointerTracker g_data_map; // Track all am pointer allocations.
|
||||
void allow_access_to_all_gpu_agents(void *ptr);
|
||||
|
||||
const char *getPlaceStr(atmi_devtype_t type) {
|
||||
|
@ -35,37 +33,6 @@ const char *getPlaceStr(atmi_devtype_t type) {
|
|||
}
|
||||
}
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const ATLData *ap) {
|
||||
atmi_mem_place_t place = ap->place();
|
||||
os << " devicePointer:" << ap->ptr() << " sizeBytes:" << ap->size()
|
||||
<< " place:(" << getPlaceStr(place.dev_type) << ", " << place.dev_id
|
||||
<< ", " << place.mem_id << ")";
|
||||
return os;
|
||||
}
|
||||
|
||||
void ATLPointerTracker::insert(void *pointer, ATLData *p) {
|
||||
std::lock_guard<std::mutex> l(mutex_);
|
||||
|
||||
DEBUG_PRINT("insert: %p + %zu\n", pointer, p->size());
|
||||
tracker_.insert(std::make_pair(ATLMemoryRange(pointer, p->size()), p));
|
||||
}
|
||||
|
||||
void ATLPointerTracker::remove(void *pointer) {
|
||||
std::lock_guard<std::mutex> l(mutex_);
|
||||
DEBUG_PRINT("remove: %p\n", pointer);
|
||||
tracker_.erase(ATLMemoryRange(pointer, 1));
|
||||
}
|
||||
|
||||
ATLData *ATLPointerTracker::find(const void *pointer) {
|
||||
std::lock_guard<std::mutex> l(mutex_);
|
||||
ATLData *ret = NULL;
|
||||
auto iter = tracker_.find(ATLMemoryRange(pointer, 1));
|
||||
DEBUG_PRINT("find: %p\n", pointer);
|
||||
if (iter != tracker_.end()) // found
|
||||
ret = iter->second;
|
||||
return ret;
|
||||
}
|
||||
|
||||
ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) {
|
||||
int dev_id = place.dev_id;
|
||||
switch (place.dev_type) {
|
||||
|
@ -76,18 +43,12 @@ ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) {
|
|||
}
|
||||
}
|
||||
|
||||
static hsa_agent_t get_mem_agent(atmi_mem_place_t place) {
|
||||
return get_processor_by_mem_place(place).agent();
|
||||
}
|
||||
|
||||
hsa_amd_memory_pool_t get_memory_pool_by_mem_place(atmi_mem_place_t place) {
|
||||
ATLProcessor &proc = get_processor_by_mem_place(place);
|
||||
return get_memory_pool(proc, place.mem_id);
|
||||
}
|
||||
|
||||
void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) {
|
||||
ATLData *data = new ATLData(ptr, size, place);
|
||||
g_data_map.insert(ptr, data);
|
||||
if (place.dev_type == ATMI_DEVTYPE_CPU)
|
||||
allow_access_to_all_gpu_agents(ptr);
|
||||
// TODO(ashwinma): what if one GPU wants to access another GPU?
|
||||
|
@ -112,103 +73,13 @@ atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) {
|
|||
atmi_status_t Runtime::Memfree(void *ptr) {
|
||||
atmi_status_t ret = ATMI_STATUS_SUCCESS;
|
||||
hsa_status_t err;
|
||||
ATLData *data = g_data_map.find(ptr);
|
||||
if (!data)
|
||||
ErrorCheck(Checking pointer info userData,
|
||||
HSA_STATUS_ERROR_INVALID_ALLOCATION);
|
||||
|
||||
g_data_map.remove(ptr);
|
||||
delete data;
|
||||
|
||||
err = hsa_amd_memory_pool_free(ptr);
|
||||
ErrorCheck(atmi_free, err);
|
||||
DEBUG_PRINT("Freed %p\n", ptr);
|
||||
|
||||
if (err != HSA_STATUS_SUCCESS || !data)
|
||||
if (err != HSA_STATUS_SUCCESS)
|
||||
ret = ATMI_STATUS_ERROR;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest,
|
||||
const void *src, size_t size,
|
||||
hsa_agent_t agent) {
|
||||
const hsa_signal_value_t init = 1;
|
||||
const hsa_signal_value_t success = 0;
|
||||
hsa_signal_store_screlease(sig, init);
|
||||
|
||||
hsa_status_t err =
|
||||
hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
return err;
|
||||
}
|
||||
|
||||
// async_copy reports success by decrementing and failure by setting to < 0
|
||||
hsa_signal_value_t got = init;
|
||||
while (got == init) {
|
||||
got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init,
|
||||
UINT64_MAX, ATMI_WAIT_STATE);
|
||||
}
|
||||
|
||||
if (got != success) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
struct atmiFreePtrDeletor {
|
||||
void operator()(void *p) {
|
||||
atmi_free(p); // ignore failure to free
|
||||
}
|
||||
};
|
||||
|
||||
atmi_status_t Runtime::Memcpy(hsa_signal_t sig, void *dest, const void *src,
|
||||
size_t size) {
|
||||
ATLData *src_data = g_data_map.find(src);
|
||||
ATLData *dest_data = g_data_map.find(dest);
|
||||
atmi_mem_place_t cpu = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
|
||||
|
||||
void *temp_host_ptr;
|
||||
atmi_status_t ret = atmi_malloc(&temp_host_ptr, size, cpu);
|
||||
if (ret != ATMI_STATUS_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
std::unique_ptr<void, atmiFreePtrDeletor> del(temp_host_ptr);
|
||||
|
||||
if (src_data && !dest_data) {
|
||||
// Copy from device to scratch to host
|
||||
hsa_agent_t agent = get_mem_agent(src_data->place());
|
||||
DEBUG_PRINT("Memcpy D2H device agent: %lu\n", agent.handle);
|
||||
|
||||
if (invoke_hsa_copy(sig, temp_host_ptr, src, size, agent) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
|
||||
memcpy(dest, temp_host_ptr, size);
|
||||
|
||||
} else if (!src_data && dest_data) {
|
||||
// Copy from host to scratch to device
|
||||
hsa_agent_t agent = get_mem_agent(dest_data->place());
|
||||
DEBUG_PRINT("Memcpy H2D device agent: %lu\n", agent.handle);
|
||||
|
||||
memcpy(temp_host_ptr, src, size);
|
||||
|
||||
if (invoke_hsa_copy(sig, dest, temp_host_ptr, size, agent) !=
|
||||
HSA_STATUS_SUCCESS) {
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
|
||||
} else if (!src_data && !dest_data) {
|
||||
// would be host to host, just call memcpy, or missing metadata
|
||||
DEBUG_PRINT("atmi_memcpy invoked without metadata\n");
|
||||
return ATMI_STATUS_ERROR;
|
||||
} else {
|
||||
DEBUG_PRINT("atmi_memcpy unimplemented device to device copy\n");
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return ATMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
} // namespace core
|
||||
|
|
|
@ -1,83 +0,0 @@
|
|||
/*===--------------------------------------------------------------------------
|
||||
* ATMI (Asynchronous Task and Memory Interface)
|
||||
*
|
||||
* This file is distributed under the MIT License. See LICENSE.txt for details.
|
||||
*===------------------------------------------------------------------------*/
|
||||
#ifndef SRC_RUNTIME_INCLUDE_DATA_H_
|
||||
#define SRC_RUNTIME_INCLUDE_DATA_H_
|
||||
#include "atmi.h"
|
||||
#include <hsa.h>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
// we maintain our own mapping of device addr to a user specified data object
|
||||
// in order to work around a (possibly historic) bug in ROCr's
|
||||
// hsa_amd_pointer_info_set_userdata for variable symbols
|
||||
// this is expected to be temporary
|
||||
|
||||
namespace core {
|
||||
// Internal representation of any data that is created and managed by ATMI.
|
||||
// Data can be located on any device memory or host memory.
|
||||
class ATLData {
|
||||
public:
|
||||
ATLData(void *ptr, size_t size, atmi_mem_place_t place)
|
||||
: ptr_(ptr), size_(size), place_(place) {}
|
||||
|
||||
void *ptr() const { return ptr_; }
|
||||
size_t size() const { return size_; }
|
||||
atmi_mem_place_t place() const { return place_; }
|
||||
|
||||
private:
|
||||
void *ptr_;
|
||||
size_t size_;
|
||||
atmi_mem_place_t place_;
|
||||
};
|
||||
|
||||
//---
|
||||
struct ATLMemoryRange {
|
||||
const void *base_pointer;
|
||||
const void *end_pointer;
|
||||
ATLMemoryRange(const void *bp, size_t size_bytes)
|
||||
: base_pointer(bp),
|
||||
end_pointer(reinterpret_cast<const unsigned char *>(bp) + size_bytes -
|
||||
1) {}
|
||||
};
|
||||
|
||||
// Functor to compare ranges:
|
||||
struct ATLMemoryRangeCompare {
|
||||
// Return true is LHS range is less than RHS - used to order the ranges
|
||||
bool operator()(const ATLMemoryRange &lhs, const ATLMemoryRange &rhs) const {
|
||||
return lhs.end_pointer < rhs.base_pointer;
|
||||
}
|
||||
};
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
// This structure tracks information for each pointer.
|
||||
// Uses memory-range-based lookups - so pointers that exist anywhere in the
|
||||
// range of hostPtr + size
|
||||
// will find the associated ATLPointerInfo.
|
||||
// The insertions and lookups use a self-balancing binary tree and should
|
||||
// support O(logN) lookup speed.
|
||||
// The structure is thread-safe - writers obtain a mutex before modifying the
|
||||
// tree. Multiple simulatenous readers are supported.
|
||||
class ATLPointerTracker {
|
||||
typedef std::map<ATLMemoryRange, ATLData *, ATLMemoryRangeCompare>
|
||||
MapTrackerType;
|
||||
|
||||
public:
|
||||
void insert(void *pointer, ATLData *data);
|
||||
void remove(void *pointer);
|
||||
ATLData *find(const void *pointer);
|
||||
|
||||
private:
|
||||
MapTrackerType tracker_;
|
||||
std::mutex mutex_;
|
||||
};
|
||||
|
||||
extern ATLPointerTracker g_data_map; // Track all am pointer allocations.
|
||||
|
||||
enum class Direction { ATMI_H2D, ATMI_D2H, ATMI_D2D, ATMI_H2H };
|
||||
|
||||
} // namespace core
|
||||
#endif // SRC_RUNTIME_INCLUDE_DATA_H_
|
|
@ -335,27 +335,28 @@ public:
|
|||
static const int Default_WG_Size =
|
||||
llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
|
||||
|
||||
using MemcpyFunc = atmi_status_t(hsa_signal_t, void *, const void *,
|
||||
size_t size);
|
||||
using MemcpyFunc = atmi_status_t (*)(hsa_signal_t, void *, const void *,
|
||||
size_t size, hsa_agent_t);
|
||||
atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
|
||||
MemcpyFunc Func) {
|
||||
MemcpyFunc Func, int32_t deviceId) {
|
||||
hsa_agent_t agent = HSAAgents[deviceId];
|
||||
hsa_signal_t s = FreeSignalPool.pop();
|
||||
if (s.handle == 0) {
|
||||
return ATMI_STATUS_ERROR;
|
||||
}
|
||||
atmi_status_t r = Func(s, dest, src, size);
|
||||
atmi_status_t r = Func(s, dest, src, size, agent);
|
||||
FreeSignalPool.push(s);
|
||||
return r;
|
||||
}
|
||||
|
||||
atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
|
||||
size_t size) {
|
||||
return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h);
|
||||
size_t size, int32_t deviceId) {
|
||||
return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId);
|
||||
}
|
||||
|
||||
atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
|
||||
size_t size) {
|
||||
return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d);
|
||||
size_t size, int32_t deviceId) {
|
||||
return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId);
|
||||
}
|
||||
|
||||
// Record entry point associated with device
|
||||
|
@ -550,7 +551,8 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
|
|||
(long long unsigned)(Elf64_Addr)TgtPtr,
|
||||
(long long unsigned)(Elf64_Addr)HstPtr);
|
||||
|
||||
err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size);
|
||||
err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size,
|
||||
DeviceId);
|
||||
|
||||
if (err != ATMI_STATUS_SUCCESS) {
|
||||
DP("Error when copying data from device to host. Pointers: "
|
||||
|
@ -576,7 +578,8 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
|
|||
DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
|
||||
(long long unsigned)(Elf64_Addr)HstPtr,
|
||||
(long long unsigned)(Elf64_Addr)TgtPtr);
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size);
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size,
|
||||
DeviceId);
|
||||
if (err != ATMI_STATUS_SUCCESS) {
|
||||
DP("Error when copying data from host to device. Pointers: "
|
||||
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
|
||||
|
@ -1033,7 +1036,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
|
|||
}
|
||||
|
||||
// write ptr to device memory so it can be used by later kernels
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *));
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *),
|
||||
device_id);
|
||||
if (err != ATMI_STATUS_SUCCESS) {
|
||||
fprintf(stderr, "memcpy install of state_ptr failed\n");
|
||||
return NULL;
|
||||
|
@ -1103,7 +1107,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
|
|||
// can access host addresses directly. There is no longer a
|
||||
// need for device copies.
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
|
||||
sizeof(void *));
|
||||
sizeof(void *), device_id);
|
||||
if (err != ATMI_STATUS_SUCCESS)
|
||||
DP("Error when copying USM\n");
|
||||
DP("Copy linked variable host address (" DPxMOD ")"
|
||||
|
@ -1532,7 +1536,7 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
|
|||
atmi_status_t err =
|
||||
atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id));
|
||||
err = DeviceInfo.freesignalpool_memcpy_h2d(CallStackAddr, &TgtPtr,
|
||||
sizeof(void *));
|
||||
sizeof(void *), device_id);
|
||||
if (print_kernel_trace > 2)
|
||||
fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n",
|
||||
(long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);
|
||||
|
|
Loading…
Reference in New Issue