forked from OSchip/llvm-project
[Libomptarget] Deinitialize AMDGPU global state more intentionally
A previous patch made the destruction of the HSA plugin more deterministic. However, there were still other global values that are not handled this way. When attempting to call a destructor kernel, the device would have already been uninitialized and we could not find the appropriate kernel to call. This is because they were stored in global containers that had their destructors called already. Merges this global state into the rest of the info state by putting those global values inside of the global pointer already allocated and deallocated by the constructor and destructor. This should allow the AMDGPU plugin to correctly identify the destructors if we were to run them. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D131011
This commit is contained in:
parent
9cf6511bff
commit
2b7203a359
|
@ -210,9 +210,6 @@ private:
|
||||||
};
|
};
|
||||||
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
|
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
|
|
||||||
KernelArgPoolMap;
|
|
||||||
|
|
||||||
/// Use a single entity to encode a kernel and a set of flags
|
/// Use a single entity to encode a kernel and a set of flags
|
||||||
struct KernelTy {
|
struct KernelTy {
|
||||||
llvm::omp::OMPTgtExecModeFlags ExecutionMode;
|
llvm::omp::OMPTgtExecModeFlags ExecutionMode;
|
||||||
|
@ -224,7 +221,9 @@ struct KernelTy {
|
||||||
KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
|
KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
|
||||||
int32_t DeviceId, void *CallStackAddr, const char *Name,
|
int32_t DeviceId, void *CallStackAddr, const char *Name,
|
||||||
uint32_t KernargSegmentSize,
|
uint32_t KernargSegmentSize,
|
||||||
hsa_amd_memory_pool_t &KernArgMemoryPool)
|
hsa_amd_memory_pool_t &KernArgMemoryPool,
|
||||||
|
std::unordered_map<std::string, std::unique_ptr<KernelArgPool>>
|
||||||
|
&KernelArgPoolMap)
|
||||||
: ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
|
: ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
|
||||||
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
|
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
|
||||||
DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
|
DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
|
||||||
|
@ -238,10 +237,6 @@ struct KernelTy {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/// List that contains all the kernels.
|
|
||||||
/// FIXME: we may need this to be per device and per library.
|
|
||||||
std::list<KernelTy> KernelsList;
|
|
||||||
|
|
||||||
template <typename Callback> static hsa_status_t findAgents(Callback CB) {
|
template <typename Callback> static hsa_status_t findAgents(Callback CB) {
|
||||||
|
|
||||||
hsa_status_t Err =
|
hsa_status_t Err =
|
||||||
|
@ -456,6 +451,12 @@ public:
|
||||||
|
|
||||||
int NumberOfDevices = 0;
|
int NumberOfDevices = 0;
|
||||||
|
|
||||||
|
/// List that contains all the kernels.
|
||||||
|
/// FIXME: we may need this to be per device and per library.
|
||||||
|
std::list<KernelTy> KernelsList;
|
||||||
|
std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
|
||||||
|
KernelArgPoolMap;
|
||||||
|
|
||||||
// GPU devices
|
// GPU devices
|
||||||
std::vector<hsa_agent_t> HSAAgents;
|
std::vector<hsa_agent_t> HSAAgents;
|
||||||
std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
|
std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
|
||||||
|
@ -857,7 +858,6 @@ public:
|
||||||
"Unexpected device id!");
|
"Unexpected device id!");
|
||||||
FuncGblEntries[DeviceId].emplace_back();
|
FuncGblEntries[DeviceId].emplace_back();
|
||||||
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
|
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
|
||||||
// KernelArgPoolMap.clear();
|
|
||||||
E.Entries.clear();
|
E.Entries.clear();
|
||||||
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
|
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
|
||||||
}
|
}
|
||||||
|
@ -1116,19 +1116,6 @@ pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
static RTLDeviceInfoTy *DeviceInfoState = nullptr;
|
static RTLDeviceInfoTy *DeviceInfoState = nullptr;
|
||||||
static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; }
|
static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; }
|
||||||
|
|
||||||
int32_t __tgt_rtl_init_plugin() {
|
|
||||||
DeviceInfoState = new RTLDeviceInfoTy;
|
|
||||||
return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
|
|
||||||
? OFFLOAD_SUCCESS
|
|
||||||
: OFFLOAD_FAIL;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t __tgt_rtl_deinit_plugin() {
|
|
||||||
if (DeviceInfoState)
|
|
||||||
delete DeviceInfoState;
|
|
||||||
return OFFLOAD_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
|
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
|
||||||
|
@ -1171,7 +1158,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
|
||||||
(long long unsigned)(Elf64_Addr)HstPtr,
|
(long long unsigned)(Elf64_Addr)HstPtr,
|
||||||
(long long unsigned)(Elf64_Addr)TgtPtr);
|
(long long unsigned)(Elf64_Addr)TgtPtr);
|
||||||
Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
|
Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
|
||||||
DeviceId);
|
DeviceId);
|
||||||
if (Err != HSA_STATUS_SUCCESS) {
|
if (Err != HSA_STATUS_SUCCESS) {
|
||||||
DP("Error when copying data from host to device. Pointers: "
|
DP("Error when copying data from host to device. Pointers: "
|
||||||
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
|
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
|
||||||
|
@ -1468,8 +1455,9 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
|
||||||
KernelArgPool *ArgPool = nullptr;
|
KernelArgPool *ArgPool = nullptr;
|
||||||
void *KernArg = nullptr;
|
void *KernArg = nullptr;
|
||||||
{
|
{
|
||||||
auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
|
auto It =
|
||||||
if (It != KernelArgPoolMap.end()) {
|
DeviceInfo().KernelArgPoolMap.find(std::string(KernelInfo->Name));
|
||||||
|
if (It != DeviceInfo().KernelArgPoolMap.end()) {
|
||||||
ArgPool = (It->second).get();
|
ArgPool = (It->second).get();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2031,6 +2019,20 @@ bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
|
int32_t __tgt_rtl_init_plugin() {
|
||||||
|
DeviceInfoState = new RTLDeviceInfoTy;
|
||||||
|
return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
|
||||||
|
? OFFLOAD_SUCCESS
|
||||||
|
: OFFLOAD_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t __tgt_rtl_deinit_plugin() {
|
||||||
|
if (DeviceInfoState)
|
||||||
|
delete DeviceInfoState;
|
||||||
|
return OFFLOAD_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
|
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
|
||||||
return elfMachineIdIsAmdgcn(Image);
|
return elfMachineIdIsAmdgcn(Image);
|
||||||
}
|
}
|
||||||
|
@ -2373,8 +2375,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
|
||||||
}
|
}
|
||||||
|
|
||||||
// write ptr to device memory so it can be used by later kernels
|
// write ptr to device memory so it can be used by later kernels
|
||||||
Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
|
Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
|
||||||
DeviceId);
|
sizeof(void *), DeviceId);
|
||||||
if (Err != HSA_STATUS_SUCCESS) {
|
if (Err != HSA_STATUS_SUCCESS) {
|
||||||
DP("memcpy install of state_ptr failed\n");
|
DP("memcpy install of state_ptr failed\n");
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -2437,8 +2439,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
|
||||||
// If unified memory is present any target link variables
|
// If unified memory is present any target link variables
|
||||||
// can access host addresses directly. There is no longer a
|
// can access host addresses directly. There is no longer a
|
||||||
// need for device copies.
|
// need for device copies.
|
||||||
Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
|
Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *),
|
||||||
sizeof(void *), DeviceId);
|
DeviceId);
|
||||||
if (Err != HSA_STATUS_SUCCESS)
|
if (Err != HSA_STATUS_SUCCESS)
|
||||||
DP("Error when copying USM\n");
|
DP("Error when copying USM\n");
|
||||||
DP("Copy linked variable host address (" DPxMOD ")"
|
DP("Copy linked variable host address (" DPxMOD ")"
|
||||||
|
@ -2598,11 +2600,12 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
|
||||||
}
|
}
|
||||||
check("Loading computation property", Err);
|
check("Loading computation property", Err);
|
||||||
|
|
||||||
KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
|
DeviceInfo().KernelsList.push_back(
|
||||||
CallStackAddr, E->name, KernargSegmentSize,
|
KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name,
|
||||||
DeviceInfo().KernArgPool));
|
KernargSegmentSize, DeviceInfo().KernArgPool,
|
||||||
|
DeviceInfo().KernelArgPoolMap));
|
||||||
__tgt_offload_entry Entry = *E;
|
__tgt_offload_entry Entry = *E;
|
||||||
Entry.addr = (void *)&KernelsList.back();
|
Entry.addr = (void *)&DeviceInfo().KernelsList.back();
|
||||||
DeviceInfo().addOffloadEntry(DeviceId, Entry);
|
DeviceInfo().addOffloadEntry(DeviceId, Entry);
|
||||||
DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
|
DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue