[OpenMP] Increase opportunity for parallel kernel launch in AMDGPUs: add multiple HSA queues per device in plugin

This patch extends the AMDGPU plugin for OpenMP target offloading from a single HSA queue to multiple queues (four in this patch) per device. This allows concurrent threads to submit kernel launches to the same GPU simultaneously.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D115771
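The core of the approach is a small atomic round-robin: each submission bumps a counter and takes the next queue in a fixed pool, so concurrent host threads usually land on different queues. Below is a minimal standalone sketch of that scheme outside any HSA context; RoundRobin and next() are illustrative names for this sketch only, not API from the patch.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

struct RoundRobin {
  static constexpr uint8_t N = 4; // mirrors NUM_QUEUES_PER_DEVICE below
  std::atomic<uint8_t> Current{0};
  // Relaxed ordering suffices: the counter only distributes work across
  // slots, it does not synchronize memory between threads.
  unsigned next() {
    return Current.fetch_add(1, std::memory_order_relaxed) % N;
  }
};

int main() {
  RoundRobin RR;
  std::vector<std::thread> Threads;
  // Four concurrent "submitters", each handed a slot by the scheduler.
  for (int t = 0; t < 4; ++t)
    Threads.emplace_back([&] { std::printf("slot %u\n", RR.next()); });
  for (auto &T : Threads)
    T.join();
}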
commit d83dc4c648
parent 3031fd71b9
@@ -349,7 +349,7 @@ struct HSALifetime {
   // cleanup without risking running outside of the lifetime of HSA
   const hsa_status_t S;
 
-  bool success() { return S == HSA_STATUS_SUCCESS; }
+  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
   HSALifetime() : S(hsa_init()) {}
 
   ~HSALifetime() {
@@ -363,9 +363,63 @@ struct HSALifetime {
   }
 };
 
+// Handle scheduling of multiple hsa_queue's per device to
+// multiple threads (one scheduler per device)
+class HSAQueueScheduler {
+public:
+  HSAQueueScheduler() : current(0) {}
+
+  HSAQueueScheduler(const HSAQueueScheduler &) = delete;
+
+  HSAQueueScheduler(HSAQueueScheduler &&q) {
+    current = q.current.load();
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      HSAQueues[i] = q.HSAQueues[i];
+      q.HSAQueues[i] = nullptr;
+    }
+  }
+
+  // \return false if any HSA queue creation fails
+  bool CreateQueues(hsa_agent_t HSAAgent, uint32_t queue_size) {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      hsa_queue_t *Q = nullptr;
+      hsa_status_t rc =
+          hsa_queue_create(HSAAgent, queue_size, HSA_QUEUE_TYPE_MULTI,
+                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
+      if (rc != HSA_STATUS_SUCCESS) {
+        DP("Failed to create HSA queue %d\n", i);
+        return false;
+      }
+      HSAQueues[i] = Q;
+    }
+    return true;
+  }
+
+  ~HSAQueueScheduler() {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      if (HSAQueues[i]) {
+        hsa_status_t err = hsa_queue_destroy(HSAQueues[i]);
+        if (err != HSA_STATUS_SUCCESS)
+          DP("Error destroying HSA queue");
+      }
+    }
+  }
+
+  // \return next queue to use for device
+  hsa_queue_t *Next() {
+    return HSAQueues[(current.fetch_add(1, std::memory_order_relaxed)) %
+                     NUM_QUEUES_PER_DEVICE];
+  }
+
+private:
+  // Number of queues per device
+  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
+  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
+  std::atomic<uint8_t> current;
+};
+
 /// Class containing all the device information
-class RTLDeviceInfoTy {
-  HSALifetime HSA; // First field => constructed first and destructed last
+class RTLDeviceInfoTy : HSALifetime {
   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
 
   struct QueueDeleter {
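One design note on the scheduler above: Next() uses std::memory_order_relaxed because the counter only distributes submissions across queues; it does not need to order memory between threads. The uint8_t counter also wraps at 256 without skewing the rotation, since 256 is a multiple of NUM_QUEUES_PER_DEVICE. A one-line compile-time check along those lines (illustrative, not part of the patch):

// The wrap of the uint8_t counter preserves a uniform 4-queue rotation
// exactly because 256 % 4 == 0.
static_assert(256 % 4 == 0,
              "uint8_t counter wrap keeps the 4-queue round-robin uniform");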
@@ -390,8 +444,7 @@ public:
 
   // GPU devices
   std::vector<hsa_agent_t> HSAAgents;
-  std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>
-      HSAQueues; // one per gpu
+  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
 
   // CPUs
   std::vector<hsa_agent_t> CPUAgents;
@@ -658,7 +711,7 @@ public:
    // 1 => tracing dispatch only
    // >1 => verbosity increase
 
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
      return;
    }
@@ -697,7 +750,7 @@ public:
    }
 
    // Init the device info
-    HSAQueues.resize(NumberOfDevices);
+    HSAQueueSchedulers.reserve(NumberOfDevices);
    FuncGblEntries.resize(NumberOfDevices);
    ThreadsPerGroup.resize(NumberOfDevices);
    ComputeUnits.resize(NumberOfDevices);
@@ -740,15 +793,10 @@ public:
      }
 
      {
-        hsa_queue_t *Q = nullptr;
-        hsa_status_t rc =
-            hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
-                             callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
-        if (rc != HSA_STATUS_SUCCESS) {
-          DP("Failed to create HSA queue %d\n", i);
+        HSAQueueScheduler QSched;
+        if (!QSched.CreateQueues(HSAAgents[i], queue_size))
          return;
-        }
-        HSAQueues[i].reset(Q);
+        HSAQueueSchedulers.emplace_back(std::move(QSched));
      }
 
      deviceStateStore[i] = {nullptr, 0};
@@ -776,7 +824,7 @@ public:
 
  ~RTLDeviceInfoTy() {
    DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
      // Then none of these can have been set up and they can't be torn down
      return;
    }
@@ -1113,7 +1161,7 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
 
  // Run on the device.
  {
-    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
+    hsa_queue_t *queue = DeviceInfo.HSAQueueSchedulers[device_id].Next();
    if (!queue) {
      return OFFLOAD_FAIL;
    }
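The hunk above is where the change pays off: each kernel launch now pulls its queue from the per-device scheduler instead of the single shared queue. A hypothetical host program of the shape this patch targets (not from the patch itself): several OpenMP host threads offload to one GPU at once, and their dispatch packets can now go to distinct HSA queues.

#include <omp.h>

int main() {
  // Four host threads, each submitting its own target region; with this
  // patch, each submission is handed the next queue by Next().
#pragma omp parallel num_threads(4)
  {
#pragma omp target
    {
      // device kernel body
    }
  }
  return 0;
}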