[OpenMP] Increase opportunity for parallel kernel launches on AMD GPUs: add multiple HSA queues per device in the plugin

This patch extends the AMDGPU plugin for OpenMP target offloading to use multiple HSA queues per device (four in this patch) instead of a single one. This allows multiple host threads to submit kernel launches to the same GPU concurrently.
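As a rough illustration (not part of the patch), the host-side pattern this change targets looks like the following: several OpenMP host threads each offload work to the same device, so their kernel submissions can be spread across the device's queues. The chunk count, array size, and kernel body below are made up for the example.

// Hypothetical host program: each host thread submits its own target region
// to the same device; with multiple HSA queues the submissions need not
// serialize behind a single queue.
#include <cstdio>

int main() {
  constexpr int NumChunks = 8; // arbitrary number of independent launches
  constexpr int N = 1 << 20;   // arbitrary per-chunk size
  static float Data[NumChunks][N] = {};

#pragma omp parallel for
  for (int c = 0; c < NumChunks; ++c) {
    // Each iteration runs on a different host thread and launches a kernel.
#pragma omp target teams distribute parallel for map(tofrom : Data[c][0 : N])
    for (int i = 0; i < N; ++i)
      Data[c][i] = Data[c][i] * 2.0f + 1.0f;
  }

  std::printf("Data[0][0] = %f\n", Data[0][0]);
  return 0;
}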

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D115771
Carlo Bertolli 2021-12-15 15:33:17 +00:00
parent 3031fd71b9
commit d83dc4c648
1 changed file with 65 additions and 17 deletions


@@ -349,7 +349,7 @@ struct HSALifetime {
   // cleanup without risking running outside of the lifetime of HSA
   const hsa_status_t S;
-  bool success() { return S == HSA_STATUS_SUCCESS; }
+  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
   HSALifetime() : S(hsa_init()) {}
   ~HSALifetime() {
@@ -363,9 +363,63 @@ struct HSALifetime {
   }
 };
+// Handle scheduling of multiple hsa_queue's per device to
+// multiple threads (one scheduler per device)
+class HSAQueueScheduler {
+public:
+  HSAQueueScheduler() : current(0) {}
+  HSAQueueScheduler(const HSAQueueScheduler &) = delete;
+  HSAQueueScheduler(HSAQueueScheduler &&q) {
+    current = q.current.load();
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      HSAQueues[i] = q.HSAQueues[i];
+      q.HSAQueues[i] = nullptr;
+    }
+  }
+  // \return false if any HSA queue creation fails
+  bool CreateQueues(hsa_agent_t HSAAgent, uint32_t queue_size) {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      hsa_queue_t *Q = nullptr;
+      hsa_status_t rc =
+          hsa_queue_create(HSAAgent, queue_size, HSA_QUEUE_TYPE_MULTI,
+                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
+      if (rc != HSA_STATUS_SUCCESS) {
+        DP("Failed to create HSA queue %d\n", i);
+        return false;
+      }
+      HSAQueues[i] = Q;
+    }
+    return true;
+  }
+  ~HSAQueueScheduler() {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      if (HSAQueues[i]) {
+        hsa_status_t err = hsa_queue_destroy(HSAQueues[i]);
+        if (err != HSA_STATUS_SUCCESS)
+          DP("Error destroying HSA queue");
+      }
+    }
+  }
+  // \return next queue to use for device
+  hsa_queue_t *Next() {
+    return HSAQueues[(current.fetch_add(1, std::memory_order_relaxed)) %
+                     NUM_QUEUES_PER_DEVICE];
+  }
+private:
+  // Number of queues per device
+  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
+  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
+  std::atomic<uint8_t> current;
+};
 /// Class containing all the device information
-class RTLDeviceInfoTy {
-  HSALifetime HSA; // First field => constructed first and destructed last
+class RTLDeviceInfoTy : HSALifetime {
   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
   struct QueueDeleter {
@@ -390,8 +444,7 @@ public:
   // GPU devices
   std::vector<hsa_agent_t> HSAAgents;
-  std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>
-      HSAQueues; // one per gpu
+  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
   // CPUs
   std::vector<hsa_agent_t> CPUAgents;
@@ -658,7 +711,7 @@ public:
     // 1 => tracing dispatch only
     // >1 => verbosity increase
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
      return;
    }
@@ -697,7 +750,7 @@ public:
     }
     // Init the device info
-    HSAQueues.resize(NumberOfDevices);
+    HSAQueueSchedulers.reserve(NumberOfDevices);
     FuncGblEntries.resize(NumberOfDevices);
     ThreadsPerGroup.resize(NumberOfDevices);
     ComputeUnits.resize(NumberOfDevices);
@@ -740,15 +793,10 @@ public:
       }
       {
-        hsa_queue_t *Q = nullptr;
-        hsa_status_t rc =
-            hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
-                             callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
-        if (rc != HSA_STATUS_SUCCESS) {
-          DP("Failed to create HSA queue %d\n", i);
+        HSAQueueScheduler QSched;
+        if (!QSched.CreateQueues(HSAAgents[i], queue_size))
           return;
-        }
-        HSAQueues[i].reset(Q);
+        HSAQueueSchedulers.emplace_back(std::move(QSched));
       }
       deviceStateStore[i] = {nullptr, 0};
@@ -776,7 +824,7 @@ public:
   ~RTLDeviceInfoTy() {
     DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
       // Then none of these can have been set up and they can't be torn down
       return;
     }
@@ -1113,7 +1161,7 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
   // Run on the device.
   {
-    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
+    hsa_queue_t *queue = DeviceInfo.HSAQueueSchedulers[device_id].Next();
     if (!queue) {
       return OFFLOAD_FAIL;
     }
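For reference, the selection logic in HSAQueueScheduler::Next() is a plain round-robin over a fixed-size array: an atomic counter is bumped with relaxed ordering and taken modulo the queue count. Below is a standalone sketch of that pattern; the names Slot, Slots, Current, and pick are illustrative only and do not appear in the patch.

#include <atomic>
#include <cstdint>
#include <cstdio>

constexpr uint8_t NumSlots = 4; // mirrors NUM_QUEUES_PER_DEVICE in the patch

struct Slot {
  int Id;
};

Slot Slots[NumSlots] = {{0}, {1}, {2}, {3}};
std::atomic<uint8_t> Current{0};

// Relaxed ordering suffices: the counter only spreads callers across slots,
// it does not order any other memory operations. Because NumSlots divides
// 256, the uint8_t wrap-around keeps the rotation even.
Slot *pick() {
  return &Slots[Current.fetch_add(1, std::memory_order_relaxed) % NumSlots];
}

int main() {
  for (int i = 0; i < 6; ++i)
    std::printf("launch %d -> slot %d\n", i, pick()->Id);
  return 0;
}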