[OpenMP] Increase opportunity for parallel kernel launches on AMD GPUs: add multiple HSA queues per device in the plugin

This patch extends the AMDGPU plugin for OpenMP target offloading to use multiple HSA queues per device (four in this patch) instead of a single one. This allows multiple host threads to submit kernel launches to the same GPU concurrently.
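As a rough illustration (not part of the patch), the host-side pattern this change targets looks like the following: several OpenMP host threads each offload work to the same device, so their kernel submissions can be spread across the device's queues. The chunk count, array size, and kernel body below are made up for the example.

// Hypothetical host program: each host thread submits its own target region
// to the same device; with multiple HSA queues the submissions need not
// serialize behind a single queue.
#include <cstdio>

int main() {
  constexpr int NumChunks = 8; // arbitrary number of independent launches
  constexpr int N = 1 << 20;   // arbitrary per-chunk size
  static float Data[NumChunks][N] = {};

#pragma omp parallel for
  for (int c = 0; c < NumChunks; ++c) {
    // Each iteration runs on a different host thread and launches a kernel.
#pragma omp target teams distribute parallel for map(tofrom : Data[c][0 : N])
    for (int i = 0; i < N; ++i)
      Data[c][i] = Data[c][i] * 2.0f + 1.0f;
  }

  std::printf("Data[0][0] = %f\n", Data[0][0]);
  return 0;
}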

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D115771
Carlo Bertolli 2021-12-15 15:33:17 +00:00
parent 3031fd71b9
commit d83dc4c648
1 changed file with 65 additions and 17 deletions


@@ -349,7 +349,7 @@ struct HSALifetime {
   // cleanup without risking running outside of the lifetime of HSA
   const hsa_status_t S;
-  bool success() { return S == HSA_STATUS_SUCCESS; }
+  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
   HSALifetime() : S(hsa_init()) {}
   ~HSALifetime() {
@@ -363,9 +363,63 @@ struct HSALifetime {
   }
 };
+// Handle scheduling of multiple hsa_queue's per device to
+// multiple threads (one scheduler per device)
+class HSAQueueScheduler {
+public:
+  HSAQueueScheduler() : current(0) {}
+  HSAQueueScheduler(const HSAQueueScheduler &) = delete;
+  HSAQueueScheduler(HSAQueueScheduler &&q) {
+    current = q.current.load();
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      HSAQueues[i] = q.HSAQueues[i];
+      q.HSAQueues[i] = nullptr;
+    }
+  }
+  // \return false if any HSA queue creation fails
+  bool CreateQueues(hsa_agent_t HSAAgent, uint32_t queue_size) {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      hsa_queue_t *Q = nullptr;
+      hsa_status_t rc =
+          hsa_queue_create(HSAAgent, queue_size, HSA_QUEUE_TYPE_MULTI,
+                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
+      if (rc != HSA_STATUS_SUCCESS) {
+        DP("Failed to create HSA queue %d\n", i);
+        return false;
+      }
+      HSAQueues[i] = Q;
+    }
+    return true;
+  }
+  ~HSAQueueScheduler() {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      if (HSAQueues[i]) {
+        hsa_status_t err = hsa_queue_destroy(HSAQueues[i]);
+        if (err != HSA_STATUS_SUCCESS)
+          DP("Error destroying HSA queue");
+      }
+    }
+  }
+  // \return next queue to use for device
+  hsa_queue_t *Next() {
+    return HSAQueues[(current.fetch_add(1, std::memory_order_relaxed)) %
+                     NUM_QUEUES_PER_DEVICE];
+  }
+private:
+  // Number of queues per device
+  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
+  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
+  std::atomic<uint8_t> current;
+};
 /// Class containing all the device information
-class RTLDeviceInfoTy {
-  HSALifetime HSA; // First field => constructed first and destructed last
+class RTLDeviceInfoTy : HSALifetime {
   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
   struct QueueDeleter {
@@ -390,8 +444,7 @@ public:
   // GPU devices
   std::vector<hsa_agent_t> HSAAgents;
-  std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>
-      HSAQueues; // one per gpu
+  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
   // CPUs
   std::vector<hsa_agent_t> CPUAgents;
@@ -658,7 +711,7 @@ public:
     // 1 => tracing dispatch only
     // >1 => verbosity increase
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
      return;
    }
@@ -697,7 +750,7 @@ public:
     }
     // Init the device info
-    HSAQueues.resize(NumberOfDevices);
+    HSAQueueSchedulers.reserve(NumberOfDevices);
     FuncGblEntries.resize(NumberOfDevices);
     ThreadsPerGroup.resize(NumberOfDevices);
     ComputeUnits.resize(NumberOfDevices);
@@ -740,15 +793,10 @@ public:
       }
       {
-        hsa_queue_t *Q = nullptr;
-        hsa_status_t rc =
-            hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
-                             callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
-        if (rc != HSA_STATUS_SUCCESS) {
-          DP("Failed to create HSA queue %d\n", i);
+        HSAQueueScheduler QSched;
+        if (!QSched.CreateQueues(HSAAgents[i], queue_size))
           return;
-        }
-        HSAQueues[i].reset(Q);
+        HSAQueueSchedulers.emplace_back(std::move(QSched));
       }
       deviceStateStore[i] = {nullptr, 0};
@@ -776,7 +824,7 @@ public:
   ~RTLDeviceInfoTy() {
     DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
       // Then none of these can have been set up and they can't be torn down
       return;
     }
@@ -1113,7 +1161,7 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
   // Run on the device.
   {
-    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
+    hsa_queue_t *queue = DeviceInfo.HSAQueueSchedulers[device_id].Next();
     if (!queue) {
       return OFFLOAD_FAIL;
     }
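For reference, the selection logic in HSAQueueScheduler::Next() is a plain round-robin over a fixed-size array: an atomic counter is bumped with relaxed ordering and taken modulo the queue count. Below is a standalone sketch of that pattern; the names Slot, Slots, Current, and pick are illustrative only and do not appear in the patch.

#include <atomic>
#include <cstdint>
#include <cstdio>

constexpr uint8_t NumSlots = 4; // mirrors NUM_QUEUES_PER_DEVICE in the patch

struct Slot {
  int Id;
};

Slot Slots[NumSlots] = {{0}, {1}, {2}, {3}};
std::atomic<uint8_t> Current{0};

// Relaxed ordering suffices: the counter only spreads callers across slots,
// it does not order any other memory operations. Because NumSlots divides
// 256, the uint8_t wrap-around keeps the rotation even.
Slot *pick() {
  return &Slots[Current.fetch_add(1, std::memory_order_relaxed) % NumSlots];
}

int main() {
  for (int i = 0; i < 6; ++i)
    std::printf("launch %d -> slot %d\n", i, pick()->Id);
  return 0;
}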