add memory copy module

2020-05-22 18:44:31 +08:00 · 2020-05-22 18:44:31 +08:00 · c5ac2cc38c
parent 4d72cb7227
commit c5ac2cc38c
8 changed files with 329 additions and 23 deletions
--- a/mindspore/ccsrc/device/device_address.h
+++ b/mindspore/ccsrc/device/device_address.h
@ -47,6 +47,8 @@ class GPUMemoryManager;

 namespace mindspore {
 namespace device {
+enum class DeviceAddressStatus { kInDevice, kInHost, kInDeviceToHost, kInHostToDevice };
+
 class DeviceAddress {
 public:
  explicit DeviceAddress(void *ptr, size_t size) : ptr_(ptr), size_(size) {}
@ -60,6 +62,8 @@ class DeviceAddress {
  size_t GetSize() const { return size_; }
  std::string format() const { return format_; }
  TypeId type_id() const { return type_id_; }
+  virtual void set_status(DeviceAddressStatus status) {}
+  virtual DeviceAddressStatus status() const { return DeviceAddressStatus::kInDevice; }

 protected:
  const void *ptr() const { return ptr_; }
--- a/mindspore/ccsrc/device/gpu/cuda_driver.cc
+++ b/mindspore/ccsrc/device/gpu/cuda_driver.cc
@ -15,9 +15,7 @@
 */

 #include "device/gpu/cuda_driver.h"
-
 #include <iostream>
-
 #include "utils/log_adapter.h"
 #include "utils/convert_utils.h"

@ -54,6 +52,27 @@ bool CudaDriver::FreeDeviceMem(const DeviceMemPtr &addr) {
  return true;
 }

+size_t CudaDriver::AllocHostPinnedMem(size_t size, void **addr) {
+  if (size == 0) {
+    MS_LOG(EXCEPTION) << "The memory allocate size is 0";
+  }
+  auto ret = cudaHostAlloc(addr, size, cudaHostAllocDefault);
+  if (ret != cudaSuccess) {
+    MS_LOG(ERROR) << "cudaHostAlloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    return 0;
+  }
+  return size;
+}
+
+void CudaDriver::FreeHostPinnedMem(void *addr) {
+  if (addr) {
+    auto ret = cudaFreeHost(addr);
+    if (ret != cudaSuccess) {
+      MS_LOG(EXCEPTION) << "cudaFreeHost failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    }
+  }
+}
+
 bool CudaDriver::CopyHostMemToDevice(const DeviceMemPtr &dst, const void *src, size_t size) {
  auto ret = cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
  if (ret != cudaSuccess) {
@ -72,6 +91,25 @@ bool CudaDriver::CopyDeviceMemToHost(const HostMemPtr &dst, const DeviceMemPtr &
  return true;
 }

+bool CudaDriver::CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size, DeviceStream stream) {
+  auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, (cudaStream_t)stream);
+  if (ret != cudaSuccess) {
+    MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    return false;
+  }
+  return true;
+}
+
+bool CudaDriver::CopyDeviceMemToHostAsync(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size,
+                                          DeviceStream stream) {
+  auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, (cudaStream_t)stream);
+  if (ret != cudaSuccess) {
+    MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    return false;
+  }
+  return true;
+}
+
 size_t CudaDriver::total_mem_size() {
  size_t free;
  size_t total;
@ -122,6 +160,55 @@ bool CudaDriver::SyncStream(const DeviceStream &stream) {
  return true;
 }

+bool CudaDriver::CreateEvent(DeviceEvent *event, unsigned int flag) {
+  auto ret = cudaEventCreateWithFlags(reinterpret_cast<cudaEvent_t *>(event), flag);
+  if (ret != cudaSuccess) {
+    MS_LOG(ERROR) << "cudaEventCreateWithFlags failed, ret[" << static_cast<int>(ret) << "], "
+                  << cudaGetErrorString(ret);
+    return false;
+  }
+  return true;
+}
+
+bool CudaDriver::DestroyEvent(const DeviceEvent &event) {
+  auto ret = cudaEventDestroy((cudaEvent_t)event);
+  if (ret != cudaSuccess) {
+    MS_LOG(ERROR) << "cudaEventDestroy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    return false;
+  }
+  return true;
+}
+
+bool CudaDriver::RecordEvent(DeviceEvent event, DeviceStream stream) {
+  auto ret = cudaEventRecord((cudaEvent_t)event, (cudaStream_t)stream);
+  if (ret != cudaSuccess) {
+    MS_LOG(ERROR) << "cudaEventRecord failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    return false;
+  }
+  return true;
+}
+
+bool CudaDriver::SyncEvent(const DeviceEvent &event) {
+  auto ret = cudaEventSynchronize((cudaEvent_t)event);
+  if (ret != cudaSuccess) {
+    MS_LOG(ERROR) << "cudaEventSynchronize failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    return false;
+  }
+  return true;
+}
+
+bool CudaDriver::QueryEvent(const DeviceEvent &event) {
+  auto ret = cudaEventQuery((cudaEvent_t)event);
+  if (ret == cudaSuccess) {
+    return true;
+  } else if (ret == cudaErrorNotReady) {
+    return false;
+  } else {
+    MS_LOG(ERROR) << "cudaEventQuery failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
+    return false;
+  }
+}
+
 int CudaDriver::device_count() {
  int dev_count;
  auto ret = cudaGetDeviceCount(&dev_count);
--- a/mindspore/ccsrc/device/gpu/cuda_driver.h
+++ b/mindspore/ccsrc/device/gpu/cuda_driver.h
@ -33,8 +33,16 @@ class CudaDriver {
  // such as malloc/free and memory copy from host to device and reverse.
  static size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr);
  static bool FreeDeviceMem(const DeviceMemPtr &addr);
+  static size_t AllocHostPinnedMem(size_t size, void **addr);
+  static void FreeHostPinnedMem(void *addr);
+
  static bool CopyHostMemToDevice(const DeviceMemPtr &dst, const void *src, size_t size);
  static bool CopyDeviceMemToHost(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size);
+
+  static bool CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size, DeviceStream stream = 0);
+  static bool CopyDeviceMemToHostAsync(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size,
+                                       DeviceStream stream = 0);
+
  static size_t total_mem_size();
  static size_t free_mem_size();

@ -44,6 +52,12 @@ class CudaDriver {
  static bool DestroyStream(const DeviceStream &stream);
  static bool SyncStream(const DeviceStream &stream);

+  static bool CreateEvent(DeviceEvent *event, unsigned int flag = cudaEventDefault);
+  static bool DestroyEvent(const DeviceEvent &event);
+  static bool RecordEvent(DeviceEvent event, DeviceStream stream = 0);
+  static bool SyncEvent(const DeviceEvent &event);
+  static bool QueryEvent(const DeviceEvent &event);
+
  // Encapsulate the cuda APIs associated with device management.
  static int device_count();
  static bool set_current_device(int index);
--- a/mindspore/ccsrc/device/gpu/gpu_device_address.h
+++ b/mindspore/ccsrc/device/gpu/gpu_device_address.h
@ -33,6 +33,11 @@ class GPUDeviceAddress : public DeviceAddress {

  bool SyncDeviceToHost(const std::vector<int> &shape, size_t size, TypeId type, void *host_ptr) const override;
  bool SyncHostToDevice(const std::vector<int> &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void set_status(DeviceAddressStatus status) { status_ = status; }
+  DeviceAddressStatus status() const { return status_; }
+
+ private:
+  DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};
 };
 }  // namespace gpu
 }  // namespace device
--- a/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc
+++ b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc
@ -0,0 +1,131 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/gpu/gpu_memory_copy_manager.h"
+#include "device/gpu/gpu_common.h"
+#include "device/gpu/gpu_device_manager.h"
+#include "session/anf_runtime_algorithm.h"
+
+namespace mindspore {
+namespace device {
+namespace gpu {
+void GPUMemCopyManager::Init() {
+  CHECK_OP_RET_WITH_EXCEPT(GPUDeviceManager::GetInstance().CreateStream(&swap_out_stream_),
+                           "Failed to create CUDA stream of memory swap out.");
+  CHECK_OP_RET_WITH_EXCEPT(GPUDeviceManager::GetInstance().CreateStream(&swap_in_stream_),
+                           "Failed to create CUDA stream of memory swap in.");
+}
+
+void GPUMemCopyManager::AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {
+  MS_EXCEPTION_IF_NULL(device_address);
+  MS_EXCEPTION_IF_NULL(host_addr.addr);
+  DeviceEvent event = nullptr;
+  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&event, cudaEventDisableTiming), "Failed to create CUDA event.");
+  DeviceMemPtr device_ptr = const_cast<DeviceMemPtr>(device_address->GetPtr());
+  MS_EXCEPTION_IF_NULL(device_ptr);
+  device_address->set_status(DeviceAddressStatus::kInDeviceToHost);
+
+  CHECK_OP_RET_WITH_EXCEPT(
+    CudaDriver::CopyDeviceMemToHostAsync(host_addr.addr, device_ptr, host_addr.size, swap_out_stream_),
+    "Failed to copy device memory to host.");
+
+  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(event, swap_out_stream_),
+                           "Failed to record CUDA event to swap out stream.");
+  swap_out_queue_.emplace(device_address, event);
+}
+
+void GPUMemCopyManager::AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {
+  MS_EXCEPTION_IF_NULL(device_address);
+  MS_EXCEPTION_IF_NULL(host_addr.addr);
+  DeviceEvent event = nullptr;
+  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&event, cudaEventDisableTiming), "Failed to create CUDA event.");
+  DeviceMemPtr device_ptr = const_cast<DeviceMemPtr>(device_address->GetPtr());
+  MS_EXCEPTION_IF_NULL(device_ptr);
+  device_address->set_status(DeviceAddressStatus::kInHostToDevice);
+
+  CHECK_OP_RET_WITH_EXCEPT(
+    CudaDriver::CopyHostMemToDeviceAsync(device_ptr, host_addr.addr, host_addr.size, swap_in_stream_),
+    "Failed to copy host memory to device.");
+  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(event, swap_in_stream_),
+                           "Failed to record CUDA event to swap in stream.");
+  swap_in_queue_.emplace(device_address, event);
+}
+
+bool GPUMemCopyManager::SyncMemCopyStream(SwapKind swap_kind) {
+  if (swap_kind == SwapKind::kDeviceToHost) {
+    return GPUDeviceManager::GetInstance().SyncStream(swap_out_stream_);
+  } else {
+    return GPUDeviceManager::GetInstance().SyncStream(swap_in_stream_);
+  }
+}
+
+DeviceAddressPtr GPUMemCopyManager::UpdateSwapOutQueue() {
+  if (swap_out_queue_.empty()) {
+    return nullptr;
+  }
+  auto &task = swap_out_queue_.front();
+  auto device_address = task.first;
+  auto &event = task.second;
+  bool finish_swap = CudaDriver::QueryEvent(event);
+  if (!finish_swap) {
+    return nullptr;
+  }
+  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(event), "Failed to destroy CUDA event of swap out.");
+  swap_out_queue_.pop();
+  return device_address;
+}
+
+DeviceAddressPtr GPUMemCopyManager::UpdateSwapInQueue() {
+  if (swap_in_queue_.empty()) {
+    return nullptr;
+  }
+  auto &task = swap_in_queue_.front();
+  auto device_address = task.first;
+  auto &event = task.second;
+  bool finish_swap = CudaDriver::QueryEvent(event);
+  if (!finish_swap) {
+    return nullptr;
+  }
+  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(event), "Failed to destroy CUDA event of swap in.");
+  swap_in_queue_.pop();
+  return device_address;
+}
+
+bool GPUMemCopyManager::AllocHostPinnedMem(size_t size, void **addr) {
+  auto alloc_size = CudaDriver::AllocHostPinnedMem(size, addr);
+  return alloc_size == size;
+}
+
+void GPUMemCopyManager::FreeHostPinnedMem(void *addr) { CudaDriver::FreeHostPinnedMem(addr); }
+
+void GPUMemCopyManager::ClearSwapQueue() {
+  CHECK_OP_RET_WITH_EXCEPT(SyncMemCopyStream(SwapKind::kDeviceToHost), "Failed to sync swap out stream");
+  CHECK_OP_RET_WITH_EXCEPT(SyncMemCopyStream(SwapKind::kHostToDevice), "Failed to sync swap in stream");
+
+  while (!swap_out_queue_.empty()) {
+    auto &event = swap_out_queue_.front().second;
+    CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(event), "Failed to destroy CUDA event of swap out.");
+    swap_out_queue_.pop();
+  }
+  while (!swap_in_queue_.empty()) {
+    auto &event = swap_in_queue_.front().second;
+    CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(event), "Failed to destroy CUDA event of swap in.");
+    swap_in_queue_.pop();
+  }
+}
+}  // namespace gpu
+}  // namespace device
+}  // namespace mindspore
--- a/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h
+++ b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h
@ -0,0 +1,69 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_COPY_MANAGER_H_
+#define MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_COPY_MANAGER_H_
+
+#include <memory>
+#include <queue>
+#include <utility>
+#include "pre_activate/mem_reuse/mem_copy_manager.h"
+#include "device/device_address.h"
+#include "device/gpu/cuda_driver.h"
+#include "kernel/kernel.h"
+
+namespace mindspore {
+namespace device {
+namespace gpu {
+using mindspore::device::memswap::MemCopyManager;
+using mindspore::device::memswap::SwapKind;
+class GPUMemCopyManager : public MemCopyManager {
+ public:
+  GPUMemCopyManager() = default;
+
+  ~GPUMemCopyManager() override = default;
+
+  void Init() override;
+
+  void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) override;
+
+  void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) override;
+
+  bool SyncMemCopyStream(SwapKind swap_kind) override;
+
+  DeviceAddressPtr UpdateSwapOutQueue() override;
+
+  DeviceAddressPtr UpdateSwapInQueue() override;
+
+  bool AllocHostPinnedMem(size_t size, void **addr) override;
+
+  void FreeHostPinnedMem(void *addr) override;
+
+  void ClearSwapQueue() override;
+
+ private:
+  DeviceStream swap_out_stream_{nullptr};
+  DeviceStream swap_in_stream_{nullptr};
+  std::queue<std::pair<DeviceAddressPtr, DeviceEvent>> swap_out_queue_;
+  std::queue<std::pair<DeviceAddressPtr, DeviceEvent>> swap_in_queue_;
+};
+using GPUMemCopyManagerPtr = std::shared_ptr<GPUMemCopyManager>;
+
+}  // namespace gpu
+}  // namespace device
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_COPY_MANAGER_H_
--- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h
+++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h
@ -14,8 +14,8 @@
 * limitations under the License.
 */

-#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_SWAP_UTIL_H_
-#define MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_SWAP_UTIL_H_
+#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_COPY_MANAGER_H_
+#define MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_COPY_MANAGER_H_

 #include <vector>
 #include <map>
@ -26,10 +26,6 @@
 #include "device/gpu/cuda_driver.h"
 #include "kernel/kernel.h"

-using mindspore::device::gpu::DeviceEvent;
-using mindspore::device::gpu::DeviceMemPtr;
-using mindspore::device::gpu::DeviceStream;
-using mindspore::device::gpu::HostMemPtr;
 using HostAddress = mindspore::kernel::Address;
 namespace mindspore {
 namespace device {
@ -74,31 +70,29 @@ class MemCopyManager {
 public:
  MemCopyManager() = default;

-  ~MemCopyManager() = default;
+  virtual ~MemCopyManager() = default;

-  void Init();
+  virtual void Init() {}

-  void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr);
+  virtual void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {}

-  void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr);
+  virtual void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {}

-  bool SyncMemCopyStream(SwapKind swap_kind);
+  virtual bool SyncMemCopyStream(SwapKind swap_kind) { return true; }

-  DeviceAddressPtr UpdateSwapOutQueue();
+  virtual DeviceAddressPtr UpdateSwapOutQueue() { return nullptr; }

-  DeviceAddressPtr UpdateSwapInQueue();
+  virtual DeviceAddressPtr UpdateSwapInQueue() { return nullptr; }

-  void ClearSwapQueue();
+  virtual bool AllocHostPinnedMem(size_t size, void **addr) { return true; }

- private:
-  DeviceStream swap_out_stream_{nullptr};
-  DeviceStream swap_in_stream_{nullptr};
-  std::queue<std::pair<DeviceAddressPtr, DeviceEvent>> swap_out_queue_;
-  std::queue<std::pair<DeviceAddressPtr, DeviceEvent>> swap_in_queue_;
+  virtual void FreeHostPinnedMem(void *addr) {}
+
+  virtual void ClearSwapQueue() {}
 };
 using MemCopyManagerPtr = std::shared_ptr<MemCopyManager>;
 }  // namespace memswap
 }  // namespace device
 }  // namespace mindspore

-#endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_SWAP_UTIL_H_
+#endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_COPY_MANAGER_H_
--- a/tests/ut/cpp/stub/runtime/cuda_runtime_api.h
+++ b/tests/ut/cpp/stub/runtime/cuda_runtime_api.h
@ -17,7 +17,9 @@
 #define TESTS_UT_STUB_RUNTIME_INCLUDE_CUDA_RUNTIME_API_H_

 #include <cstddef>
-typedef enum { cudaSuccess = 0 } cudaError_t;
+typedef enum { cudaSuccess = 0, cudaErrorNotReady = 1 } cudaError_t;
+
+unsigned int cudaEventDefault = 0;

 enum cudaMemcpyKind {
  cudaMemcpyHostToHost = 0,