From c5ac2cc38c95fc10dedfc003c2cfcd49d0d4f07e Mon Sep 17 00:00:00 2001 From: lizhenyu Date: Fri, 22 May 2020 18:44:31 +0800 Subject: [PATCH] add memory copy module --- mindspore/ccsrc/device/device_address.h | 4 + mindspore/ccsrc/device/gpu/cuda_driver.cc | 91 +++++++++++- mindspore/ccsrc/device/gpu/cuda_driver.h | 14 ++ .../ccsrc/device/gpu/gpu_device_address.h | 5 + .../device/gpu/gpu_memory_copy_manager.cc | 131 ++++++++++++++++++ .../device/gpu/gpu_memory_copy_manager.h | 69 +++++++++ .../{mem_swap_util.h => mem_copy_manager.h} | 34 ++--- tests/ut/cpp/stub/runtime/cuda_runtime_api.h | 4 +- 8 files changed, 329 insertions(+), 23 deletions(-) create mode 100644 mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc create mode 100644 mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h rename mindspore/ccsrc/pre_activate/mem_reuse/{mem_swap_util.h => mem_copy_manager.h} (69%) diff --git a/mindspore/ccsrc/device/device_address.h b/mindspore/ccsrc/device/device_address.h index 2d43963934c..fd3188e0f2f 100644 --- a/mindspore/ccsrc/device/device_address.h +++ b/mindspore/ccsrc/device/device_address.h @@ -47,6 +47,8 @@ class GPUMemoryManager; namespace mindspore { namespace device { +enum class DeviceAddressStatus { kInDevice, kInHost, kInDeviceToHost, kInHostToDevice }; + class DeviceAddress { public: explicit DeviceAddress(void *ptr, size_t size) : ptr_(ptr), size_(size) {} @@ -60,6 +62,8 @@ class DeviceAddress { size_t GetSize() const { return size_; } std::string format() const { return format_; } TypeId type_id() const { return type_id_; } + virtual void set_status(DeviceAddressStatus status) {} + virtual DeviceAddressStatus status() const { return DeviceAddressStatus::kInDevice; } protected: const void *ptr() const { return ptr_; } diff --git a/mindspore/ccsrc/device/gpu/cuda_driver.cc b/mindspore/ccsrc/device/gpu/cuda_driver.cc index 3b265a4d5ee..85329a4069a 100644 --- a/mindspore/ccsrc/device/gpu/cuda_driver.cc +++ b/mindspore/ccsrc/device/gpu/cuda_driver.cc @@ -15,9 +15,7 @@ */ #include "device/gpu/cuda_driver.h" - #include - #include "utils/log_adapter.h" #include "utils/convert_utils.h" @@ -54,6 +52,27 @@ bool CudaDriver::FreeDeviceMem(const DeviceMemPtr &addr) { return true; } +size_t CudaDriver::AllocHostPinnedMem(size_t size, void **addr) { + if (size == 0) { + MS_LOG(EXCEPTION) << "The memory allocate size is 0"; + } + auto ret = cudaHostAlloc(addr, size, cudaHostAllocDefault); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaHostAlloc failed, ret[" << static_cast(ret) << "], " << cudaGetErrorString(ret); + return 0; + } + return size; +} + +void CudaDriver::FreeHostPinnedMem(void *addr) { + if (addr) { + auto ret = cudaFreeHost(addr); + if (ret != cudaSuccess) { + MS_LOG(EXCEPTION) << "cudaFreeHost failed, ret[" << static_cast(ret) << "], " << cudaGetErrorString(ret); + } + } +} + bool CudaDriver::CopyHostMemToDevice(const DeviceMemPtr &dst, const void *src, size_t size) { auto ret = cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); if (ret != cudaSuccess) { @@ -72,6 +91,25 @@ bool CudaDriver::CopyDeviceMemToHost(const HostMemPtr &dst, const DeviceMemPtr & return true; } +bool CudaDriver::CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size, DeviceStream stream) { + auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, (cudaStream_t)stream); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast(ret) << "], " << cudaGetErrorString(ret); + return false; + } + return true; +} + +bool CudaDriver::CopyDeviceMemToHostAsync(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size, + DeviceStream stream) { + auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, (cudaStream_t)stream); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast(ret) << "], " << cudaGetErrorString(ret); + return false; + } + return true; +} + size_t CudaDriver::total_mem_size() { size_t free; size_t total; @@ -122,6 +160,55 @@ bool CudaDriver::SyncStream(const DeviceStream &stream) { return true; } +bool CudaDriver::CreateEvent(DeviceEvent *event, unsigned int flag) { + auto ret = cudaEventCreateWithFlags(reinterpret_cast(event), flag); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaEventCreateWithFlags failed, ret[" << static_cast(ret) << "], " + << cudaGetErrorString(ret); + return false; + } + return true; +} + +bool CudaDriver::DestroyEvent(const DeviceEvent &event) { + auto ret = cudaEventDestroy((cudaEvent_t)event); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaEventDestroy failed, ret[" << static_cast(ret) << "], " << cudaGetErrorString(ret); + return false; + } + return true; +} + +bool CudaDriver::RecordEvent(DeviceEvent event, DeviceStream stream) { + auto ret = cudaEventRecord((cudaEvent_t)event, (cudaStream_t)stream); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaEventRecord failed, ret[" << static_cast(ret) << "], " << cudaGetErrorString(ret); + return false; + } + return true; +} + +bool CudaDriver::SyncEvent(const DeviceEvent &event) { + auto ret = cudaEventSynchronize((cudaEvent_t)event); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaEventSynchronize failed, ret[" << static_cast(ret) << "], " << cudaGetErrorString(ret); + return false; + } + return true; +} + +bool CudaDriver::QueryEvent(const DeviceEvent &event) { + auto ret = cudaEventQuery((cudaEvent_t)event); + if (ret == cudaSuccess) { + return true; + } else if (ret == cudaErrorNotReady) { + return false; + } else { + MS_LOG(ERROR) << "cudaEventQuery failed, ret[" << static_cast(ret) << "], " << cudaGetErrorString(ret); + return false; + } +} + int CudaDriver::device_count() { int dev_count; auto ret = cudaGetDeviceCount(&dev_count); diff --git a/mindspore/ccsrc/device/gpu/cuda_driver.h b/mindspore/ccsrc/device/gpu/cuda_driver.h index 860e747f38d..fb5d60f6cfb 100644 --- a/mindspore/ccsrc/device/gpu/cuda_driver.h +++ b/mindspore/ccsrc/device/gpu/cuda_driver.h @@ -33,8 +33,16 @@ class CudaDriver { // such as malloc/free and memory copy from host to device and reverse. static size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr); static bool FreeDeviceMem(const DeviceMemPtr &addr); + static size_t AllocHostPinnedMem(size_t size, void **addr); + static void FreeHostPinnedMem(void *addr); + static bool CopyHostMemToDevice(const DeviceMemPtr &dst, const void *src, size_t size); static bool CopyDeviceMemToHost(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size); + + static bool CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size, DeviceStream stream = 0); + static bool CopyDeviceMemToHostAsync(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size, + DeviceStream stream = 0); + static size_t total_mem_size(); static size_t free_mem_size(); @@ -44,6 +52,12 @@ class CudaDriver { static bool DestroyStream(const DeviceStream &stream); static bool SyncStream(const DeviceStream &stream); + static bool CreateEvent(DeviceEvent *event, unsigned int flag = cudaEventDefault); + static bool DestroyEvent(const DeviceEvent &event); + static bool RecordEvent(DeviceEvent event, DeviceStream stream = 0); + static bool SyncEvent(const DeviceEvent &event); + static bool QueryEvent(const DeviceEvent &event); + // Encapsulate the cuda APIs associated with device management. static int device_count(); static bool set_current_device(int index); diff --git a/mindspore/ccsrc/device/gpu/gpu_device_address.h b/mindspore/ccsrc/device/gpu/gpu_device_address.h index ac54102a0a8..f5c6b6e36b7 100644 --- a/mindspore/ccsrc/device/gpu/gpu_device_address.h +++ b/mindspore/ccsrc/device/gpu/gpu_device_address.h @@ -33,6 +33,11 @@ class GPUDeviceAddress : public DeviceAddress { bool SyncDeviceToHost(const std::vector &shape, size_t size, TypeId type, void *host_ptr) const override; bool SyncHostToDevice(const std::vector &shape, size_t size, TypeId type, const void *host_ptr) const override; + void set_status(DeviceAddressStatus status) { status_ = status; } + DeviceAddressStatus status() const { return status_; } + + private: + DeviceAddressStatus status_{DeviceAddressStatus::kInDevice}; }; } // namespace gpu } // namespace device diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc new file mode 100644 index 00000000000..8443e4799f2 --- /dev/null +++ b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.cc @@ -0,0 +1,131 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/gpu/gpu_memory_copy_manager.h" +#include "device/gpu/gpu_common.h" +#include "device/gpu/gpu_device_manager.h" +#include "session/anf_runtime_algorithm.h" + +namespace mindspore { +namespace device { +namespace gpu { +void GPUMemCopyManager::Init() { + CHECK_OP_RET_WITH_EXCEPT(GPUDeviceManager::GetInstance().CreateStream(&swap_out_stream_), + "Failed to create CUDA stream of memory swap out."); + CHECK_OP_RET_WITH_EXCEPT(GPUDeviceManager::GetInstance().CreateStream(&swap_in_stream_), + "Failed to create CUDA stream of memory swap in."); +} + +void GPUMemCopyManager::AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) { + MS_EXCEPTION_IF_NULL(device_address); + MS_EXCEPTION_IF_NULL(host_addr.addr); + DeviceEvent event = nullptr; + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&event, cudaEventDisableTiming), "Failed to create CUDA event."); + DeviceMemPtr device_ptr = const_cast(device_address->GetPtr()); + MS_EXCEPTION_IF_NULL(device_ptr); + device_address->set_status(DeviceAddressStatus::kInDeviceToHost); + + CHECK_OP_RET_WITH_EXCEPT( + CudaDriver::CopyDeviceMemToHostAsync(host_addr.addr, device_ptr, host_addr.size, swap_out_stream_), + "Failed to copy device memory to host."); + + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(event, swap_out_stream_), + "Failed to record CUDA event to swap out stream."); + swap_out_queue_.emplace(device_address, event); +} + +void GPUMemCopyManager::AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) { + MS_EXCEPTION_IF_NULL(device_address); + MS_EXCEPTION_IF_NULL(host_addr.addr); + DeviceEvent event = nullptr; + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&event, cudaEventDisableTiming), "Failed to create CUDA event."); + DeviceMemPtr device_ptr = const_cast(device_address->GetPtr()); + MS_EXCEPTION_IF_NULL(device_ptr); + device_address->set_status(DeviceAddressStatus::kInHostToDevice); + + CHECK_OP_RET_WITH_EXCEPT( + CudaDriver::CopyHostMemToDeviceAsync(device_ptr, host_addr.addr, host_addr.size, swap_in_stream_), + "Failed to copy host memory to device."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(event, swap_in_stream_), + "Failed to record CUDA event to swap in stream."); + swap_in_queue_.emplace(device_address, event); +} + +bool GPUMemCopyManager::SyncMemCopyStream(SwapKind swap_kind) { + if (swap_kind == SwapKind::kDeviceToHost) { + return GPUDeviceManager::GetInstance().SyncStream(swap_out_stream_); + } else { + return GPUDeviceManager::GetInstance().SyncStream(swap_in_stream_); + } +} + +DeviceAddressPtr GPUMemCopyManager::UpdateSwapOutQueue() { + if (swap_out_queue_.empty()) { + return nullptr; + } + auto &task = swap_out_queue_.front(); + auto device_address = task.first; + auto &event = task.second; + bool finish_swap = CudaDriver::QueryEvent(event); + if (!finish_swap) { + return nullptr; + } + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(event), "Failed to destroy CUDA event of swap out."); + swap_out_queue_.pop(); + return device_address; +} + +DeviceAddressPtr GPUMemCopyManager::UpdateSwapInQueue() { + if (swap_in_queue_.empty()) { + return nullptr; + } + auto &task = swap_in_queue_.front(); + auto device_address = task.first; + auto &event = task.second; + bool finish_swap = CudaDriver::QueryEvent(event); + if (!finish_swap) { + return nullptr; + } + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(event), "Failed to destroy CUDA event of swap in."); + swap_in_queue_.pop(); + return device_address; +} + +bool GPUMemCopyManager::AllocHostPinnedMem(size_t size, void **addr) { + auto alloc_size = CudaDriver::AllocHostPinnedMem(size, addr); + return alloc_size == size; +} + +void GPUMemCopyManager::FreeHostPinnedMem(void *addr) { CudaDriver::FreeHostPinnedMem(addr); } + +void GPUMemCopyManager::ClearSwapQueue() { + CHECK_OP_RET_WITH_EXCEPT(SyncMemCopyStream(SwapKind::kDeviceToHost), "Failed to sync swap out stream"); + CHECK_OP_RET_WITH_EXCEPT(SyncMemCopyStream(SwapKind::kHostToDevice), "Failed to sync swap in stream"); + + while (!swap_out_queue_.empty()) { + auto &event = swap_out_queue_.front().second; + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(event), "Failed to destroy CUDA event of swap out."); + swap_out_queue_.pop(); + } + while (!swap_in_queue_.empty()) { + auto &event = swap_in_queue_.front().second; + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(event), "Failed to destroy CUDA event of swap in."); + swap_in_queue_.pop(); + } +} +} // namespace gpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h new file mode 100644 index 00000000000..a7cd8d4d8fc --- /dev/null +++ b/mindspore/ccsrc/device/gpu/gpu_memory_copy_manager.h @@ -0,0 +1,69 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_COPY_MANAGER_H_ +#define MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_COPY_MANAGER_H_ + +#include +#include +#include +#include "pre_activate/mem_reuse/mem_copy_manager.h" +#include "device/device_address.h" +#include "device/gpu/cuda_driver.h" +#include "kernel/kernel.h" + +namespace mindspore { +namespace device { +namespace gpu { +using mindspore::device::memswap::MemCopyManager; +using mindspore::device::memswap::SwapKind; +class GPUMemCopyManager : public MemCopyManager { + public: + GPUMemCopyManager() = default; + + ~GPUMemCopyManager() override = default; + + void Init() override; + + void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) override; + + void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) override; + + bool SyncMemCopyStream(SwapKind swap_kind) override; + + DeviceAddressPtr UpdateSwapOutQueue() override; + + DeviceAddressPtr UpdateSwapInQueue() override; + + bool AllocHostPinnedMem(size_t size, void **addr) override; + + void FreeHostPinnedMem(void *addr) override; + + void ClearSwapQueue() override; + + private: + DeviceStream swap_out_stream_{nullptr}; + DeviceStream swap_in_stream_{nullptr}; + std::queue> swap_out_queue_; + std::queue> swap_in_queue_; +}; +using GPUMemCopyManagerPtr = std::shared_ptr; + +} // namespace gpu +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_COPY_MANAGER_H_ diff --git a/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_util.h b/mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h similarity index 69% rename from mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_util.h rename to mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h index 35a36d6ab5c..a635ef1da1b 100644 --- a/mindspore/ccsrc/pre_activate/mem_reuse/mem_swap_util.h +++ b/mindspore/ccsrc/pre_activate/mem_reuse/mem_copy_manager.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_SWAP_UTIL_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_SWAP_UTIL_H_ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_COPY_MANAGER_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_COPY_MANAGER_H_ #include #include @@ -26,10 +26,6 @@ #include "device/gpu/cuda_driver.h" #include "kernel/kernel.h" -using mindspore::device::gpu::DeviceEvent; -using mindspore::device::gpu::DeviceMemPtr; -using mindspore::device::gpu::DeviceStream; -using mindspore::device::gpu::HostMemPtr; using HostAddress = mindspore::kernel::Address; namespace mindspore { namespace device { @@ -74,31 +70,29 @@ class MemCopyManager { public: MemCopyManager() = default; - ~MemCopyManager() = default; + virtual ~MemCopyManager() = default; - void Init(); + virtual void Init() {} - void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr); + virtual void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {} - void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr); + virtual void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {} - bool SyncMemCopyStream(SwapKind swap_kind); + virtual bool SyncMemCopyStream(SwapKind swap_kind) { return true; } - DeviceAddressPtr UpdateSwapOutQueue(); + virtual DeviceAddressPtr UpdateSwapOutQueue() { return nullptr; } - DeviceAddressPtr UpdateSwapInQueue(); + virtual DeviceAddressPtr UpdateSwapInQueue() { return nullptr; } - void ClearSwapQueue(); + virtual bool AllocHostPinnedMem(size_t size, void **addr) { return true; } - private: - DeviceStream swap_out_stream_{nullptr}; - DeviceStream swap_in_stream_{nullptr}; - std::queue> swap_out_queue_; - std::queue> swap_in_queue_; + virtual void FreeHostPinnedMem(void *addr) {} + + virtual void ClearSwapQueue() {} }; using MemCopyManagerPtr = std::shared_ptr; } // namespace memswap } // namespace device } // namespace mindspore -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_SWAP_UTIL_H_ +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_MEM_REUSE_MEM_COPY_MANAGER_H_ diff --git a/tests/ut/cpp/stub/runtime/cuda_runtime_api.h b/tests/ut/cpp/stub/runtime/cuda_runtime_api.h index 1a30d5879c4..7f0f0d0be33 100644 --- a/tests/ut/cpp/stub/runtime/cuda_runtime_api.h +++ b/tests/ut/cpp/stub/runtime/cuda_runtime_api.h @@ -17,7 +17,9 @@ #define TESTS_UT_STUB_RUNTIME_INCLUDE_CUDA_RUNTIME_API_H_ #include -typedef enum { cudaSuccess = 0 } cudaError_t; +typedef enum { cudaSuccess = 0, cudaErrorNotReady = 1 } cudaError_t; + +unsigned int cudaEventDefault = 0; enum cudaMemcpyKind { cudaMemcpyHostToHost = 0,