!45662 gpu communication op memory alignment
Merge pull request !45662 from limingqi107/bug_fix4
commit 438c18f5f4
@@ -26,6 +26,7 @@ namespace device {
 namespace gpu {
 const size_t kGBToByte = 1024 << 20;
 constexpr float kReservedMemoryRatio = 0.0625;  // 1/16
+static const size_t MEM_ALIGN_SIZE = 512;
 
 bool GPUMemoryAllocator::Init() {
   size_t total_size = CudaDriver::total_mem_size();
@@ -97,6 +98,13 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
 bool GPUMemoryAllocator::FreeDeviceMem(const DeviceMemPtr &addr) { return CudaDriver::FreeDeviceMem(addr); }
 
 size_t GPUMemoryAllocator::free_mem_size() { return std::min(CudaDriver::free_mem_size(), available_device_memory_); }
+
+size_t GPUMemoryAllocator::AlignMemorySize(size_t size) const {
+  if (size == 0) {
+    return MEM_ALIGN_SIZE;
+  }
+  return ((size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE) * MEM_ALIGN_SIZE;
+}
 }  // namespace gpu
 }  // namespace device
 }  // namespace mindspore
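For reference, a minimal standalone sketch (not part of the commit) of the 512-byte round-up the new AlignMemorySize performs; the helper name, local constant, and sample values are illustrative only.

#include <cstddef>
#include <cstdio>

// Stand-in for GPUMemoryAllocator::AlignMemorySize with MEM_ALIGN_SIZE = 512.
constexpr size_t kMemAlignSize = 512;

size_t AlignUp(size_t size) {
  if (size == 0) {
    return kMemAlignSize;  // a zero-sized request still reserves one aligned unit
  }
  return ((size + kMemAlignSize - 1) / kMemAlignSize) * kMemAlignSize;
}

int main() {
  // 1 -> 512, 512 -> 512, 513 -> 1024, 0 -> 512.
  std::printf("%zu %zu %zu %zu\n", AlignUp(1), AlignUp(512), AlignUp(513), AlignUp(0));
  return 0;
}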
@@ -34,6 +34,7 @@ class GPUMemoryAllocator : public DynamicMemPoolBestFit {
   size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override;
   bool FreeDeviceMem(const DeviceMemPtr &addr) override;
   size_t free_mem_size() override;
+  size_t AlignMemorySize(size_t size) const override;
 
   static GPUMemoryAllocator &GetInstance() {
     static GPUMemoryAllocator instance;
@@ -263,10 +263,17 @@ std::vector<void *> GPUDeviceResManager::AllocateContinuousMemory(const std::vec
     std::vector<void *> ptr_list;
     return ptr_list;
   }
-  if (auto_mem_offload_ != nullptr) {
-    return auto_mem_offload_->MallocContinuousMem(size_list);
+
+  // Memory allocation ensures memory alignment.
+  std::vector<size_t> align_size_list;
+  for (size_t size : size_list) {
+    auto align_size = GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
+    (void)align_size_list.emplace_back(align_size);
   }
-  return mem_manager_->MallocContinuousMemFromMemPool(size_list);
+  if (auto_mem_offload_ != nullptr) {
+    return auto_mem_offload_->MallocContinuousMem(align_size_list);
+  }
+  return mem_manager_->MallocContinuousMemFromMemPool(align_size_list);
 }
 
 namespace {
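A hedged sketch of the per-entry alignment that AllocateContinuousMemory now applies before asking the pool for one contiguous block; the free functions below are stand-ins, not MindSpore's API.

#include <cstddef>
#include <vector>

// Illustrative stand-in for GPUMemoryAllocator::GetInstance().AlignMemorySize().
size_t AlignMemorySize(size_t size) {
  constexpr size_t kAlign = 512;
  return size == 0 ? kAlign : ((size + kAlign - 1) / kAlign) * kAlign;
}

// Mirrors the new loop: each requested size is rounded up first, so every
// member of the continuous block starts on a 512-byte boundary.
std::vector<size_t> AlignSizeList(const std::vector<size_t> &size_list) {
  std::vector<size_t> align_size_list;
  align_size_list.reserve(size_list.size());
  for (size_t size : size_list) {
    align_size_list.push_back(AlignMemorySize(size));
  }
  return align_size_list;
}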
@@ -141,16 +141,11 @@ bool GPUSomas::InplaceNodeProcess(const session::KernelGraph &graph) {
void GPUSomas::CommunicationTensorProcess(const std::vector<somas::SomasTensorPtr> &tensors) const {
  if (tensors.size() != ALONE) {
    size_t all_communication_size = 0;
    for (auto &tensor : tensors) {
      tensor->aligned_size_ = tensor->GetOriginalSize();
      MS_EXCEPTION_IF_NULL(tensor);
      MS_EXCEPTION_IF_CHECK_FAIL(tensor->aligned_size_ != 0, "The size of communication tensor is zero, tensor id: " +
                                                               std::to_string(tensor->GetId()));
      all_communication_size += tensor->aligned_size_;
    }
    auto aligned_communication_size = GetAlignSize(all_communication_size);
    auto need_aligned = aligned_communication_size - all_communication_size;
    tensors[tensors.size() - 1]->aligned_size_ += need_aligned;
  }
}
}  // namespace gpu
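A small sketch of the bookkeeping above, under the assumption that communication tensors keep their original sizes and only the last one absorbs the padding needed to align the whole block; the Tensor struct and the 512-byte constant are assumptions for illustration.

#include <cstddef>
#include <vector>

struct Tensor {          // hypothetical stand-in for somas::SomasTensor
  size_t aligned_size_;  // bytes SOMAS will reserve for this tensor
};

constexpr size_t kSomasAlign = 512;  // assumed value behind GetAlignSize

size_t GetAlignSize(size_t size) { return ((size + kSomasAlign - 1) / kSomasAlign) * kSomasAlign; }

// Members stay back-to-back with no per-tensor gaps; only the total is padded.
void PadLastCommunicationTensor(std::vector<Tensor> *tensors) {
  if (tensors == nullptr || tensors->size() <= 1) {
    return;
  }
  size_t all_communication_size = 0;
  for (const auto &tensor : *tensors) {
    all_communication_size += tensor.aligned_size_;
  }
  size_t need_aligned = GetAlignSize(all_communication_size) - all_communication_size;
  tensors->back().aligned_size_ += need_aligned;
}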
@@ -94,9 +94,13 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
       for (size_t j = 0; j < shape.size(); j++) {
         size *= LongToSizeClipNeg(shape[j]);
       }
-      size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
-      input_size_list_.push_back(aligned_size);
-      input_size_ += aligned_size;
+      input_size_list_.push_back(size);
+      // Framework memory allocation ensures memory alignment, but AllGather/ReduceScatter calculation can't have
+      // aligned gaps in single input scenarios.
+      if (input_num > 1) {
+        size = device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
+      }
+      input_size_ += size;
     }
     for (size_t i = 0; i < output_num; ++i) {
       auto shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, i);
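A condensed sketch of the input-size accounting above: with several inputs packed into one fused buffer, each size is rounded up so the next input starts aligned, while a single input is left exactly as large as its data because AllGather/ReduceScatter would otherwise read or write into the gap. The helper and byte counts are illustrative.

#include <cstddef>
#include <vector>

size_t AlignMemorySize(size_t size) {  // illustrative 512-byte round-up
  constexpr size_t kAlign = 512;
  return size == 0 ? kAlign : ((size + kAlign - 1) / kAlign) * kAlign;
}

size_t AccumulateInputSize(const std::vector<size_t> &input_bytes) {
  size_t total = 0;
  for (size_t size : input_bytes) {
    // Padding between inputs is only safe when more than one input shares the buffer.
    total += (input_bytes.size() > 1) ? AlignMemorySize(size) : size;
  }
  return total;
}

// Example: two 600-byte inputs -> 2048 total (each padded to 1024);
// a single 600-byte input -> exactly 600.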
@@ -109,9 +113,13 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
       for (size_t j = 0; j < shape.size(); j++) {
         size *= LongToSizeClipNeg(shape[j]);
       }
-      size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
-      output_size_list_.push_back(aligned_size);
-      output_size_ += aligned_size;
+      output_size_list_.push_back(size);
+      // Framework memory allocation ensures memory alignment, but AllGather/ReduceScatter calculation can't have
+      // aligned gaps in single output scenarios.
+      if (output_num > 1) {
+        size = device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
+      }
+      output_size_ += size;
     }
 
     group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
@@ -216,21 +224,12 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
     return;
   }
 
-  size_t AlignMemorySize(size_t size) const {
-    if (size == 0) {
-      return COMMUNICATION_MEM_ALIGN_SIZE;
-    }
-    return ((size + COMMUNICATION_MEM_ALIGN_SIZE - 1) / COMMUNICATION_MEM_ALIGN_SIZE) * COMMUNICATION_MEM_ALIGN_SIZE;
-  }
-
   NcclKernelType nccl_kernel_type_;
   ncclRedOp_t nccl_reduce_type_;
   size_t input_size_;
   size_t output_size_;
   int root_;
   bool is_null_input_;
-
-  static const size_t COMMUNICATION_MEM_ALIGN_SIZE = 16;
 };
 }  // namespace kernel
 }  // namespace mindspore
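The block removed above rounded sizes to a kernel-local 16-byte boundary; alignment is now handled by the allocator's 512-byte rule instead. A worked comparison with an illustrative 600-byte tensor:

#include <cstddef>
#include <cstdio>

size_t RoundUp(size_t size, size_t align) {
  return size == 0 ? align : ((size + align - 1) / align) * align;
}

int main() {
  const size_t size = 600;
  // Old kernel-local rule: 608 bytes; allocator-level rule: 1024 bytes.
  std::printf("16-byte: %zu, 512-byte: %zu\n", RoundUp(size, 16), RoundUp(size, 512));
  return 0;
}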
@@ -25,6 +25,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "plugin/device/gpu/hal/hardware/nvidia_collective_comm_lib.h"
+#include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
 #include "runtime/collective/collective_comm_lib_loader.h"
 
 namespace mindspore {
@@ -81,7 +81,8 @@ class NcclP2PGpuKernel : public NcclGpuKernelMod {
         size *= LongToSizeClipNeg(shape[j]);
       }
       input_size_list_.push_back(size);
-      input_size_ += size;
+      // Framework memory allocation ensures memory alignment.
+      input_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
     }
     for (size_t i = 0; i < output_num; ++i) {
       auto shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, i);
@@ -95,7 +96,8 @@ class NcclP2PGpuKernel : public NcclGpuKernelMod {
         size *= LongToSizeClipNeg(shape[j]);
       }
       output_size_list_.push_back(size);
-      output_size_ += size;
+      // Framework memory allocation ensures memory alignment.
+      output_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
     }
 
     group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
@@ -105,7 +105,8 @@ class MuxRecvGpuKernel : public MuxBaseGpuKernel {
       size_t output_size =
         std::accumulate(output_shape.begin(), output_shape.end(), data_size, std::multiplies<size_t>());
       output_size_list_.push_back(output_size);
-      total_size_ += output_size;
+      // Framework memory allocation ensures memory alignment.
+      total_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(output_size);
     }
 
     SelectCollectiveHandle();
@@ -118,7 +118,8 @@ class MuxSendGpuKernel : public MuxBaseGpuKernel {
     }
     size_t input_size = std::accumulate(input_shape.begin(), input_shape.end(), data_size, std::multiplies<size_t>());
     input_size_list_.push_back(input_size);
-    total_size_ += input_size;
+    // Framework memory allocation ensures memory alignment.
+    total_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(input_size);
   }
   output_size_list_.push_back(0);
 
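As an aside, a tiny sketch of the std::accumulate idiom both Mux kernels use above: seeding the fold with the element size yields element_bytes * prod(shape). The shape and dtype below are made up for the example.

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<size_t> shape = {2, 3, 4};  // illustrative output shape
  size_t data_size = sizeof(float);       // bytes per element
  size_t bytes = std::accumulate(shape.begin(), shape.end(), data_size, std::multiplies<size_t>());
  return bytes == 96 ? 0 : 1;             // 4 * (2 * 3 * 4) = 96
}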
@@ -24,7 +24,8 @@ void AbstractActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<
   MS_EXCEPTION_IF_NULL(input_data);
   MS_EXCEPTION_IF_NULL(input_data->data_);
   if (!input_data->data_->IsPtrValid()) {
-    MS_LOG(EXCEPTION) << "The input_data does not have a valid ptr.";
+    MS_LOG(EXCEPTION) << "The input_data does not have a valid ptr of actor:" << GetAID().Name()
+                      << " with index:" << input_data->index_;
   }
   MS_EXCEPTION_IF_NULL(context);
   auto &sequential_num = context->sequential_num_;
@@ -81,8 +81,10 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<
     auto &size_list = (*size_list_list)[i];
     auto &device_context = (*device_contexts)[i];
     MS_EXCEPTION_IF_NULL(device_context);
-    // if the address of continuous tensor has already been allocated, skip the tensor
+    // If the address of continuous tensor has already been allocated, skip the tensor.
     if (alloc_list[0]->GetPtr() != nullptr) {
+      MS_LOG(WARNING) << "The continuous memory has already been allocated of actor: " << from_aid.Name()
+                      << " with index: " << i;
       continue;
     }
     // Allocate memory through the device context.
@@ -1495,7 +1495,8 @@ void GraphScheduler::LinkDataArrow(AbstractActor *const to_actor, const GraphCom
 
   if (kKernelTypeToLinkFunc.count(kernel_type) == 0) {
     if (graph_compiler_info.strategy_ == GraphExecutionStrategy::kPipeline) {
-      MS_LOG(WARNING) << "Invalid from node:" << from_kernel->fullname_with_scope() << ", type:" << kernel_type;
+      MS_LOG(EXCEPTION) << "Invalid from node:" << from_kernel->fullname_with_scope()
+                        << " to actor:" << to_actor->GetAID().Name() << ", type:" << kernel_type;
     }
     return;
   }