!45662 gpu communication op memory alignment

Merge pull request !45662 from limingqi107/bug_fix4
i-robot 2022-11-21 01:16:01 +00:00 committed by Gitee
commit 438c18f5f4
12 changed files with 50 additions and 31 deletions

@@ -26,6 +26,7 @@ namespace device {
namespace gpu {
const size_t kGBToByte = 1024 << 20;
constexpr float kReservedMemoryRatio = 0.0625; // 1/16
static const size_t MEM_ALIGN_SIZE = 512;
bool GPUMemoryAllocator::Init() {
size_t total_size = CudaDriver::total_mem_size();
@@ -97,6 +98,13 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
bool GPUMemoryAllocator::FreeDeviceMem(const DeviceMemPtr &addr) { return CudaDriver::FreeDeviceMem(addr); }
size_t GPUMemoryAllocator::free_mem_size() { return std::min(CudaDriver::free_mem_size(), available_device_memory_); }
size_t GPUMemoryAllocator::AlignMemorySize(size_t size) const {
if (size == 0) {
return MEM_ALIGN_SIZE;
}
return ((size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE) * MEM_ALIGN_SIZE;
}
} // namespace gpu
} // namespace device
} // namespace mindspore
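As a reference for the change above, a minimal standalone sketch of the 512-byte round-up (illustrative only, not the MindSpore source; the constant mirrors MEM_ALIGN_SIZE):
#include <cassert>
#include <cstddef>
// Zero-sized requests still occupy one 512-byte unit; everything else is
// rounded up to the next multiple of 512.
static constexpr std::size_t kMemAlignSize = 512;
std::size_t AlignMemorySize(std::size_t size) {
  if (size == 0) {
    return kMemAlignSize;
  }
  return ((size + kMemAlignSize - 1) / kMemAlignSize) * kMemAlignSize;
}
int main() {
  assert(AlignMemorySize(0) == 512);
  assert(AlignMemorySize(1) == 512);
  assert(AlignMemorySize(512) == 512);
  assert(AlignMemorySize(513) == 1024);
  return 0;
}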

@@ -34,6 +34,7 @@ class GPUMemoryAllocator : public DynamicMemPoolBestFit {
size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override;
bool FreeDeviceMem(const DeviceMemPtr &addr) override;
size_t free_mem_size() override;
size_t AlignMemorySize(size_t size) const override;
static GPUMemoryAllocator &GetInstance() {
static GPUMemoryAllocator instance;

@@ -263,10 +263,17 @@ std::vector<void *> GPUDeviceResManager::AllocateContinuousMemory(const std::vec
std::vector<void *> ptr_list;
return ptr_list;
}
if (auto_mem_offload_ != nullptr) {
return auto_mem_offload_->MallocContinuousMem(size_list);
// Memory allocation ensures memory alignment.
std::vector<size_t> align_size_list;
for (size_t size : size_list) {
auto align_size = GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
(void)align_size_list.emplace_back(align_size);
}
return mem_manager_->MallocContinuousMemFromMemPool(size_list);
if (auto_mem_offload_ != nullptr) {
return auto_mem_offload_->MallocContinuousMem(align_size_list);
}
return mem_manager_->MallocContinuousMemFromMemPool(align_size_list);
}
namespace {
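To see why the request sizes are aligned before the continuous allocation, a hedged sketch (AlignUp is a stand-in for GPUMemoryAllocator::GetInstance().AlignMemorySize, and the request sizes are made up): when the pool lays the blocks out back to back, rounding every entry up keeps each block's starting offset on a 512-byte boundary.
#include <cstddef>
#include <iostream>
#include <vector>
// Stand-in for the allocator's AlignMemorySize: round up to 512 bytes.
std::size_t AlignUp(std::size_t size, std::size_t align = 512) {
  return size == 0 ? align : ((size + align - 1) / align) * align;
}
int main() {
  std::vector<std::size_t> size_list = {100, 700, 40};  // hypothetical request sizes
  std::size_t offset = 0;
  for (std::size_t size : size_list) {
    std::cout << "block of " << size << " bytes starts at offset " << offset << "\n";
    offset += AlignUp(size);  // printed offsets: 0, 512, 1536 -- all 512-aligned
  }
  return 0;
}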

@@ -141,16 +141,11 @@ bool GPUSomas::InplaceNodeProcess(const session::KernelGraph &graph) {
void GPUSomas::CommunicationTensorProcess(const std::vector<somas::SomasTensorPtr> &tensors) const {
if (tensors.size() != ALONE) {
size_t all_communication_size = 0;
for (auto &tensor : tensors) {
tensor->aligned_size_ = tensor->GetOriginalSize();
MS_EXCEPTION_IF_NULL(tensor);
MS_EXCEPTION_IF_CHECK_FAIL(tensor->aligned_size_ != 0, "The size of communication tensor is zero, tensor id: " +
std::to_string(tensor->GetId()));
all_communication_size += tensor->aligned_size_;
}
auto aligned_communication_size = GetAlignSize(all_communication_size);
auto need_aligned = aligned_communication_size - all_communication_size;
tensors[tensors.size() - 1]->aligned_size_ += need_aligned;
}
}
} // namespace gpu
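A simplified sketch of the padding idea in CommunicationTensorProcess (the struct, the helper name, and the 512-byte granularity are assumptions for illustration, not the somas API): the combined size of the fused communication tensors is rounded up, and the last tensor absorbs the remainder.
#include <cstddef>
#include <vector>
// Illustrative stand-in for a somas tensor; only the aligned size matters here.
struct ToyTensor {
  std::size_t aligned_size;
};
// Pad the last tensor so the whole fused communication block ends on an aligned boundary.
void PadLastTensor(std::vector<ToyTensor> *tensors, std::size_t align = 512) {
  if (tensors->empty()) {
    return;
  }
  std::size_t total = 0;
  for (const auto &t : *tensors) {
    total += t.aligned_size;
  }
  std::size_t aligned_total = ((total + align - 1) / align) * align;
  tensors->back().aligned_size += aligned_total - total;  // absorb the remainder
}
int main() {
  std::vector<ToyTensor> tensors = {{300}, {300}};  // total 600 is padded up to 1024
  PadLastTensor(&tensors);
  return tensors.back().aligned_size == 724 ? 0 : 1;  // 300 + 424 bytes of padding
}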

@@ -94,9 +94,13 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
for (size_t j = 0; j < shape.size(); j++) {
size *= LongToSizeClipNeg(shape[j]);
}
size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
input_size_list_.push_back(aligned_size);
input_size_ += aligned_size;
input_size_list_.push_back(size);
// Framework memory allocation ensures memory alignment, but the AllGather/ReduceScatter calculation cannot have
// alignment gaps in single-input scenarios.
if (input_num > 1) {
size = device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
}
input_size_ += size;
}
for (size_t i = 0; i < output_num; ++i) {
auto shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, i);
@@ -109,9 +113,13 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
for (size_t j = 0; j < shape.size(); j++) {
size *= LongToSizeClipNeg(shape[j]);
}
size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
output_size_list_.push_back(aligned_size);
output_size_ += aligned_size;
output_size_list_.push_back(size);
// Framework memory allocation ensures memory alignment, but the AllGather/ReduceScatter calculation cannot have
// alignment gaps in single-output scenarios.
if (output_num > 1) {
size = device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
}
output_size_ += size;
}
group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
@@ -216,21 +224,12 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
return;
}
size_t AlignMemorySize(size_t size) const {
if (size == 0) {
return COMMUNICATION_MEM_ALIGN_SIZE;
}
return ((size + COMMUNICATION_MEM_ALIGN_SIZE - 1) / COMMUNICATION_MEM_ALIGN_SIZE) * COMMUNICATION_MEM_ALIGN_SIZE;
}
NcclKernelType nccl_kernel_type_;
ncclRedOp_t nccl_reduce_type_;
size_t input_size_;
size_t output_size_;
int root_;
bool is_null_input_;
static const size_t COMMUNICATION_MEM_ALIGN_SIZE = 16;
};
} // namespace kernel
} // namespace mindspore
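The "alignment gaps" remark in the kernel comments above can be made concrete with a small sketch (rank count and payload size are made up): AllGather concatenates one send buffer per rank into a single receive buffer at offsets of rank * send_bytes, so padding a lone input up to 512 bytes would shift every later rank's data; only when a kernel packs several tensors into one arena do the per-tensor sizes need the allocator's round-up.
#include <cstddef>
#include <iostream>
int main() {
  // Sketch of the AllGather receive layout: rank r's chunk lands at r * send_bytes.
  const std::size_t ranks = 4;
  const std::size_t send_bytes = 100;  // exact payload per rank, no padding allowed
  for (std::size_t rank = 0; rank < ranks; ++rank) {
    std::cout << "rank " << rank << " payload expected at byte " << rank * send_bytes << "\n";
  }
  // If send_bytes were rounded up to 512, ranks 1..3 would be read from the wrong offsets.
  return 0;
}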

@@ -25,6 +25,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/kernel_constants.h"
#include "plugin/device/gpu/hal/hardware/nvidia_collective_comm_lib.h"
#include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
#include "runtime/collective/collective_comm_lib_loader.h"
namespace mindspore {

@@ -81,7 +81,8 @@ class NcclP2PGpuKernel : public NcclGpuKernelMod {
size *= LongToSizeClipNeg(shape[j]);
}
input_size_list_.push_back(size);
input_size_ += size;
// Framework memory allocation ensures memory alignment.
input_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
}
for (size_t i = 0; i < output_num; ++i) {
auto shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, i);
@@ -95,7 +96,8 @@ class NcclP2PGpuKernel : public NcclGpuKernelMod {
size *= LongToSizeClipNeg(shape[j]);
}
output_size_list_.push_back(size);
output_size_ += size;
// Framework memory allocation ensures memory alignment.
output_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
}
group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);

@@ -105,7 +105,8 @@ class MuxRecvGpuKernel : public MuxBaseGpuKernel {
size_t output_size =
std::accumulate(output_shape.begin(), output_shape.end(), data_size, std::multiplies<size_t>());
output_size_list_.push_back(output_size);
total_size_ += output_size;
// Framework memory allocation ensures memory alignment.
total_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(output_size);
}
SelectCollectiveHandle();

@@ -118,7 +118,8 @@ class MuxSendGpuKernel : public MuxBaseGpuKernel {
}
size_t input_size = std::accumulate(input_shape.begin(), input_shape.end(), data_size, std::multiplies<size_t>());
input_size_list_.push_back(input_size);
total_size_ += input_size;
// Framework memory allocation ensures memory alignment.
total_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(input_size);
}
output_size_list_.push_back(0);

@@ -24,7 +24,8 @@ void AbstractActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<
MS_EXCEPTION_IF_NULL(input_data);
MS_EXCEPTION_IF_NULL(input_data->data_);
if (!input_data->data_->IsPtrValid()) {
MS_LOG(EXCEPTION) << "The input_data does not have a valid ptr.";
MS_LOG(EXCEPTION) << "The input_data does not have a valid ptr of actor:" << GetAID().Name()
<< " with index:" << input_data->index_;
}
MS_EXCEPTION_IF_NULL(context);
auto &sequential_num = context->sequential_num_;

@@ -81,8 +81,10 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<
auto &size_list = (*size_list_list)[i];
auto &device_context = (*device_contexts)[i];
MS_EXCEPTION_IF_NULL(device_context);
// if the address of continuous tensor has already been allocated, skip the tensor
// If the address of continuous tensor has already been allocated, skip the tensor.
if (alloc_list[0]->GetPtr() != nullptr) {
MS_LOG(WARNING) << "The continuous memory has already been allocated of actor: " << from_aid.Name()
<< " with index: " << i;
continue;
}
// Allocate memory through the device context.

@@ -1495,7 +1495,8 @@ void GraphScheduler::LinkDataArrow(AbstractActor *const to_actor, const GraphCom
if (kKernelTypeToLinkFunc.count(kernel_type) == 0) {
if (graph_compiler_info.strategy_ == GraphExecutionStrategy::kPipeline) {
MS_LOG(WARNING) << "Invalid from node:" << from_kernel->fullname_with_scope() << ", type:" << kernel_type;
MS_LOG(EXCEPTION) << "Invalid from node:" << from_kernel->fullname_with_scope()
<< " to actor:" << to_actor->GetAID().Name() << ", type:" << kernel_type;
}
return;
}