!45662 gpu communication op memory alignment
Merge pull request !45662 from limingqi107/bug_fix4
commit 438c18f5f4
@@ -26,6 +26,7 @@ namespace device {
 namespace gpu {
 const size_t kGBToByte = 1024 << 20;
 constexpr float kReservedMemoryRatio = 0.0625;  // 1/16
+static const size_t MEM_ALIGN_SIZE = 512;
 
 bool GPUMemoryAllocator::Init() {
   size_t total_size = CudaDriver::total_mem_size();
@@ -97,6 +98,13 @@ size_t GPUMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
 bool GPUMemoryAllocator::FreeDeviceMem(const DeviceMemPtr &addr) { return CudaDriver::FreeDeviceMem(addr); }
 
 size_t GPUMemoryAllocator::free_mem_size() { return std::min(CudaDriver::free_mem_size(), available_device_memory_); }
+
+size_t GPUMemoryAllocator::AlignMemorySize(size_t size) const {
+  if (size == 0) {
+    return MEM_ALIGN_SIZE;
+  }
+  return ((size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE) * MEM_ALIGN_SIZE;
+}
 }  // namespace gpu
 }  // namespace device
 }  // namespace mindspore
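For reference, a minimal standalone sketch (not part of the commit) of the 512-byte round-up the new AlignMemorySize performs; the helper name, local constant, and sample values are illustrative only.

#include <cstddef>
#include <cstdio>

// Stand-in for GPUMemoryAllocator::AlignMemorySize with MEM_ALIGN_SIZE = 512.
constexpr size_t kMemAlignSize = 512;

size_t AlignUp(size_t size) {
  if (size == 0) {
    return kMemAlignSize;  // a zero-sized request still reserves one aligned unit
  }
  return ((size + kMemAlignSize - 1) / kMemAlignSize) * kMemAlignSize;
}

int main() {
  // 1 -> 512, 512 -> 512, 513 -> 1024, 0 -> 512.
  std::printf("%zu %zu %zu %zu\n", AlignUp(1), AlignUp(512), AlignUp(513), AlignUp(0));
  return 0;
}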
@@ -34,6 +34,7 @@ class GPUMemoryAllocator : public DynamicMemPoolBestFit {
   size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override;
   bool FreeDeviceMem(const DeviceMemPtr &addr) override;
   size_t free_mem_size() override;
+  size_t AlignMemorySize(size_t size) const override;
 
   static GPUMemoryAllocator &GetInstance() {
     static GPUMemoryAllocator instance;
@@ -263,10 +263,17 @@ std::vector<void *> GPUDeviceResManager::AllocateContinuousMemory(const std::vec
     std::vector<void *> ptr_list;
     return ptr_list;
   }
-  if (auto_mem_offload_ != nullptr) {
-    return auto_mem_offload_->MallocContinuousMem(size_list);
+
+  // Memory allocation ensures memory alignment.
+  std::vector<size_t> align_size_list;
+  for (size_t size : size_list) {
+    auto align_size = GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
+    (void)align_size_list.emplace_back(align_size);
   }
-  return mem_manager_->MallocContinuousMemFromMemPool(size_list);
+  if (auto_mem_offload_ != nullptr) {
+    return auto_mem_offload_->MallocContinuousMem(align_size_list);
+  }
+  return mem_manager_->MallocContinuousMemFromMemPool(align_size_list);
 }
 
 namespace {
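A hedged sketch of the per-entry alignment that AllocateContinuousMemory now applies before asking the pool for one contiguous block; the free functions below are stand-ins, not MindSpore's API.

#include <cstddef>
#include <vector>

// Illustrative stand-in for GPUMemoryAllocator::GetInstance().AlignMemorySize().
size_t AlignMemorySize(size_t size) {
  constexpr size_t kAlign = 512;
  return size == 0 ? kAlign : ((size + kAlign - 1) / kAlign) * kAlign;
}

// Mirrors the new loop: each requested size is rounded up first, so every
// member of the continuous block starts on a 512-byte boundary.
std::vector<size_t> AlignSizeList(const std::vector<size_t> &size_list) {
  std::vector<size_t> align_size_list;
  align_size_list.reserve(size_list.size());
  for (size_t size : size_list) {
    align_size_list.push_back(AlignMemorySize(size));
  }
  return align_size_list;
}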
@@ -141,16 +141,11 @@ bool GPUSomas::InplaceNodeProcess(const session::KernelGraph &graph) {
void GPUSomas::CommunicationTensorProcess(const std::vector<somas::SomasTensorPtr> &tensors) const {
  if (tensors.size() != ALONE) {
    size_t all_communication_size = 0;
    for (auto &tensor : tensors) {
      tensor->aligned_size_ = tensor->GetOriginalSize();
      MS_EXCEPTION_IF_NULL(tensor);
      MS_EXCEPTION_IF_CHECK_FAIL(tensor->aligned_size_ != 0, "The size of communication tensor is zero, tensor id: " +
                                                               std::to_string(tensor->GetId()));
      all_communication_size += tensor->aligned_size_;
    }
    auto aligned_communication_size = GetAlignSize(all_communication_size);
    auto need_aligned = aligned_communication_size - all_communication_size;
    tensors[tensors.size() - 1]->aligned_size_ += need_aligned;
  }
}
}  // namespace gpu
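A small sketch of the bookkeeping above, under the assumption that communication tensors keep their original sizes and only the last one absorbs the padding needed to align the whole block; the Tensor struct and the 512-byte constant are assumptions for illustration.

#include <cstddef>
#include <vector>

struct Tensor {          // hypothetical stand-in for somas::SomasTensor
  size_t aligned_size_;  // bytes SOMAS will reserve for this tensor
};

constexpr size_t kSomasAlign = 512;  // assumed value behind GetAlignSize

size_t GetAlignSize(size_t size) { return ((size + kSomasAlign - 1) / kSomasAlign) * kSomasAlign; }

// Members stay back-to-back with no per-tensor gaps; only the total is padded.
void PadLastCommunicationTensor(std::vector<Tensor> *tensors) {
  if (tensors == nullptr || tensors->size() <= 1) {
    return;
  }
  size_t all_communication_size = 0;
  for (const auto &tensor : *tensors) {
    all_communication_size += tensor.aligned_size_;
  }
  size_t need_aligned = GetAlignSize(all_communication_size) - all_communication_size;
  tensors->back().aligned_size_ += need_aligned;
}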
@@ -94,9 +94,13 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
       for (size_t j = 0; j < shape.size(); j++) {
         size *= LongToSizeClipNeg(shape[j]);
       }
-      size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
-      input_size_list_.push_back(aligned_size);
-      input_size_ += aligned_size;
+      input_size_list_.push_back(size);
+      // Framework memory allocation ensures memory alignment, but AllGather/ReduceScatter calculation can't have
+      // aligned gaps in single input scenarios.
+      if (input_num > 1) {
+        size = device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
+      }
+      input_size_ += size;
     }
     for (size_t i = 0; i < output_num; ++i) {
       auto shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, i);
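A condensed sketch of the input-size accounting above: with several inputs packed into one fused buffer, each size is rounded up so the next input starts aligned, while a single input is left exactly as large as its data because AllGather/ReduceScatter would otherwise read or write into the gap. The helper and byte counts are illustrative.

#include <cstddef>
#include <vector>

size_t AlignMemorySize(size_t size) {  // illustrative 512-byte round-up
  constexpr size_t kAlign = 512;
  return size == 0 ? kAlign : ((size + kAlign - 1) / kAlign) * kAlign;
}

size_t AccumulateInputSize(const std::vector<size_t> &input_bytes) {
  size_t total = 0;
  for (size_t size : input_bytes) {
    // Padding between inputs is only safe when more than one input shares the buffer.
    total += (input_bytes.size() > 1) ? AlignMemorySize(size) : size;
  }
  return total;
}

// Example: two 600-byte inputs -> 2048 total (each padded to 1024);
// a single 600-byte input -> exactly 600.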
@@ -109,9 +113,13 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
       for (size_t j = 0; j < shape.size(); j++) {
         size *= LongToSizeClipNeg(shape[j]);
       }
-      size_t aligned_size = (nccl_kernel_type_ != NCCL_ALL_REDUCE) ? size : AlignMemorySize(size);
-      output_size_list_.push_back(aligned_size);
-      output_size_ += aligned_size;
+      output_size_list_.push_back(size);
+      // Framework memory allocation ensures memory alignment, but AllGather/ReduceScatter calculation can't have
+      // aligned gaps in single output scenarios.
+      if (output_num > 1) {
+        size = device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
+      }
+      output_size_ += size;
     }
 
     group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
@@ -216,21 +224,12 @@ class NcclCollectiveGpuKernel : public NcclGpuKernelMod {
     return;
   }
 
-  size_t AlignMemorySize(size_t size) const {
-    if (size == 0) {
-      return COMMUNICATION_MEM_ALIGN_SIZE;
-    }
-    return ((size + COMMUNICATION_MEM_ALIGN_SIZE - 1) / COMMUNICATION_MEM_ALIGN_SIZE) * COMMUNICATION_MEM_ALIGN_SIZE;
-  }
-
   NcclKernelType nccl_kernel_type_;
   ncclRedOp_t nccl_reduce_type_;
   size_t input_size_;
   size_t output_size_;
   int root_;
   bool is_null_input_;
-
-  static const size_t COMMUNICATION_MEM_ALIGN_SIZE = 16;
 };
 }  // namespace kernel
 }  // namespace mindspore
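The block removed above rounded sizes to a kernel-local 16-byte boundary; alignment is now handled by the allocator's 512-byte rule instead. A worked comparison with an illustrative 600-byte tensor:

#include <cstddef>
#include <cstdio>

size_t RoundUp(size_t size, size_t align) {
  return size == 0 ? align : ((size + align - 1) / align) * align;
}

int main() {
  const size_t size = 600;
  // Old kernel-local rule: 608 bytes; allocator-level rule: 1024 bytes.
  std::printf("16-byte: %zu, 512-byte: %zu\n", RoundUp(size, 16), RoundUp(size, 512));
  return 0;
}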
@@ -25,6 +25,7 @@
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/kernel_constants.h"
 #include "plugin/device/gpu/hal/hardware/nvidia_collective_comm_lib.h"
+#include "plugin/device/gpu/hal/device/gpu_memory_allocator.h"
 #include "runtime/collective/collective_comm_lib_loader.h"
 
 namespace mindspore {
@@ -81,7 +81,8 @@ class NcclP2PGpuKernel : public NcclGpuKernelMod {
         size *= LongToSizeClipNeg(shape[j]);
       }
       input_size_list_.push_back(size);
-      input_size_ += size;
+      // Framework memory allocation ensures memory alignment.
+      input_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
     }
     for (size_t i = 0; i < output_num; ++i) {
       auto shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, i);
@@ -95,7 +96,8 @@ class NcclP2PGpuKernel : public NcclGpuKernelMod {
         size *= LongToSizeClipNeg(shape[j]);
       }
       output_size_list_.push_back(size);
-      output_size_ += size;
+      // Framework memory allocation ensures memory alignment.
+      output_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(size);
     }
 
     group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
@@ -105,7 +105,8 @@ class MuxRecvGpuKernel : public MuxBaseGpuKernel {
       size_t output_size =
         std::accumulate(output_shape.begin(), output_shape.end(), data_size, std::multiplies<size_t>());
       output_size_list_.push_back(output_size);
-      total_size_ += output_size;
+      // Framework memory allocation ensures memory alignment.
+      total_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(output_size);
     }
 
     SelectCollectiveHandle();
@@ -118,7 +118,8 @@ class MuxSendGpuKernel : public MuxBaseGpuKernel {
     }
     size_t input_size = std::accumulate(input_shape.begin(), input_shape.end(), data_size, std::multiplies<size_t>());
     input_size_list_.push_back(input_size);
-    total_size_ += input_size;
+    // Framework memory allocation ensures memory alignment.
+    total_size_ += device::gpu::GPUMemoryAllocator::GetInstance().AlignMemorySize(input_size);
   }
   output_size_list_.push_back(0);
 
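As an aside, a tiny sketch of the std::accumulate idiom both Mux kernels use above: seeding the fold with the element size yields element_bytes * prod(shape). The shape and dtype below are made up for the example.

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<size_t> shape = {2, 3, 4};  // illustrative output shape
  size_t data_size = sizeof(float);       // bytes per element
  size_t bytes = std::accumulate(shape.begin(), shape.end(), data_size, std::multiplies<size_t>());
  return bytes == 96 ? 0 : 1;             // 4 * (2 * 3 * 4) = 96
}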
@@ -24,7 +24,8 @@ void AbstractActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<
   MS_EXCEPTION_IF_NULL(input_data);
   MS_EXCEPTION_IF_NULL(input_data->data_);
   if (!input_data->data_->IsPtrValid()) {
-    MS_LOG(EXCEPTION) << "The input_data does not have a valid ptr.";
+    MS_LOG(EXCEPTION) << "The input_data does not have a valid ptr of actor:" << GetAID().Name()
+                      << " with index:" << input_data->index_;
   }
   MS_EXCEPTION_IF_NULL(context);
   auto &sequential_num = context->sequential_num_;
@@ -81,8 +81,10 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<
     auto &size_list = (*size_list_list)[i];
     auto &device_context = (*device_contexts)[i];
     MS_EXCEPTION_IF_NULL(device_context);
-    // if the address of continuous tensor has already been allocated, skip the tensor
+    // If the address of continuous tensor has already been allocated, skip the tensor.
     if (alloc_list[0]->GetPtr() != nullptr) {
+      MS_LOG(WARNING) << "The continuous memory has already been allocated of actor: " << from_aid.Name()
+                      << " with index: " << i;
       continue;
     }
     // Allocate memory through the device context.
@@ -1495,7 +1495,8 @@ void GraphScheduler::LinkDataArrow(AbstractActor *const to_actor, const GraphCom
 
   if (kKernelTypeToLinkFunc.count(kernel_type) == 0) {
     if (graph_compiler_info.strategy_ == GraphExecutionStrategy::kPipeline) {
-      MS_LOG(WARNING) << "Invalid from node:" << from_kernel->fullname_with_scope() << ", type:" << kernel_type;
+      MS_LOG(EXCEPTION) << "Invalid from node:" << from_kernel->fullname_with_scope()
+                        << " to actor:" << to_actor->GetAID().Name() << ", type:" << kernel_type;
     }
     return;
   }