Merge pull request !22744 from zyli2020/fix_defect
This commit is contained in:
i-robot 2021-09-06 08:37:02 +00:00 committed by Gitee
commit e02ebc9ae5
4 changed files with 19 additions and 17 deletions

View File

@ -21,15 +21,13 @@ namespace mindspore {
namespace device {
namespace gpu {
// Init entry point of the fake collective initializer: the body performs no collective
// setup and only reports, via MS_LOG(EXCEPTION), that init('nccl') cannot be served.
// NOTE(review): MS_LOG(EXCEPTION) presumably raises, so control would not continue past the
// first log statement — confirm against the logging macro's definition.
void CollectiveFakeInitializer::InitCollective() {
MS_LOG(EXCEPTION)
<< "You are trying to call 'init('nccl')', but this MindSpore package is built without NCCL. Please download GPU "
"version of MindSpore and try again.";
MS_LOG(EXCEPTION) << "You are trying to call 'init('nccl')', Please check "
"this MindSpore package is GPU version and built with NCCL.";
}
// Finalize entry point of the fake collective initializer: the body performs no teardown
// and only reports an error via MS_LOG(EXCEPTION).
// NOTE(review): the message text refers to init('nccl') even though this is the finalize
// path — confirm that wording is intended. MS_LOG(EXCEPTION) presumably raises; verify.
void CollectiveFakeInitializer::FinalizeCollective() {
MS_LOG(EXCEPTION)
<< "You are trying to call 'init('nccl')', but this MindSpore package is built without NCCL. Please download GPU "
"version of MindSpore and try again.";
MS_LOG(EXCEPTION) << "You are trying to call 'init('nccl')', Please check "
"this MindSpore package is GPU version and built with NCCL.";
}
} // namespace gpu
} // namespace device

View File

@ -32,12 +32,14 @@ const void *CollectiveInitializer::collective_handle() const { return collective
void CollectiveInitializer::InitCollective() {
void *handle = dlopen("libgpu_collective.so", RTLD_LAZY);
if (handle == nullptr) {
MS_LOG(EXCEPTION) << "Loading libgpu_collective.so failed. Many reasons could cause this:\n"
"1.libgpu_collective.so is not found.\n"
"2.NCCL is not found or the user-installed NCCL version installed is incompatible: MindSpore "
"requires NCCL-2.7.6.\n"
"3.OpenMPI is not found or the user-installed OpenMPI version is incompatible: MindSpore "
"requires OpenMPI-4.0.3.\n";
MS_LOG(EXCEPTION)
<< "Loading libgpu_collective.so failed. Many reasons could cause this:\n"
"1.libgpu_collective.so is not found, please check this MindSpore package is GPU version and built "
"with distributed feature.\n"
"2.NCCL is not found or the user-installed NCCL version installed is incompatible: MindSpore "
"requires NCCL-2.7.6.\n"
"3.OpenMPI is not found or the user-installed OpenMPI version is incompatible: MindSpore "
"requires OpenMPI-4.0.3.\n";
}
auto mpi_init_funcptr = reinterpret_cast<InitMPI>(dlsym(handle, "InitMPI"));
MS_EXCEPTION_IF_NULL(mpi_init_funcptr);

View File

@ -37,8 +37,10 @@ bool GPUMemoryAllocator::Init() {
MS_LOG(INFO) << "GPU device total memory size " << total_size << ", current free memory size " << free_size
<< ", set max available memory size " << available_device_memory_ << ".";
} else {
MS_LOG(EXCEPTION) << "GPU device memory error, total memory size " << total_size << ", current free memory size "
<< free_size << ", set max available memory size " << available_device_memory_ << ".";
MS_LOG(EXCEPTION)
<< "The total size or free size or max_device_memory size of GPU memory can't be zero, total memory size "
<< total_size << ", current free memory size " << free_size << ", set max available memory size "
<< available_device_memory_ << ".";
}
return true;
}
@ -50,7 +52,7 @@ void GPUMemoryAllocator::CheckMaxDeviceMemory() const {
// Currently not support modifying the max device memory.
if (limited_device_memory_ != max_device_memory) {
MS_LOG(EXCEPTION)
<< "Can't change context param max_device_memory in runtime, currently effective max_device_memory("
<< "Can't change or set context param max_device_memory during running, currently effective max_device_memory("
<< limited_device_memory_ << "GB), set new max_device_memory(" << max_device_memory << "GB) failed.";
}
}

View File

@ -80,7 +80,7 @@ std::string SupportedTypeList(const CNodePtr &kernel_node) {
std::string op_name = AnfAlgo::GetCNodeName(kernel_node);
auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, kernel::OpImplyType::kAKG);
if (op_info_ptr == nullptr) {
MS_LOG(EXCEPTION) << "Unsupported op [" << op_name << "]";
MS_LOG(EXCEPTION) << "Unsupported op [" << op_name << "] on GPU";
}
(void)ParseMetadata(kernel_node, op_info_ptr, kernel::Processor::CUDA, &kernel_info_list);
for (size_t i = 0; i < kernel_info_list.size(); i++) {
@ -135,7 +135,7 @@ bool SelectAkgKernel(const CNodePtr &kernel_node, const std::shared_ptr<KernelBu
return CheckKernelInfo(alternative_kernel_info, selected_kernel_info);
});
if (!match) {
MS_LOG(ERROR) << "Not find op[" << op_name << "] in akg";
MS_LOG(ERROR) << "Not find op[" << op_name << "] which both match data type and format in akg";
return false;
}
return true;