diff --git a/mindspore/ccsrc/runtime/device/gpu/distribution/collective_fake_init.cc b/mindspore/ccsrc/runtime/device/gpu/distribution/collective_fake_init.cc index cd511277456..2258ccf4421 100644 --- a/mindspore/ccsrc/runtime/device/gpu/distribution/collective_fake_init.cc +++ b/mindspore/ccsrc/runtime/device/gpu/distribution/collective_fake_init.cc @@ -21,15 +21,13 @@ namespace mindspore { namespace device { namespace gpu { void CollectiveFakeInitializer::InitCollective() { - MS_LOG(EXCEPTION) - << "You are trying to call 'init('nccl')', but this MindSpore package is built without NCCL. Please download GPU " - "version of MindSpore and try again."; + MS_LOG(EXCEPTION) << "You are trying to call 'init('nccl')', please check whether " "this MindSpore package is the GPU version and built with NCCL."; } void CollectiveFakeInitializer::FinalizeCollective() { - MS_LOG(EXCEPTION) - << "You are trying to call 'init('nccl')', but this MindSpore package is built without NCCL. Please download GPU " - "version of MindSpore and try again."; + MS_LOG(EXCEPTION) << "You are trying to call 'init('nccl')', please check whether " "this MindSpore package is the GPU version and built with NCCL."; } } // namespace gpu } // namespace device diff --git a/mindspore/ccsrc/runtime/device/gpu/distribution/collective_init.cc b/mindspore/ccsrc/runtime/device/gpu/distribution/collective_init.cc index 137956a6a12..65d7c27f786 100644 --- a/mindspore/ccsrc/runtime/device/gpu/distribution/collective_init.cc +++ b/mindspore/ccsrc/runtime/device/gpu/distribution/collective_init.cc @@ -32,12 +32,14 @@ const void *CollectiveInitializer::collective_handle() const { return collective void CollectiveInitializer::InitCollective() { void *handle = dlopen("libgpu_collective.so", RTLD_LAZY); if (handle == nullptr) { - MS_LOG(EXCEPTION) << "Loading libgpu_collective.so failed. 
Many reasons could cause this:\n" - "1.libgpu_collective.so is not found.\n" - "2.NCCL is not found or the user-installed NCCL version installed is incompatible: MindSpore " - "requires NCCL-2.7.6.\n" - "3.OpenMPI is not found or the user-installed OpenMPI version is incompatible: MindSpore " - "requires OpenMPI-4.0.3.\n"; + MS_LOG(EXCEPTION) + << "Loading libgpu_collective.so failed. Many reasons could cause this:\n" + "1.libgpu_collective.so is not found, please check whether this MindSpore package is the GPU version " + "and built with the distributed feature.\n" + "2.NCCL is not found or the user-installed NCCL version is incompatible: MindSpore " + "requires NCCL-2.7.6.\n" + "3.OpenMPI is not found or the user-installed OpenMPI version is incompatible: MindSpore " + "requires OpenMPI-4.0.3.\n"; } auto mpi_init_funcptr = reinterpret_cast(dlsym(handle, "InitMPI")); MS_EXCEPTION_IF_NULL(mpi_init_funcptr); diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_allocator.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_allocator.cc index 46a135ffffe..4aa48a1f402 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_allocator.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_allocator.cc @@ -37,8 +37,10 @@ bool GPUMemoryAllocator::Init() { MS_LOG(INFO) << "GPU device total memory size " << total_size << ", current free memory size " << free_size << ", set max available memory size " << available_device_memory_ << "."; } else { - MS_LOG(EXCEPTION) << "GPU device memory error, total memory size " << total_size << ", current free memory size " - << free_size << ", set max available memory size " << available_device_memory_ << "."; + MS_LOG(EXCEPTION) + << "The total size, free size, or max_device_memory size of GPU memory can't be zero, total memory size " + << total_size << ", current free memory size " << free_size << ", set max available memory size " + << available_device_memory_ << "."; } return true; } @@ -50,7 +52,7 @@ void 
GPUMemoryAllocator::CheckMaxDeviceMemory() const { // Currently not support modifying the max device memory. if (limited_device_memory_ != max_device_memory) { MS_LOG(EXCEPTION) - << "Can't change context param max_device_memory in runtime, currently effective max_device_memory(" + << "Can't change or set context param max_device_memory at runtime, currently effective max_device_memory(" << limited_device_memory_ << "GB), set new max_device_memory(" << max_device_memory << "GB) failed."; } } diff --git a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc index 7d62e65d10c..6d4ce0311e1 100644 --- a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc +++ b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc @@ -80,7 +80,7 @@ std::string SupportedTypeList(const CNodePtr &kernel_node) { std::string op_name = AnfAlgo::GetCNodeName(kernel_node); auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, kernel::OpImplyType::kAKG); if (op_info_ptr == nullptr) { - MS_LOG(EXCEPTION) << "Unsupported op [" << op_name << "]"; + MS_LOG(EXCEPTION) << "Unsupported op [" << op_name << "] on GPU"; } (void)ParseMetadata(kernel_node, op_info_ptr, kernel::Processor::CUDA, &kernel_info_list); for (size_t i = 0; i < kernel_info_list.size(); i++) { @@ -135,7 +135,7 @@ bool SelectAkgKernel(const CNodePtr &kernel_node, const std::shared_ptr