forked from mindspore-Ecosystem/mindspore
commit e02ebc9ae5
@@ -21,15 +21,13 @@ namespace mindspore {
 namespace device {
 namespace gpu {
 void CollectiveFakeInitializer::InitCollective() {
-  MS_LOG(EXCEPTION)
-    << "You are trying to call 'init('nccl')', but this MindSpore package is built without NCCL. Please download GPU "
-       "version of MindSpore and try again.";
+  MS_LOG(EXCEPTION) << "You are trying to call 'init('nccl')'. Please check whether "
+                       "this MindSpore package is the GPU version and built with NCCL.";
 }

 void CollectiveFakeInitializer::FinalizeCollective() {
-  MS_LOG(EXCEPTION)
-    << "You are trying to call 'init('nccl')', but this MindSpore package is built without NCCL. Please download GPU "
-       "version of MindSpore and try again.";
+  MS_LOG(EXCEPTION) << "You are trying to call 'init('nccl')'. Please check whether "
+                       "this MindSpore package is the GPU version and built with NCCL.";
 }
 }  // namespace gpu
 }  // namespace device
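The fake initializer above gives a build without NCCL the same entry points as a GPU build, so a call to init('nccl') fails with an actionable message instead of a missing symbol. A minimal sketch of that stub pattern follows; the ENABLE_NCCL macro and function name are illustrative assumptions, not MindSpore's actual build flags.

// Sketch: a CPU-only build compiles a stub that raises on use, so callers of
// init('nccl') get a clear error instead of a missing-symbol failure.
// ENABLE_NCCL and InitCollective here are assumptions for illustration.
#include <stdexcept>

void InitCollective() {
#ifdef ENABLE_NCCL
  // Real NCCL/MPI initialization would go here in a GPU build.
#else
  throw std::runtime_error(
      "You are trying to call 'init('nccl')'. Please check whether this "
      "MindSpore package is the GPU version and built with NCCL.");
#endif
}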
@@ -32,12 +32,14 @@ const void *CollectiveInitializer::collective_handle() const { return collective
 void CollectiveInitializer::InitCollective() {
   void *handle = dlopen("libgpu_collective.so", RTLD_LAZY);
   if (handle == nullptr) {
-    MS_LOG(EXCEPTION) << "Loading libgpu_collective.so failed. Many reasons could cause this:\n"
-                         "1.libgpu_collective.so is not found.\n"
-                         "2.NCCL is not found or the user-installed NCCL version installed is incompatible: MindSpore "
-                         "requires NCCL-2.7.6.\n"
-                         "3.OpenMPI is not found or the user-installed OpenMPI version is incompatible: MindSpore "
-                         "requires OpenMPI-4.0.3.\n";
+    MS_LOG(EXCEPTION)
+      << "Loading libgpu_collective.so failed. Many reasons could cause this:\n"
+         "1.libgpu_collective.so is not found; please check whether this MindSpore package is the GPU version and "
+         "built with the distributed feature.\n"
+         "2.NCCL is not found or the user-installed NCCL version is incompatible: MindSpore "
+         "requires NCCL-2.7.6.\n"
+         "3.OpenMPI is not found or the user-installed OpenMPI version is incompatible: MindSpore "
+         "requires OpenMPI-4.0.3.\n";
   }
   auto mpi_init_funcptr = reinterpret_cast<InitMPI>(dlsym(handle, "InitMPI"));
   MS_EXCEPTION_IF_NULL(mpi_init_funcptr);
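The hunk above relies on the POSIX dlopen/dlsym pattern to load the collective library only when it is present. A self-contained sketch of that pattern, assuming nothing beyond <dlfcn.h>; the library and symbol names are taken from the diff, everything else is illustrative.

// Sketch: load a collective library at runtime, resolve an init symbol, and
// report dlerror() details when loading or symbol lookup fails.
// Build with: g++ load_collective.cc -ldl
#include <dlfcn.h>
#include <iostream>

using InitMPI = void (*)();

int main() {
  void *handle = dlopen("libgpu_collective.so", RTLD_LAZY);
  if (handle == nullptr) {
    // dlerror() explains the failure (missing file, unresolved NCCL/OpenMPI deps, ...).
    std::cerr << "Loading libgpu_collective.so failed: " << dlerror() << std::endl;
    return 1;
  }
  auto mpi_init = reinterpret_cast<InitMPI>(dlsym(handle, "InitMPI"));
  if (mpi_init == nullptr) {
    std::cerr << "Symbol InitMPI not found: " << dlerror() << std::endl;
    dlclose(handle);
    return 1;
  }
  mpi_init();
  dlclose(handle);
  return 0;
}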
@@ -37,8 +37,10 @@ bool GPUMemoryAllocator::Init() {
     MS_LOG(INFO) << "GPU device total memory size " << total_size << ", current free memory size " << free_size
                  << ", set max available memory size " << available_device_memory_ << ".";
   } else {
-    MS_LOG(EXCEPTION) << "GPU device memory error, total memory size " << total_size << ", current free memory size "
-                      << free_size << ", set max available memory size " << available_device_memory_ << ".";
+    MS_LOG(EXCEPTION)
+      << "The total size, free size, or max_device_memory size of GPU memory can't be zero, total memory size "
+      << total_size << ", current free memory size " << free_size << ", set max available memory size "
+      << available_device_memory_ << ".";
   }
   return true;
 }
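A minimal sketch of the kind of zero-size check the rewritten message describes, assuming the CUDA runtime's cudaMemGetInfo; the function and parameter names are stand-ins for illustration, not MindSpore's allocator API.

// Sketch: query total/free device memory and reject a zero-sized budget,
// mirroring the sanity check the new error message reports.
#include <cuda_runtime.h>
#include <algorithm>
#include <stdexcept>
#include <string>

size_t InitAvailableMemory(size_t max_device_memory_bytes) {
  size_t free_bytes = 0;
  size_t total_bytes = 0;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
    throw std::runtime_error("cudaMemGetInfo failed");
  }
  size_t available = std::min(free_bytes, max_device_memory_bytes);
  if (total_bytes == 0 || free_bytes == 0 || available == 0) {
    throw std::runtime_error(
        "The total size, free size, or max_device_memory size of GPU memory can't be zero: total " +
        std::to_string(total_bytes) + ", free " + std::to_string(free_bytes) +
        ", max available " + std::to_string(available));
  }
  return available;
}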
@@ -50,7 +52,7 @@ void GPUMemoryAllocator::CheckMaxDeviceMemory() const {
   // Currently not support modifying the max device memory.
   if (limited_device_memory_ != max_device_memory) {
     MS_LOG(EXCEPTION)
-      << "Can't change context param max_device_memory in runtime, currently effective max_device_memory("
+      << "Can't change or set context param max_device_memory during running, currently effective max_device_memory("
       << limited_device_memory_ << "GB), set new max_device_memory(" << max_device_memory << "GB) failed.";
   }
 }
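A small sketch of the guard that message describes: the first effective value of max_device_memory is cached, and any later attempt to change it is rejected. The class and member names are assumptions for illustration only.

// Sketch: cache the limit that actually took effect and refuse later changes.
#include <stdexcept>
#include <string>

class DeviceMemoryConfig {
 public:
  void SetMaxDeviceMemory(float max_gb) {
    if (initialized_ && limited_gb_ != max_gb) {
      throw std::runtime_error(
          "Can't change or set context param max_device_memory during running, currently effective max_device_memory(" +
          std::to_string(limited_gb_) + "GB), set new max_device_memory(" + std::to_string(max_gb) + "GB) failed.");
    }
    limited_gb_ = max_gb;
    initialized_ = true;
  }

 private:
  bool initialized_ = false;
  float limited_gb_ = 0.0f;
};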
@@ -80,7 +80,7 @@ std::string SupportedTypeList(const CNodePtr &kernel_node) {
   std::string op_name = AnfAlgo::GetCNodeName(kernel_node);
   auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, kernel::OpImplyType::kAKG);
   if (op_info_ptr == nullptr) {
-    MS_LOG(EXCEPTION) << "Unsupported op [" << op_name << "]";
+    MS_LOG(EXCEPTION) << "Unsupported op [" << op_name << "] on GPU";
   }
   (void)ParseMetadata(kernel_node, op_info_ptr, kernel::Processor::CUDA, &kernel_info_list);
   for (size_t i = 0; i < kernel_info_list.size(); i++) {
@@ -135,7 +135,7 @@ bool SelectAkgKernel(const CNodePtr &kernel_node, const std::shared_ptr<KernelBu
     return CheckKernelInfo(alternative_kernel_info, selected_kernel_info);
   });
   if (!match) {
-    MS_LOG(ERROR) << "Not find op[" << op_name << "] in akg";
+    MS_LOG(ERROR) << "Cannot find op[" << op_name << "] that matches both the data type and format in akg";
     return false;
   }
   return true;
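A self-contained sketch of the candidate-matching step that precedes this error: std::any_of scans the registered kernel infos for one whose data type and format both match the selection. The KernelInfo struct and its fields are assumptions for illustration, not MindSpore's types.

// Sketch: report a match failure only after checking both dtype and format.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct KernelInfo {
  std::string dtype;
  std::string format;
};

bool SelectKernel(const std::string &op_name, const std::vector<KernelInfo> &candidates,
                  const KernelInfo &selected) {
  bool match = std::any_of(candidates.begin(), candidates.end(), [&selected](const KernelInfo &k) {
    return k.dtype == selected.dtype && k.format == selected.format;
  });
  if (!match) {
    std::cerr << "Cannot find op[" << op_name << "] that matches both the data type and format in akg" << std::endl;
    return false;
  }
  return true;
}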