forked from mindspore-Ecosystem/mindspore
!14055 Add clear error log when setting device id fails
From: @zyli2020 Reviewed-by: @wilfchen,@limingqi107,@wilfchen Signed-off-by: @wilfchen
This commit is contained in:
commit
a4db0f862d
|
@ -95,7 +95,7 @@ void GPUSession::Init(uint32_t device_id) {
|
|||
MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
|
||||
device_id = IntToUint((*get_local_rank_funcptr)());
|
||||
}
|
||||
bool ret = device::gpu::CudaDriver::set_current_device(UintToInt(device_id));
|
||||
bool ret = device::gpu::CudaDriver::SetDevice(UintToInt(device_id));
|
||||
if (!ret) {
|
||||
MS_LOG(EXCEPTION) << "GPUSession failed to set current device id:" << device_id;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "ps/ps_cache/ps_cache_factory.h"
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/hash_impl.cuh"
|
||||
#include "runtime/device/gpu/gpu_common.h"
|
||||
#include "runtime/device/gpu/cuda_driver.h"
|
||||
#include "runtime/device/gpu/gpu_memory_allocator.h"
|
||||
#include "utils/ms_context.h"
|
||||
|
||||
|
@ -26,7 +27,11 @@ namespace ps {
|
|||
namespace gpu {
|
||||
MS_REG_PS_CACHE(kGPUDevice, GPUPsCache);
|
||||
bool GPUPsCache::InitDevice(uint32_t device_id, const void *) {
|
||||
CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(cudaSetDevice(device_id), "Cuda set device failed")
|
||||
bool ret = device::gpu::CudaDriver::SetDevice(UintToInt(device_id));
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "Failed to set device id:" << device_id;
|
||||
return false;
|
||||
}
|
||||
CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(cudaStreamCreate(reinterpret_cast<CUstream_st **>(&stream_)),
|
||||
"Cuda create stream failed");
|
||||
return true;
|
||||
|
|
|
@ -238,11 +238,16 @@ int CudaDriver::device_count() {
|
|||
return dev_count;
|
||||
}
|
||||
|
||||
bool CudaDriver::set_current_device(int index) {
|
||||
bool CudaDriver::SetDevice(int index) {
|
||||
auto ret = cudaSetDevice(index);
|
||||
if (ret != cudaSuccess) {
|
||||
MS_LOG(ERROR) << "cudaSetDevice " << index << " failed, ret[" << static_cast<int>(ret) << "], "
|
||||
<< cudaGetErrorString(ret);
|
||||
MS_LOG(ERROR)
|
||||
<< "SetDevice for id:" << index << " failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret)
|
||||
<< ". Please make sure that the 'device_id' set in context is in the range:[0, total number of GPU). "
|
||||
"If the environment variable 'CUDA_VISIBLE_DEVICES' is set, the total number of GPU will be the number set "
|
||||
"in the environment variable 'CUDA_VISIBLE_DEVICES'. For example, if export CUDA_VISIBLE_DEVICES=4,5,6, the "
|
||||
"'device_id' can be 0,1,2 at the moment, 'device_id' starts from 0, and 'device_id'=0 means using GPU of "
|
||||
"number 4.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
|
|
@ -63,7 +63,7 @@ class CudaDriver {
|
|||
|
||||
// Encapsulate the cuda APIs associated with device management.
|
||||
static int device_count();
|
||||
static bool set_current_device(int index);
|
||||
static bool SetDevice(int index);
|
||||
|
||||
private:
|
||||
CudaDriver() = delete;
|
||||
|
|
|
@ -106,7 +106,14 @@ void GpuBufferMgr::set_device_id(int device_id) { cur_dev_id_ = device_id; }
|
|||
void GpuBufferMgr::set_device() const {
|
||||
auto ret = cudaSetDevice(cur_dev_id_);
|
||||
if (ret != cudaSuccess) {
|
||||
MS_LOG(ERROR) << "cudaSetDevice, ret[" << static_cast<int>(ret) << "]";
|
||||
MS_LOG(ERROR)
|
||||
<< "Set device for id:" << cur_dev_id_ << " failed, ret[" << static_cast<int>(ret) << "], "
|
||||
<< cudaGetErrorString(ret)
|
||||
<< ". Please make sure that the 'device_id' set in context is in the range:[0, total number of GPU). "
|
||||
"If the environment variable 'CUDA_VISIBLE_DEVICES' is set, the total number of GPU will be the number set "
|
||||
"in the environment variable 'CUDA_VISIBLE_DEVICES'. For example, if export CUDA_VISIBLE_DEVICES=4,5,6, the "
|
||||
"'device_id' can be 0,1,2 at the moment, 'device_id' starts from 0, and 'device_id'=0 means using GPU of "
|
||||
"number 4.";
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ namespace mindspore {
|
|||
namespace device {
|
||||
namespace gpu {
|
||||
void GPUDeviceManager::InitDevice() {
|
||||
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::set_current_device(SizeToInt(cur_dev_id_)), "Failed to set current device id");
|
||||
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SetDevice(SizeToInt(cur_dev_id_)), "Failed to set current device id");
|
||||
CHECK_OP_RET_WITH_EXCEPT(CreateStream(&default_stream_), "Failed to create CUDA stream.");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
|
||||
|
|
Loading…
Reference in New Issue