!9491 GPU add trace for error/excpt

From: @VectorSL
Reviewed-by: @cristoval, @jjfeing
Signed-off-by: @jjfeing

commit ab0f23a90e
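
The change applies one pattern across every GPU kernel touched below: `Init` caches the incoming `CNodePtr` in the `kernel_node_` member this commit adds to the `GpuKernel` base class, and the node becomes a new first argument of the `CHECK_CUDA_RET_*`, `CHECK_CUDNN_RET_*`, `CHECK_CUBLAS_RET_*`, `CHECK_CUSOLVER_RET_*`, and `CHECK_NCCL_RET_*` macros, so a failed runtime call can be traced back to the graph node that issued it. A minimal sketch of the macro shape, for orientation only — the real definitions live in MindSpore's GPU kernel headers and are not part of this diff, and `GetNodeTrace` is a hypothetical stand-in for whatever trace helper the macros actually use:

// Before: CHECK_CUDA_RET_WITH_EXCEPT(expression, message)
// After:  CHECK_CUDA_RET_WITH_EXCEPT(node, expression, message)
#define CHECK_CUDA_RET_WITH_EXCEPT(node, expr, message)               \
  do {                                                                \
    cudaError_t status = (expr);                                      \
    if (status != cudaSuccess) {                                      \
      /* The node is threaded through so the raised exception can  */ \
      /* name the op that launched the failing CUDA call.          */ \
      MS_LOG(EXCEPTION) << message << " | CUDA error: "               \
                        << cudaGetErrorString(status)                 \
                        << " | node: " << GetNodeTrace(node);         \
    }                                                                 \
  } while (0)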
@@ -56,11 +56,13 @@ class ArrayReduceGpuKernel : public GpuKernel {
     if (all_match_) {
       MS_LOG(DEBUG)
         << "The corresponding dimensions of the input and output tensors all match. No need to call cuDNN kernel.";
-      CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, input_addr, inputs[0]->size, cudaMemcpyDeviceToDevice,
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                                 cudaMemcpyAsync(output_addr, input_addr, inputs[0]->size, cudaMemcpyDeviceToDevice,
                                                  reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cudaMemcpyAsync failed in ArrayReduceGpuKernel::Launch.");
     } else {
       CHECK_CUDNN_RET_WITH_EXCEPT(
+        kernel_node_,
         cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_, &alpha,
                           inputA_descriptor_, input_addr, &beta, outputC_descriptor_, output_addr),
         "cudnnReduceTensor failed.");

@@ -68,6 +70,7 @@ class ArrayReduceGpuKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     InitResource();
     data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@@ -140,34 +143,35 @@ class ArrayReduceGpuKernel : public GpuKernel {
   }

   void DestroyResource() noexcept override {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
                                "cudnnDestroyReduceTensorDescriptor failed.");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(inputA_descriptor_),
                                "cudnnDestroyTensorDescriptor failed.");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(outputC_descriptor_),
                                "cudnnDestroyTensorDescriptor failed.");
   }

  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
                                 "cudnnCreateReduceTensorDescriptor failed.");
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_),
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&inputA_descriptor_),
                                 "cudnnCreateTensorDescriptor failed.");
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&outputC_descriptor_),
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&outputC_descriptor_),
                                 "cudnnCreateTensorDescriptor failed.");
   }
   void InitSizeLists() override {
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_),
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_),
                                 "cudnnGetTensorSizeInBytes failed.");
     input_size_list_.push_back(input_size_);

-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_),
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_),
                                 "cudnnGetTensorSizeInBytes failed.");
     output_size_list_.push_back(output_size_);

     CHECK_CUDNN_RET_WITH_EXCEPT(
+      kernel_node_,
       cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_,
                                      &workspace_size_),
       "cudnnGetReductionWorkspaceSize failed.");

@@ -186,6 +190,7 @@ class ArrayReduceGpuKernel : public GpuKernel {
     }

     CHECK_CUDNN_RET_WITH_EXCEPT(
+      kernel_node_,
       cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, reduce_tensor_op_, CUDNN_DATA_FLOAT, nan_prop_,
                                      reduce_indices_, CUDNN_32BIT_INDICES),
       "cudnnSetReduceTensorDescriptor failed");

@@ -199,11 +204,12 @@ class ArrayReduceGpuKernel : public GpuKernel {
     if (input_shape.size() <= split_dim) {
       ShapeNdTo4d(input_shape, &inputA);
       CHECK_CUDNN_RET_WITH_EXCEPT(
+        kernel_node_,
         cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_, SizeToInt(inputA[0]),
                                    SizeToInt(inputA[1]), SizeToInt(inputA[2]), SizeToInt(inputA[3])),
         "cudnnSetTensor4dDescriptor failed");
     } else {
-      CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_);
+      CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_, kernel_node_);
       for (auto dim : input_shape) {
         inputA.emplace_back(SizeToInt(dim));
       }

@@ -213,10 +219,10 @@ class ArrayReduceGpuKernel : public GpuKernel {
     outputC_shape.resize(input_shape.size(), 1);
     if (outputC_shape.size() <= split_dim) {
       CHECK_CUDNN_RET_WITH_EXCEPT(
-        cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1),
+        kernel_node_, cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, 1, 1, 1, 1),
         "cudnnSetTensor4dDescriptor failed");
     } else {
-      CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_);
+      CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_);
     }

     for (auto dim : inputA) {

@@ -239,11 +245,12 @@ class ArrayReduceGpuKernel : public GpuKernel {
     if (outputC_shape.size() <= split_dim) {
       ShapeNdTo4d(outputC_shape, &outputC);
       CHECK_CUDNN_RET_WITH_EXCEPT(
+        kernel_node_,
         cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_, SizeToInt(outputC[0]),
                                    SizeToInt(outputC[1]), SizeToInt(outputC[2]), SizeToInt(outputC[3])),
         "cudnnSetTensor4dDescriptor failed");
     } else {
-      CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_);
+      CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_);
       for (auto dim : outputC_shape) {
         outputC.emplace_back(SizeToInt(dim));
       }

@@ -49,10 +49,12 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     for (size_t i = 0; i < inputs.size(); i++) {
       inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
     }
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "ConcatV2 opt cudaMemcpyAsync inputs failed");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "ConcatV2 opt cudaMemcpyAsync length on axis failed");
     ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device,

@@ -60,6 +62,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     if (!CheckParam(kernel_node)) {
       return false;
     }

@@ -41,6 +41,7 @@ class DynamicShapeGpuKernel : public GpuKernel {
     S *output_device_address = GetDeviceAddress<S>(outputs, 0);
     size_t prev_node_output_shape_size = prev_node_output_shape_.size() * sizeof(S);
     CHECK_CUDA_RET_WITH_EXCEPT(
+      kernel_node_,
       cudaMemcpyAsync(output_device_address, prev_node_output_shape_.data(), prev_node_output_shape_size,
                       cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
       "cudaMemcpyAsync prev_node_output_shape failed");

@@ -49,6 +50,7 @@ class DynamicShapeGpuKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     size_t input_count = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_count != 1) {
       MS_LOG(EXCEPTION) << input_count << " arguments were provided, but DynamicShapeGpuKernel expects 1.";

@@ -51,10 +51,12 @@ class GatherNdGpuFwdKernel : public GpuKernel {
     if (!memcpy_flag_) {
       const size_t strides_len = sizeof(S) * batch_strides_.size();
       const size_t indices_len = sizeof(S) * batch_indices_.size();
-      CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_batch_strides_, &batch_strides_[0], strides_len,
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                                 cudaMemcpyAsync(dev_batch_strides_, &batch_strides_[0], strides_len,
                                                  cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cudaMemcpyAsync failed in GatherNdGpuFwdKernel::Launch.");
-      CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_batch_indices_, &batch_indices_[0], indices_len,
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                                 cudaMemcpyAsync(dev_batch_indices_, &batch_indices_[0], indices_len,
                                                  cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cudaMemcpyAsync failed in GatherNdGpuFwdKernel::Launch.");
       memcpy_flag_ = true;

@@ -65,6 +67,7 @@ class GatherNdGpuFwdKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     InitResource();
     memcpy_flag_ = false;
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@@ -43,10 +43,12 @@ class GatherV2GpuFwdKernel : public GpuKernel {
     T *output_addr = GetDeviceAddress<T>(outputs, 0);
     if (is_dynamic_shape_) {
       int64_t *axis_device_address = GetDeviceAddress<int64_t>(inputs, 2);  // only get this if in dynamic mode
-      CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&axis_, axis_device_address, sizeof(int64_t), cudaMemcpyDeviceToHost,
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                                 cudaMemcpyAsync(&axis_, axis_device_address, sizeof(int64_t), cudaMemcpyDeviceToHost,
                                                  reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cudaMemcpyAsync axis_ failed");
-      CHECK_CUDA_RET_WITH_EXCEPT(cudaDeviceSynchronize(), "cudaDeviceSyncFailed - GatherV2 - in dynamic mode");
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(),
+                                 "cudaDeviceSyncFailed - GatherV2 - in dynamic mode");
       Reshape();
     }
     auto input_dim1 = input_shapes_[IntToSize(axis_)];

@@ -55,6 +57,7 @@ class GatherV2GpuFwdKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     InitResource();
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num == 3) {

@@ -41,7 +41,8 @@ class PackGpuFwdKernel : public GpuKernel {
     for (size_t i = 0; i < inputs.size(); i++) {
       inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
     }
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_array,  // NOLINT
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(inputs_array,  // NOLINT
                                                inputs_host_.get(), sizeof(T *) * input_num_, cudaMemcpyHostToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "Pack opt cudaMemcpyAsync inputs failed");

@@ -50,6 +51,7 @@ class PackGpuFwdKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     if (!CheckParam(kernel_node)) {
       return false;
     }

@@ -41,13 +41,15 @@ class ScatterAddKernel : public GpuKernel {
     T *updates = GetDeviceAddress<T>(inputs, 2);
     T *output = GetDeviceAddress<T>(outputs, 0);
     CalScatterAdd(inner_size_, indices_size_, indices, updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemcpyAsync output failed");
     return true;
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 3) {
       MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterAdd needs 3 inputs.";

@@ -61,16 +61,19 @@ class ScatterNdGpuFwdKernel : public GpuKernel {
     if (!memcpy_flag_) {
       const size_t indices_len = sizeof(S) * vec_indices_stride_.size();
       const size_t vec_work_len = sizeof(S) * vec_work_shape_.size();
-      CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(indices_stride_, &vec_indices_stride_[0], indices_len,
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                                 cudaMemcpyAsync(indices_stride_, &vec_indices_stride_[0], indices_len,
                                                  cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cudaMemcpy failed in ScatterNdGpuFwdKernel::Launch.");
-      CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(work_shape_, &vec_work_shape_[0], vec_work_len, cudaMemcpyHostToDevice,
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                                 cudaMemcpyAsync(work_shape_, &vec_work_shape_[0], vec_work_len, cudaMemcpyHostToDevice,
                                                  reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cudaMemcpy failed in ScatterNdGpuFwdKernel::Launch.");
       memcpy_flag_ = true;
     }

     CHECK_CUDA_RET_WITH_EXCEPT(
+      kernel_node_,
       cudaMemsetAsync(output, static_cast<T>(0.0), output_size_, reinterpret_cast<cudaStream_t>(stream_ptr)),
       "cudaMemSet failed in ScatterNdGpuFwdKernel::Launch.");

@@ -83,6 +86,7 @@ class ScatterNdGpuFwdKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     memcpy_flag_ = false;
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 2) {

@@ -41,13 +41,15 @@ class ScatterUpdateKernel : public GpuKernel {
     T *updates = GetDeviceAddress<T>(inputs, 2);
     T *output = GetDeviceAddress<T>(outputs, 0);
     CalScatterUpdate(inner_size_, indices_size_, indices, updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemcpyAsync output failed");
     return true;
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 3) {
       MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterUpdate needs 3 inputs.";

@@ -41,7 +41,8 @@ class SplitGpuFwdKernel : public GpuKernel {
     for (size_t i = 0; i < outputs.size(); i++) {
       outputs_host_[i] = GetDeviceAddress<T>(outputs, i);
     }
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_,
                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "Split opt cudaMemcpyAsync outputs failed");
     SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device,

@@ -50,6 +51,7 @@ class SplitGpuFwdKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     axis_ = static_cast<int64_t>(GetAttr<int64_t>(kernel_node, "axis"));
     if (axis_ < 0) {
       auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);

@@ -40,10 +40,12 @@ class TransposeGpuFwdKernel : public GpuKernel {
     T *output = GetDeviceAddress<T>(outputs, 0);
     size_t *input_shape = GetDeviceAddress<size_t>(workspace, 0);
     size_t *input_axis = GetDeviceAddress<size_t>(workspace, 1);
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemcpyAsync input_shape failed");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_axis, &input_axis_[0], workspace_size_, cudaMemcpyHostToDevice,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(input_axis, &input_axis_[0], workspace_size_, cudaMemcpyHostToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemcpyAsync input_axis failed");
     size_t size = input_size_ / sizeof(T);

@@ -52,6 +54,7 @@ class TransposeGpuFwdKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 1) {
       MS_LOG(ERROR) << "Input number is " << input_num << ", but transpose needs 1 input.";

@@ -60,7 +60,7 @@ class UniqueGpuKernel : public GpuKernel {
   }

   void PostExecute() override {
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr_)),
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr_)),
                                "cudaStreamSynchronized failed");
     std::vector<TypeId> type_ids;
     std::vector<std::vector<size_t>> shapes;

@@ -41,7 +41,8 @@ class UnpackGpuFwdKernel : public GpuKernel {
     for (size_t i = 0; i < outputs.size(); i++) {
       outputs_host_[i] = GetDeviceAddress<T>(outputs, i);
     }
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_array,  // NOLINT
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(outputs_array,  // NOLINT
                                                outputs_host_.get(), sizeof(T *) * output_num_, cudaMemcpyHostToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "Unpack opt cudaMemcpyAsync outputs failed");

@@ -50,6 +51,7 @@ class UnpackGpuFwdKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     if (!CheckParam(kernel_node)) {
       return false;
     }

@@ -44,7 +44,8 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel {
     int *indices_addr = GetDeviceAddress<int>(inputs, 1);
     T *output_addr = GetDeviceAddress<T>(outputs, 0);

-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(output_addr, std::numeric_limits<T>::min(), outputs[0]->size,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemsetAsync(output_addr, std::numeric_limits<T>::min(), outputs[0]->size,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemSet Failed");
     CalUnsortedSegmentMax(input_addr, indices_addr, num_segments_, outer_size_, inner_size_, output_addr,

@@ -53,6 +54,7 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shapes);
     if (is_null_input_) {

@@ -44,7 +44,7 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
     T *output_addr = GetDeviceAddress<T>(outputs, 0);

     CHECK_CUDA_RET_WITH_EXCEPT(
-      cudaMemsetAsync(output_addr, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
+      kernel_node_, cudaMemsetAsync(output_addr, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
       "cudaMemSet Failed");
     UnsortedSegmentSum(input_dim0_, input_dim1_, output_dim0_, output_dim1_, input_addr, indices_addr, output_addr,
                        reinterpret_cast<cudaStream_t>(stream_ptr));

@@ -52,6 +52,7 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shapes);
     if (is_null_input_) {

@@ -34,10 +34,12 @@ class RecvGpuKernel : public GpuKernel {

   bool Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
               void *) override {
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamWaitEvent(wait_stream_, wait_event_, 0), "Waiting cuda event failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamWaitEvent(wait_stream_, wait_event_, 0),
+                               "Waiting cuda event failed.");
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     wait_stream_ = reinterpret_cast<cudaStream_t>(GetAttr<uintptr_t>(kernel_node, "wait_event_stream"));
     wait_event_ = reinterpret_cast<cudaEvent_t>(GetAttr<uintptr_t>(kernel_node, "wait_event"));
     InitSizeLists();

@@ -34,10 +34,12 @@ class SendGpuKernel : public GpuKernel {

   bool Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
               void *) override {
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaEventRecord(record_event_, record_stream_), "Recording cuda event failed.");
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaEventRecord(record_event_, record_stream_),
+                               "Recording cuda event failed.");
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     record_stream_ = reinterpret_cast<cudaStream_t>(GetAttr<uintptr_t>(kernel_node, "record_event_stream"));
     record_event_ = reinterpret_cast<cudaEvent_t>(GetAttr<uintptr_t>(kernel_node, "record_event"));
     InitSizeLists();

@@ -44,6 +44,7 @@ const std::vector<size_t> &DatasetIteratorKernel::GetOutputSizeList() const { re
 const std::vector<size_t> &DatasetIteratorKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }

 bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) {
+  kernel_node_ = kernel_node;
   queue_name_ = GetAttr<std::string>(kernel_node, "shared_name");
   std::vector<std::vector<int>> shapes;
   std::vector<std::vector<int64_t>> shapes_me = GetAttr<const std::vector<std::vector<int64_t>>>(kernel_node, "shapes");

@@ -143,13 +144,14 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v

   for (size_t i = 0; i < output_size_list_.size(); i++) {
     void *output_addr = GetDeviceAddress<void>(outputs, i);
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice,
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(output_addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream)),
                                "Cuda Memcpy Failed");
     addr = reinterpret_cast<unsigned char *>(addr) + output_size_list_[i];
   }

-  CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)),
+  CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)),
                              "cudaStreamSynchronize failed");
   (void)GpuBufferMgr::GetInstance().Pop(handle_);
   return true;

@@ -73,6 +73,7 @@ class GpuKernel : public KernelMod {
  protected:
   virtual void InitResource() {}
   virtual void InitSizeLists() = 0;
+  CNodePtr kernel_node_;

   template <typename T>
   inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {

@@ -201,7 +202,7 @@ class GpuKernel : public KernelMod {

   // set the tensor descriptor for cudnn/cublas
   void CudnnSetTensorNdDescriptor(const std::vector<size_t> &shape, cudnnTensorDescriptor_t descriptor,
-                                  cudnnDataType_t data_type) {
+                                  cudnnDataType_t data_type, const CNodePtr &node) {
     if (shape.size() < 3) {
       MS_EXCEPTION(ValueError) << "cudnnSetTensorNdDescriptor don't support" << shape.size() << "D.";
     }

@@ -224,7 +225,7 @@ class GpuKernel : public KernelMod {
       stride[i] = stride[i + 1] * SizeToInt(shape[i + 1]);
     }

-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim, stride),
+    CHECK_CUDNN_RET_WITH_EXCEPT(node, cudnnSetTensorNdDescriptor(descriptor, data_type, nbDims, dim, stride),
                                 "cudnnSetTensorNdDescriptor failed");

     delete[] dim;

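The three hunks above are the root of the change: `GpuKernel` gains the `CNodePtr kernel_node_;` member that every derived kernel's `Init` now fills in, and the shared `CudnnSetTensorNdDescriptor` helper takes the node as an extra parameter so that its internal `CHECK_CUDNN_RET_WITH_EXCEPT` can report it. Every caller of the helper in the kernels below is updated to pass its stored node through, e.g.

  CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_, kernel_node_);
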
@@ -69,19 +69,22 @@ class AddNGpuFwdKernel : public GpuKernel {
         ElewiseArith(outputs[0]->size / sizeof(T), BROADCAST_TYPE_ADD, input_addr, work_addr, work_addr,
                      reinterpret_cast<cudaStream_t>(stream_ptr));
       } else {
-        CHECK_CUDNN_RET_WITH_EXCEPT(cudnnAddTensor(cudnn_handle_, &alpha, input_descriptor_, input_addr,
+        CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                    cudnnAddTensor(cudnn_handle_, &alpha, input_descriptor_, input_addr,
                                                    &(i > 0 ? alpha : beta), input_descriptor_, work_addr),
                                     "cudnnAddTensor failed");
       }
     }
     if (work_addr != output_addr) {
-      CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, work_addr, outputs[0]->size, cudaMemcpyDeviceToDevice,
+      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                                 cudaMemcpyAsync(output_addr, work_addr, outputs[0]->size, cudaMemcpyDeviceToDevice,
                                                  reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "Addn cudaMemcpyAsync outputs failed");
     }
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     InitResource();
     cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@@ -111,11 +114,13 @@ class AddNGpuFwdKernel : public GpuKernel {
     }
     auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
     if (input_format == kOpFormat_NHWC) {
-      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
+      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                  cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
                                                                SizeToInt(input_shape.size()), dimA),
                                   "cudnnSetTensorNdDescriptor failed");
     } else {
-      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
+      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                  cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
                                                                SizeToInt(input_shape.size()), dimA),
                                   "cudnnSetTensorNdDescriptor failed");
     }

@@ -124,17 +129,19 @@ class AddNGpuFwdKernel : public GpuKernel {
   }

   void DestroyResource() noexcept override {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_descriptor_),
+                               "cudnnDestroyTensorDescriptor failed");
   }

  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_descriptor_), "cudnnCreateTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_descriptor_),
+                                "cudnnCreateTensorDescriptor failed");
   }
   void InitSizeLists() override {
     if (!is_null_input_) {
-      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(input_descriptor_, &input_size_),
+      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(input_descriptor_, &input_size_),
                                   "cudnnGetTensorSizeInBytes failed");
     }
     for (size_t i = 0; i < num_input_; i++) {

@@ -57,7 +57,8 @@ class BiasAddGpuKernel : public GpuKernel {
     try {
       const float alpha = 1;
       const float beta = 0;
-      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnOpTensor(cudnn_handle_, op_desc_, &alpha, x_desc_, x_addr, &alpha, b_desc_,
+      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                  cudnnOpTensor(cudnn_handle_, op_desc_, &alpha, x_desc_, x_addr, &alpha, b_desc_,
                                                 b_addr, &beta, x_desc_, output_addr),
                                   "cudnnOpTensor failed");
     } catch (const std::exception &e) {

@@ -66,6 +67,7 @@ class BiasAddGpuKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     InitResource();
     cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
     auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

@@ -99,12 +101,15 @@ class BiasAddGpuKernel : public GpuKernel {
     auto input_device_format = AnfAlgo::GetInputFormat(kernel_node, 0);
     auto cudnn_cal_format = (input_device_format == kOpFormat_NHWC) ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW;
     CHECK_CUDNN_RET_WITH_EXCEPT(
+      kernel_node_,
       cudnnSetTensorNdDescriptorEx(x_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), x_dims.get()),
       "cudnnSetTensorNdDescriptor failed");
     CHECK_CUDNN_RET_WITH_EXCEPT(
+      kernel_node_,
       cudnnSetTensorNdDescriptorEx(b_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), b_dims.get()),
       "cudnnSetTensorNdDescriptor failed");
     CHECK_CUDNN_RET_WITH_EXCEPT(
+      kernel_node_,
       cudnnSetOpTensorDescriptor(op_desc_, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN),
       "cudnnSetOpTensorDescriptor failed");

@@ -113,22 +118,30 @@ class BiasAddGpuKernel : public GpuKernel {
   }

   void DestroyResource() noexcept override {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyOpTensorDescriptor(op_desc_),
+                               "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(b_desc_),
+                               "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_),
+                               "cudnnDestroyOpTensorDescriptor failed");
   }

  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "cudnnCreateTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&b_desc_), "cudnnCreateTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateOpTensorDescriptor(&op_desc_), "cudnnCreateOpTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_),
+                                "cudnnCreateTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&b_desc_),
+                                "cudnnCreateTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateOpTensorDescriptor(&op_desc_),
+                                "cudnnCreateOpTensorDescriptor failed");
   }
   void InitSizeLists() override {
     size_t x_size, b_size;
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &x_size), "cudnnGetTensorSizeInBytes failed.");
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(b_desc_, &b_size), "cudnnGetTensorSizeInBytes failed.");
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &x_size),
+                                "cudnnGetTensorSizeInBytes failed.");
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(b_desc_, &b_size),
+                                "cudnnGetTensorSizeInBytes failed.");
     input_size_list_.push_back(x_size);
     input_size_list_.push_back(b_size);
     output_size_list_.push_back(x_size);

@@ -45,9 +45,11 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
     T *dx1 = GetDeviceAddress<T>(outputs, 0);
     T *dx2 = GetDeviceAddress<T>(outputs, 1);

-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(dx1, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemsetAsync(dx1, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemSet Failed");
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(dx2, 0, outputs[1]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemsetAsync(dx2, 0, outputs[1]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemSet Failed");
     if (need_broadcast_) {
       BroadcastGrad(x1_shape_[0], x1_shape_[1], x1_shape_[2], x1_shape_[3], x2_shape_[0], x2_shape_[1], x2_shape_[2],

@@ -61,6 +63,7 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     GetOpType(kernel_node);
     auto shape1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
     auto shape2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);

@@ -48,19 +48,22 @@ class CastAllGpuFwdKernel : public GpuKernel {
     S **outputs_dev = GetDeviceAddress<S *>(workspace, 1);
     size_t *size_dev = GetDeviceAddress<size_t>(workspace, 2);
     CHECK_CUDA_RET_WITH_EXCEPT(
+      kernel_node_,
       cudaMemcpyAsync(inputs_dev, in_addr.get(), sizeof(T *) * num_input_, cudaMemcpyHostToDevice, stream),
       "cudaMemCPY failed")
     CHECK_CUDA_RET_WITH_EXCEPT(
+      kernel_node_,
       cudaMemcpyAsync(outputs_dev, out_addr.get(), sizeof(S *) * num_input_, cudaMemcpyHostToDevice, stream),
       "cudaMemCPY failed")
     CHECK_CUDA_RET_WITH_EXCEPT(
-      cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, cudaMemcpyHostToDevice, stream),
+      kernel_node_, cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, cudaMemcpyHostToDevice, stream),
       "cudaMemCPY failed")
     CastAllKernel(inputs_dev, outputs_dev, max_, num_input_, size_dev, stream);
     return true;
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     num_input_ = GetAttr<size_t>(kernel_node, "n");
     size_ = std::make_unique<size_t[]>(num_input_);
     for (size_t i = 0; i < num_input_; i++) {

@@ -54,18 +54,21 @@ class CholeskyGpuKernel : public GpuKernel {
       h_array[i] = input1_addr + i * lda_ * m_;
       h_identity[i] = output_addr + i * ldb_ * m_;
       CHECK_CUDA_RET_WITH_ERROR(
+        kernel_node_,
        cudaMemcpyAsync(output_addr + i * ldb_ * m_, h_identity_data.data(), sizeof(T) * ldb_ * m_,
                        cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
        "cuda memcopy Fail");
     }
-    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cuda memcopy Fail");
-    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cuda memcopy Fail");
     CHECK_CUSOLVER_RET_WITH_EXCEPT(
-      cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
+      kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
       "cusolver cholesky batched Fail");
     TriangleMatrixCopy(input1_addr, output_addr, uplo, outputs[0]->size / sizeof(T), ldb_, m_,
                        reinterpret_cast<cudaStream_t>(stream_ptr));

@@ -79,14 +82,16 @@ class CholeskyGpuKernel : public GpuKernel {
     Identity(batch_ * split_dim * split_dim, split_dim, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
     MatrixSplit(batch_ * split_dim * split_dim, split_dim, width, input1_addr, d_batch_input_addr,
                 reinterpret_cast<cudaStream_t>(stream_ptr));
-    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cuda memcopy Fail");
-    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cuda memcopy Fail");
     CHECK_CUSOLVER_RET_WITH_EXCEPT(
-      cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
+      kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
       "cusolver cholesky batched Fail");
     TriangleMatrixCopy(d_batch_input_addr, output_addr, uplo, outputs[0]->size / sizeof(T), ldb_, m_,
                        reinterpret_cast<cudaStream_t>(stream_ptr));

@@ -95,6 +100,7 @@ class CholeskyGpuKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
     blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
     auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

@@ -51,6 +51,7 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
     blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
     auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

@@ -113,21 +114,25 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
       h_array[i] = input1_addr + i * lda_ * m_;
       h_identity[i] = output_addr + i * ldb_ * m_;
       CHECK_CUDA_RET_WITH_ERROR(
+        kernel_node_,
        cudaMemcpyAsync(output_addr + i * ldb_ * m_, h_identity_data.data(), sizeof(T) * ldb_ * m_,
                        cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
        "cuda memcopy Fail");
     }
-    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cuda memcopy Fail");
-    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cuda memcopy Fail");
     CHECK_CUSOLVER_RET_WITH_EXCEPT(
-      cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
+      kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
       "cusolver cholesky batched Fail");
     float alpha = 1;
     CHECK_CUBLAS_RET_WITH_EXCEPT(
+      kernel_node_,
       cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha,
                          d_array_addr, lda_, d_identity_addr, ldb_, batch_),
       "cublas trsm batched Fail");

@@ -147,17 +152,20 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
     Identity(batch_ * split_dim * split_dim, split_dim, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
     MatrixSplit(batch_ * split_dim * split_dim, split_dim, width, input1_addr, d_batch_input_addr,
                 reinterpret_cast<cudaStream_t>(stream_ptr));
-    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cuda memcopy Fail");
-    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
+    CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
+                              cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cuda memcopy Fail");
     CHECK_CUSOLVER_RET_WITH_EXCEPT(
-      cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
+      kernel_node_, cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
       "cusolver cholesky batched Fail");
     float alpha = 1;
     CHECK_CUBLAS_RET_WITH_EXCEPT(
+      kernel_node_,
       cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha,
                          d_array_addr, lda_, d_identity_addr, ldb_, batch_),
       "cublas trsm batched Fail");

@@ -71,6 +71,7 @@ class MatMulGpuKernel : public GpuKernel {

     try {
       CHECK_CUBLAS_RET_WITH_EXCEPT(
+        kernel_node_,
         cublasGemmStridedBatchedEx(handle_, transpose_x2_, transpose_x1_, SizeToInt(n_), SizeToInt(m_), SizeToInt(k_),
                                    &alpha, input2_addr, dtype_b_, ldb, stride_b, input1_addr, dtype_a_, lda, stride_a,
                                    &beta, output_addr, dtype_c_, ldc, stride_c, batch_, CUDA_R_32F, algo_),

@@ -81,6 +82,7 @@ class MatMulGpuKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
     dtype_a_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
     dtype_b_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1)));

@@ -85,6 +85,7 @@ class UpdateThorGradientGpuKernel : public GpuKernel {

     try {
       CHECK_CUBLAS_RET_WITH_EXCEPT(
+        kernel_node_,
         cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_N, CUBLAS_OP_N, SizeToInt(gradient_size.ori_w),
                                    SizeToInt(gradient_size.h), SizeToInt(gradient_size.h), &alpha, input2_addr,
                                    gradient_size.dtype, ldb, stride_b, input1_addr, gradient_size.dtype, lda, stride_a,

@@ -116,6 +117,7 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
       r_output_addr = workspace3_addr;
     }
     CHECK_CUBLAS_RET_WITH_EXCEPT(
+      kernel_node_,
       cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_N, CUBLAS_OP_N, SizeToInt(gradient_size.w),
                                  SizeToInt(gradient_size.h), SizeToInt(gradient_size.w), &alpha, input3_addr,
                                  gradient_size.dtype, ldb_r, stride_b, r_input_addr, gradient_size.dtype, lda_r,

@@ -138,6 +140,7 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
     SetProperty(kernel_node);
     InitSizeLists();

@@ -77,6 +77,7 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
   }

   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     nccl_data_type_ = nccl_dtype(AnfAlgo::GetInputDeviceDataType(kernel_node, 0));
     InferCommType(kernel_node);

@@ -140,7 +141,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
     cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
     auto all_reduce_funcptr = reinterpret_cast<AllReduce>(dlsym(const_cast<void *>(collective_handle_), "AllReduce"));
     MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
-    CHECK_NCCL_RET_WITH_EXCEPT((*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_,
+    CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_,
+                               (*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_,
                                                      nccl_reduce_type_, stream, group_name_),
                                "ncclAllReduce failed");
   }

@@ -153,6 +155,7 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
     auto all_gather_funcptr = reinterpret_cast<AllGather>(dlsym(const_cast<void *>(collective_handle_), "AllGather"));
     MS_EXCEPTION_IF_NULL(all_gather_funcptr);
     CHECK_NCCL_RET_WITH_EXCEPT(
+      kernel_node_,
       (*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, group_name_),
       "ncclAllGather failed");
   }

@@ -165,7 +168,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
     auto reduce_scatter_funcptr =
       reinterpret_cast<ReduceScatter>(dlsym(const_cast<void *>(collective_handle_), "ReduceScatter"));
     MS_EXCEPTION_IF_NULL(reduce_scatter_funcptr);
-    CHECK_NCCL_RET_WITH_EXCEPT((*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
+    CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_,
+                               (*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
                                                          nccl_data_type_, nccl_reduce_type_, stream, group_name_),
                                "ncclReduceScatter failed");
   }

@@ -180,7 +184,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
     for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) {
       input_addr = GetDeviceAddress<T>(inputs, i);
       output_addr = GetDeviceAddress<T>(outputs, i);
-      CHECK_NCCL_RET_WITH_EXCEPT((*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T),
+      CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_,
+                                 (*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T),
                                                       nccl_data_type_, root_, stream, group_name_),
                                  "ncclBroadcast failed");
     }

@@ -39,7 +39,8 @@ class NcclRecvGpuKernel : public NcclGpuKernel {
     T *output_addr = GetDeviceAddress<T>(outputs, 0);
     auto nccl_recv_func = reinterpret_cast<Recv>(dlsym(const_cast<void *>(collective_handle_), "Recv"));
     MS_EXCEPTION_IF_NULL(nccl_recv_func);
-    CHECK_NCCL_RET_WITH_EXCEPT((*nccl_recv_func)(output_addr, output_size_list_[0] / sizeof(T), nccl_data_type_,
+    CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_,
+                               (*nccl_recv_func)(output_addr, output_size_list_[0] / sizeof(T), nccl_data_type_,
                                                  src_rank_, reinterpret_cast<cudaStream_t>(stream_ptr), group_name_),
                                "ncclRecv failed");
     return true;

@@ -47,6 +48,7 @@ class NcclRecvGpuKernel : public NcclGpuKernel {

   bool Init(const CNodePtr &kernel_node) override {
     MS_EXCEPTION_IF_NULL(kernel_node);
+    kernel_node_ = kernel_node;
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 0) {
       MS_LOG(ERROR) << "Input number is " << input_num << ", but NCCL receive needs 0 input.";

@@ -39,7 +39,8 @@ class NcclSendGpuKernel : public NcclGpuKernel {
     T *input_addr = GetDeviceAddress<T>(inputs, 0);
     auto nccl_send_func = reinterpret_cast<Send>(dlsym(const_cast<void *>(collective_handle_), "Send"));
     MS_EXCEPTION_IF_NULL(nccl_send_func);
-    CHECK_NCCL_RET_WITH_EXCEPT((*nccl_send_func)(input_addr, input_size_list_[0] / sizeof(T), nccl_data_type_,
+    CHECK_NCCL_RET_WITH_EXCEPT(kernel_node_,
+                               (*nccl_send_func)(input_addr, input_size_list_[0] / sizeof(T), nccl_data_type_,
                                                  dest_rank_, reinterpret_cast<cudaStream_t>(stream_ptr), group_name_),
                                "ncclSend failed");
     return true;

@@ -47,6 +48,7 @@ class NcclSendGpuKernel : public NcclGpuKernel {

   bool Init(const CNodePtr &kernel_node) override {
     MS_EXCEPTION_IF_NULL(kernel_node);
+    kernel_node_ = kernel_node;
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 1) {
       MS_LOG(ERROR) << "Input number is " << input_num << ", but NCCL send needs 1 input.";

@@ -50,7 +50,8 @@ class ActivationGpuFwdKernel : public GpuKernel {
     } else {
       const float alpha = 1;
       const float beta = 0;
-      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnActivationForward(cudnn_handle_, activation_desc_, &alpha, data_descriptor_,
+      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                  cudnnActivationForward(cudnn_handle_, activation_desc_, &alpha, data_descriptor_,
                                                          input, &beta, data_descriptor_, output),
                                   "cudnnActivationForward failed");
     }

@@ -58,6 +59,7 @@ class ActivationGpuFwdKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     auto node_name = AnfAlgo::GetCNodeName(kernel_node);
     auto iter = kernel_map.find(node_name);
     if (iter == kernel_map.end()) {

@@ -85,7 +87,8 @@ class ActivationGpuFwdKernel : public GpuKernel {
       float alpha = GetAttr<float>(kernel_node, "alpha");
       coef = static_cast<double>(alpha);
     }
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_NOT_PROPAGATE_NAN, coef),
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_NOT_PROPAGATE_NAN, coef),
                                 "cudnnSetActivationDescriptor failed");

     const int split_dim = 4;

@@ -93,17 +96,19 @@ class ActivationGpuFwdKernel : public GpuKernel {
       ShapeNdTo4d(input_shape, &shape);
       if (AnfAlgo::GetInputFormat(kernel_node, 0) == kOpFormat_NHWC) {
         CHECK_CUDNN_RET_WITH_EXCEPT(
+          kernel_node_,
           cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_, SizeToInt(shape[0]),
                                      SizeToInt(shape[3]), SizeToInt(shape[1]), SizeToInt(shape[2])),
           "cudnnSetTensor4dDescriptor failed");
       } else {
         CHECK_CUDNN_RET_WITH_EXCEPT(
+          kernel_node_,
           cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(shape[0]),
                                      SizeToInt(shape[1]), SizeToInt(shape[2]), SizeToInt(shape[3])),
           "cudnnSetTensor4dDescriptor failed");
       }
     } else {
-      CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_);
+      CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_, kernel_node_);
     }

     InitSizeLists();

@@ -111,9 +116,10 @@ class ActivationGpuFwdKernel : public GpuKernel {
   }

   void DestroyResource() noexcept override {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyActivationDescriptor(activation_desc_),
                                "cudnnDestroyActivationDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(data_descriptor_),
+                               "cudnnDestroyTensorDescriptor failed");
   }

   void ResetResource() noexcept override {

@@ -134,14 +140,15 @@ class ActivationGpuFwdKernel : public GpuKernel {
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&data_descriptor_), "cudnnCreateTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateActivationDescriptor(&activation_desc_),
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&data_descriptor_),
+                                "cudnnCreateTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateActivationDescriptor(&activation_desc_),
                                 "cudnnCreateActivationDescriptor failed");
   }

   void InitSizeLists() override {
     if (!is_null_input_) {
-      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(data_descriptor_, &input_size_),
+      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(data_descriptor_, &input_size_),
                                   "cudnnGetTensorSizeInBytes failed");
       output_size_ = input_size_;
     }

@@ -54,6 +54,7 @@ class ActivationGradGpuKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnActivationBackward(cudnn_handle_, activation_desc_, &alpha, data_descriptor_, y, data_descriptor_, dy,
data_descriptor_, y, &beta, data_descriptor_, dx),
"cudnnActivationBackward failed");

@@ -61,6 +62,7 @@ class ActivationGradGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
auto node_name = AnfAlgo::GetCNodeName(kernel_node);
auto iter = kernel_map.find(node_name);
if (iter == kernel_map.end()) {

@@ -85,7 +87,8 @@ class ActivationGradGpuKernel : public GpuKernel {
std::vector<size_t> shape;
double coef = (mode_ == CUDNN_ACTIVATION_CLIPPED_RELU) ? 5.999999 : 0.0;
if (mode_ == CUDNN_ACTIVATION_ELU) coef = 1.0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_PROPAGATE_NAN, coef),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetActivationDescriptor(activation_desc_, mode_, CUDNN_PROPAGATE_NAN, coef),
"SetActivationDescriptor failed");

const int split_dim = 4;

@@ -93,17 +96,19 @@ class ActivationGradGpuKernel : public GpuKernel {
ShapeNdTo4d(input_shape, &shape);
if (AnfAlgo::GetInputFormat(kernel_node, 0) == kOpFormat_NHWC) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_, SizeToInt(shape[0]),
SizeToInt(shape[3]), SizeToInt(shape[1]), SizeToInt(shape[2])),
"cudnnSetTensor4dDescriptor failed");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(data_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(shape[0]),
SizeToInt(shape[1]), SizeToInt(shape[2]), SizeToInt(shape[3])),
"cudnnSetTensor4dDescriptor failed");
}
} else {
CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_);
CudnnSetTensorNdDescriptor(input_shape, data_descriptor_, cudnn_data_type_, kernel_node_);
}

InitSizeLists();

@@ -111,9 +116,10 @@ class ActivationGradGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyActivationDescriptor(activation_desc_),
"cudnnDestroyActivationDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(data_descriptor_),
"cudnnDestroyTensorDescriptor failed");
}

void ResetResource() noexcept override {

@@ -132,13 +138,14 @@ class ActivationGradGpuKernel : public GpuKernel {
protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&data_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateActivationDescriptor(&activation_desc_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&data_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateActivationDescriptor(&activation_desc_),
"cudnnCreateActivationDescriptor failed");
}
void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(data_descriptor_, &input_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(data_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);
@@ -71,6 +71,7 @@ class BatchNormGradGpuKernel : public GpuKernel {
const float alpha_param_diff = 1;
const float beta_param_diff = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnBatchNormalizationBackward(handle_, mode_, &alpha_data_diff, &beta_data_diff, &alpha_param_diff,
&beta_param_diff, x_desc_, x, dy_desc_, dy, dx_desc_, dx, scale_bias_desc_, scale,
bn_scale, bn_bias, epsilon_, save_mean, save_variance),

@@ -78,6 +79,7 @@ class BatchNormGradGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@@ -105,15 +107,19 @@ class BatchNormGradGpuKernel : public GpuKernel {
epsilon_ = GetAttr<float>(kernel_node, "epsilon");

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_),
"Set x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(dy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_),
"Set dy desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(dx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_),
"Set dx desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(scale_bias_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel_, 1, 1),
"Set para desc failed");

@@ -122,27 +128,31 @@ class BatchNormGradGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_desc_),
"Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
}

protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_desc_), "Create para desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_desc_),
"Create para desc failed");
}

void InitSizeLists() override {
size_t input_size = 0;
size_t para_size = 0;
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &input_size), "Get input size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_desc_, &para_size), "Get input size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &input_size),
"Get input size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_desc_, &para_size),
"Get input size failed");
}

input_size_list_.push_back(input_size);
@@ -54,11 +54,13 @@ class BiasAddGradGpuKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;
if (same_dims_) {
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(db_addr, dy_addr, output_size_list_[0], cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(db_addr, dy_addr, output_size_list_[0], cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed.");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnReduceTensor(cudnn_handle_, op_desc_, indices_addr, workspace_size_list_[0], workspace_addr,
workspace_size_list_[1], &alpha, dy_desc_, dy_addr, &beta, db_desc_, db_addr),
"cudnnReduceTensor failed");

@@ -67,6 +69,7 @@ class BiasAddGradGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

@@ -97,12 +100,15 @@ class BiasAddGradGpuKernel : public GpuKernel {
auto input_device_format = AnfAlgo::GetInputFormat(kernel_node, 0);
auto cudnn_cal_format = (input_device_format == kOpFormat_NHWC) ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensorNdDescriptorEx(dy_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), dy_dims.get()),
"cudnnSetTensorNdDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensorNdDescriptorEx(db_desc_, cudnn_cal_format, cudnn_data_type_, SizeToInt(cudnn_dims), db_dims.get()),
"cudnnSetTensorNdDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetReduceTensorDescriptor(op_desc_, CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES),
"cudnnSetReduceTensorDescriptor failed");

@@ -112,32 +118,39 @@ class BiasAddGradGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnDestroyReduceTensorDescriptor(op_desc_),
"cudnnDestroyReduceTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(db_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_),
"cudnnDestroyOpTensorDescriptor failed");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&db_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&op_desc_), "cudnnCreateOpTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&db_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&op_desc_),
"cudnnCreateOpTensorDescriptor failed");
}
void InitSizeLists() override {
size_t dy_size, db_size;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dy_desc_, &dy_size), "cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(db_desc_, &db_size), "cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dy_desc_, &dy_size),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(db_desc_, &db_size),
"cudnnGetTensorSizeInBytes failed");
input_size_list_.push_back(dy_size);
output_size_list_.push_back(db_size);

size_t indices_size, workspace_size;
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetReductionIndicesSize(cudnn_handle_, op_desc_, dy_desc_, db_desc_, &indices_size),
kernel_node_, cudnnGetReductionIndicesSize(cudnn_handle_, op_desc_, dy_desc_, db_desc_, &indices_size),
"cudnnGetReductionIndicesSize failed")
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetReductionWorkspaceSize(cudnn_handle_, op_desc_, dy_desc_, db_desc_, &workspace_size),
kernel_node_, cudnnGetReductionWorkspaceSize(cudnn_handle_, op_desc_, dy_desc_, db_desc_, &workspace_size),
"cudnnGetReductionWorkspaceSize failed")
workspace_size_list_.push_back(indices_size);
workspace_size_list_.push_back(workspace_size);
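The BiasAddGrad hunks above size two scratch buffers for cudnnReduceTensor: an indices buffer (empty in practice, since the reduce descriptor is configured with CUDNN_REDUCE_TENSOR_NO_INDICES) and the reduction workspace proper. A condensed C++ sketch of that pair of queries, with QueryReduceSizes as an illustrative free function rather than MindSpore code:

#include <cudnn.h>

cudnnStatus_t QueryReduceSizes(cudnnHandle_t handle, cudnnReduceTensorDescriptor_t op_desc,
                               cudnnTensorDescriptor_t in_desc, cudnnTensorDescriptor_t out_desc,
                               size_t *indices_size, size_t *workspace_size) {
  // With CUDNN_REDUCE_TENSOR_NO_INDICES the indices buffer comes back as 0
  // bytes, but querying anyway keeps the call sequence uniform for all reductions.
  cudnnStatus_t ret = cudnnGetReductionIndicesSize(handle, op_desc, in_desc, out_desc, indices_size);
  if (ret != CUDNN_STATUS_SUCCESS) {
    return ret;
  }
  return cudnnGetReductionWorkspaceSize(handle, op_desc, in_desc, out_desc, workspace_size);
}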
@@ -88,11 +88,13 @@ class Conv2dGpuFwdKernel : public GpuKernel {
reinterpret_cast<cudaStream_t>(stream_ptr));
}
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnConvolutionForward(cudnn_handle_, &alpha, padded_desc_, padded_addr, filter_desc_, filter_addr, conv_desc_,
conv_algorithm_, workspace_addr, workspace_size_, &beta, output_desc_, output_addr),
"cudnnConvolutionForward failed");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnConvolutionForward(cudnn_handle_, &alpha, input_desc_, input_addr, filter_desc_, filter_addr, conv_desc_,
conv_algorithm_, workspace_addr, workspace_size_, &beta, output_desc_, output_addr),
"cudnnConvolutionForward failed");

@@ -101,6 +103,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
if (!CheckParam(kernel_node)) {
return false;

@@ -126,7 +129,8 @@ class Conv2dGpuFwdKernel : public GpuKernel {
}
Set4DDesc(in_shape, filter_shape, output_shape);
group_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "group"));
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionGroupCount(conv_desc_, group_), "cudnnSetConvGroupCount failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionGroupCount(conv_desc_, group_),
"cudnnSetConvGroupCount failed");
std::vector<int> pad_list;
std::vector<int64_t> pad_list_me = GetAttr<std::vector<int64_t>>(kernel_node, "pad_list");
(void)std::transform(pad_list_me.begin(), pad_list_me.end(), std::back_inserter(pad_list),

@@ -158,11 +162,13 @@ class Conv2dGpuFwdKernel : public GpuKernel {
SetDimA(padded_shape, dimA, 4, data_format_);
SetStrideA(padded_shape, strideApadded, 4, data_format_);
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(padded_desc_, cudnn_data_type_, 4, dimA, strideApadded),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(padded_desc_, cudnn_data_type_, 4, dimA, strideApadded),
"cudnnSetTensor4dDescriptor failed");
padA[0] = 0;
padA[1] = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolutionNdDescriptor failed");
input_descriptor_real = padded_desc_;

@@ -174,12 +180,13 @@ class Conv2dGpuFwdKernel : public GpuKernel {
padA[0] = pad_height_;
padA[1] = pad_width_;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
input_descriptor_real = input_desc_;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
"cudnnSetConvolutionMathType failed.")
}
SelectAlgorithm(input_descriptor_real);

@@ -188,34 +195,46 @@ class Conv2dGpuFwdKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyConvolutionDescriptor(conv_desc_),
"cudnnDestroyConvolutionDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(filter_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(padded_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(output_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_desc_),
"cudnnDestroyTensorDescriptor failed");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&output_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&filter_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateConvolutionDescriptor(&conv_desc_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&output_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&padded_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&filter_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateConvolutionDescriptor(&conv_desc_),
"cudnnCreateConvolutionDescriptor failed");
}

void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(input_desc_, reinterpret_cast<size_t *>(&input_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(input_desc_, reinterpret_cast<size_t *>(&input_size_)),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetFilterSizeInBytes(filter_desc_, reinterpret_cast<size_t *>(&filter_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetFilterSizeInBytes(filter_desc_, reinterpret_cast<size_t *>(&filter_size_)),
"cudnnGetFilterSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(output_desc_, reinterpret_cast<size_t *>(&output_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(output_desc_, reinterpret_cast<size_t *>(&output_size_)),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_desc_, reinterpret_cast<size_t *>(&padded_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(padded_desc_, reinterpret_cast<size_t *>(&padded_size_)),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);

@@ -223,6 +242,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
output_size_list_.push_back(output_size_);
if (use_pad_ && !is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle_, padded_desc_, filter_desc_, conv_desc_, output_desc_,
conv_algorithm_, &workspace_size_),
"cudnnGetConvolutionForwardWorkspaceSize failed");

@@ -230,6 +250,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
} else {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle_, input_desc_, filter_desc_, conv_desc_, output_desc_,
conv_algorithm_, &workspace_size_),
"cudnnGetConvolutionForwardWorkspaceSize failed");

@@ -269,18 +290,21 @@ class Conv2dGpuFwdKernel : public GpuKernel {
int filterDimA[4];
// OHWI for NHWC; OIHW for NCHW
SetDimA(filter_shape, filterDimA, 4, data_format_);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(input_desc_, cudnn_data_type_, nbDims, dimA, strideAin),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(input_desc_, cudnn_data_type_, nbDims, dimA, strideAin),
"cudnnSetTensor4dDescriptor failed");

CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetFilterNdDescriptor(filter_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA),
kernel_node_, cudnnSetFilterNdDescriptor(filter_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA),
"cudnnSetFilter4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(output_desc_, cudnn_data_type_, nbDims, dimAout, strideAout),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(output_desc_, cudnn_data_type_, nbDims, dimAout, strideAout),
"cudnnSetTensor4dDescriptor failed");
}
void SelectAlgorithm(cudnnTensorDescriptor_t input_descriptor_real) {
if (group_ > 1 || CUDNN_MAJOR < 7) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetConvolutionForwardAlgorithm(
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetConvolutionForwardAlgorithm(
cudnn_handle_, input_descriptor_real, filter_desc_, conv_desc_, output_desc_,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 0, &conv_algorithm_),
"cudnnGetConvolutionForwardAlgorithm failed");

@@ -289,6 +313,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
int returned_algo_count;
cudnnConvolutionFwdAlgoPerf_t perf_results;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionForwardAlgorithm_v7(cudnn_handle_, input_descriptor_real, filter_desc_, conv_desc_,
output_desc_, requested_algo_count, &returned_algo_count, &perf_results),
"cudnnGetConvolutionForwardAlgorithm_v7 failed");
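For context on SelectAlgorithm above: with cuDNN 7+ and an ungrouped convolution the kernel asks the _v7 heuristics for a ranked list of algorithms and takes the top entry; grouped convolutions and older cuDNN fall back to the legacy workspace-limited query. A condensed C++ sketch of that selection logic, with PickFwdAlgo as a hypothetical free function (and assuming cuDNN 7.x, since the legacy query was removed in cuDNN 8):

#include <cudnn.h>

cudnnConvolutionFwdAlgo_t PickFwdAlgo(cudnnHandle_t handle, cudnnTensorDescriptor_t x_desc,
                                      cudnnFilterDescriptor_t w_desc,
                                      cudnnConvolutionDescriptor_t conv_desc,
                                      cudnnTensorDescriptor_t y_desc, int group) {
  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;  // safe default
#if CUDNN_MAJOR >= 7
  if (group <= 1) {
    // Ask the heuristics for a ranked list and take the best-performing entry.
    int returned_count = 0;
    cudnnConvolutionFwdAlgoPerf_t perf;
    if (cudnnGetConvolutionForwardAlgorithm_v7(handle, x_desc, w_desc, conv_desc, y_desc,
                                               /*requestedAlgoCount=*/1, &returned_count,
                                               &perf) == CUDNN_STATUS_SUCCESS &&
        returned_count > 0) {
      algo = perf.algo;
    }
    return algo;
  }
#endif
  // Grouped convolution or pre-7 cuDNN: legacy query with no workspace limit.
  cudnnGetConvolutionForwardAlgorithm(handle, x_desc, w_desc, conv_desc, y_desc,
                                      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
                                      /*memoryLimitInBytes=*/0, &algo);
  return algo;
}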
@@ -90,18 +90,21 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
reinterpret_cast<cudaStream_t>(stream_ptr));
}
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnConvolutionBackwardFilter(cudnn_handle_, &alpha, padded_descriptor_, padded, dy_desc_, dy, conv_desc_,
algo_, work_space, workspace_size_, &beta, dw_desc_, dw),
"ConvolutionBackwardFilter failed");
return true;
}
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnConvolutionBackwardFilter(cudnn_handle_, &alpha, x_desc_, x, dy_desc_, dy, conv_desc_, algo_, work_space,
workspace_size_, &beta, dw_desc_, dw),
"ConvolutionBackwardFilter failed");
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
if (!CheckParam(kernel_node)) {
return false;

@@ -128,7 +131,8 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
SetNCHW(in_shape, &n_, &c_, &old_height_, &old_width_, data_format_);
Set4DDesc(dy_shape, filter_shape, in_shape);
group_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "group"));
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionGroupCount(conv_desc_, group_), "cudnnSetConvGroupCount failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionGroupCount(conv_desc_, group_),
"cudnnSetConvGroupCount failed");

std::vector<int> pad_list;
std::vector<int64_t> pad_list_me = GetAttr<std::vector<int64_t>>(kernel_node, "pad_list");

@@ -165,11 +169,12 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
SetStrideA(padded_shape, strideApadded, 4, data_format_);
}
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
kernel_node_, cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
"cudnnSetTensor4dDescriptor failed");
padA[0] = 0;
padA[1] = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolutionNdDescriptor failed");
x_desc_real = padded_descriptor_;

@@ -181,12 +186,13 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
padA[0] = pad_height_;
padA[1] = pad_width_;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
x_desc_real = x_desc_;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
"cudnnSetConvolutionMathType failed.")
}
SelectAlgorithm(x_desc_real);

@@ -195,31 +201,42 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyConvolutionDescriptor(conv_desc_),
"cudnnDestroyConvolutionDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(dw_desc_),
"cudnnDestroyFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(padded_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_),
"cudnnDestroyTensorDescriptor failed");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&dw_desc_), "cudnnCreateFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateConvolutionDescriptor(&conv_desc_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&padded_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&dw_desc_),
"cudnnCreateFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateConvolutionDescriptor(&conv_desc_),
"cudnnCreateConvolutionDescriptor failed");
}
void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dy_desc_, reinterpret_cast<size_t *>(&dy_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(dy_desc_, reinterpret_cast<size_t *>(&dy_size_)),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, reinterpret_cast<size_t *>(&input_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(x_desc_, reinterpret_cast<size_t *>(&input_size_)),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetFilterSizeInBytes(dw_desc_, reinterpret_cast<size_t *>(&output_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetFilterSizeInBytes(dw_desc_, reinterpret_cast<size_t *>(&output_size_)),
"cudnnGetFilterSizeInBytes failed");
}
input_size_list_.push_back(dy_size_);

@@ -228,9 +245,10 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {

if (use_pad_ && !is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetTensorSizeInBytes(padded_descriptor_, reinterpret_cast<size_t *>(&padded_size_)),
kernel_node_, cudnnGetTensorSizeInBytes(padded_descriptor_, reinterpret_cast<size_t *>(&padded_size_)),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle_, padded_descriptor_, dy_desc_, conv_desc_,
dw_desc_, algo_, reinterpret_cast<size_t *>(&workspace_size_)),
"cudnnGetConvolutionBackwardFilterWorkspaceSize failed");

@@ -238,6 +256,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
} else {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle_, x_desc_, dy_desc_, conv_desc_, dw_desc_, algo_,
reinterpret_cast<size_t *>(&workspace_size_)),
"cudnnGetConvolutionBackwardFilterWorkspaceSize failed");

@@ -263,6 +282,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
void SelectAlgorithm(cudnnTensorDescriptor_t x_desc_real) {
if (group_ > 1 || CUDNN_MAJOR < 7) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle_, x_desc_real, dy_desc_, conv_desc_, dw_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, 0, &algo_),
"GetConvolutionBackwardFilterAlgorithm failed");

@@ -271,6 +291,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
int returned_algo_count;
cudnnConvolutionBwdFilterAlgoPerf_t perf_results;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn_handle_, x_desc_real, dy_desc_, conv_desc_, dw_desc_,
requested_algo_count, &returned_algo_count, &perf_results),
"GetConvolutionBackwardFilterAlgorithm failed");

@@ -299,12 +320,14 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
// filter shape relued by format_attr_. In native mode it's OHWI. In transpose mode it's OIHW.
int filterDimA[4];
SetDimA(filter_shape, filterDimA, 4, format_attr_);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_desc_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(dy_desc_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
"cudnnSetTensorNdDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetFilterNdDescriptor(dw_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA),
kernel_node_, cudnnSetFilterNdDescriptor(dw_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA),
"cudnnSetFilterNdDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(x_desc_, cudnn_data_type_, nbDims, dimA, strideAin),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(x_desc_, cudnn_data_type_, nbDims, dimA, strideAin),
"cudnnSetTensorNdDescriptor failed");
}
void SetStrideAndDilation(const CNodePtr &kernel_node) {
@@ -81,6 +81,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
T *padded = GetDeviceAddress<T>(workspace, 1);

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnConvolutionBackwardData(cudnn_handle_, &alpha, w_desc_, w, dy_desc_, dy, conv_desc_, algo_, work_space,
workspace_size_, &beta_, padded_descriptor_, padded),
"ConvolutionBackwardData failed");

@@ -93,6 +94,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
}
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnConvolutionBackwardData(cudnn_handle_, &alpha, w_desc_, w, dy_desc_, dy, conv_desc_, algo_, work_space,
workspace_size_, &beta_, dx_desc_, dx),
"ConvolutionBackwardData failed");

@@ -100,6 +102,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
if (!CheckParam(kernel_node)) {
return false;

@@ -131,7 +134,8 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
Set4DDesc(dy_shape, input_shape, filter_shape);

group_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "group"));
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionGroupCount(conv_desc_, group_), "cudnnSetConvGroupCount failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionGroupCount(conv_desc_, group_),
"cudnnSetConvGroupCount failed");

std::vector<int> pad_list;
std::vector<int64_t> pad_list_me = GetAttr<std::vector<int64_t>>(kernel_node, "pad_list");

@@ -168,11 +172,12 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
SetStrideA(padded_shape, strideApadded, 4, data_format_);
}
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
kernel_node_, cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
"cudnnSetTensor4dDescriptor failed");
padA[0] = 0;
padA[1] = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolutionNdDescriptor failed");
dx_desc_real = padded_descriptor_;

@@ -184,12 +189,13 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
padA[0] = pad_height_;
padA[1] = pad_width_;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
dx_desc_real = dx_desc_;
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
"cudnnSetConvolutionMathType failed.")
}
SelectAlgorithm(dx_desc_real);

@@ -199,29 +205,39 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyConvolutionDescriptor(conv_desc_),
"cudnnDestroyConvolutionDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(w_desc_),
"cudnnDestroyFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(padded_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_),
"cudnnDestroyTensorDescriptor failed");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&w_desc_), "cudnnCreateFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateConvolutionDescriptor(&conv_desc_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&padded_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&w_desc_),
"cudnnCreateFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateConvolutionDescriptor(&conv_desc_),
"cudnnCreateConvolutionDescriptor failed");
}
void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dy_desc_, &dy_size_), "cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetFilterSizeInBytes(w_desc_, &w_size_), "cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dx_desc_, &output_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dy_desc_, &dy_size_),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetFilterSizeInBytes(w_desc_, &w_size_),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dx_desc_, &output_size_),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(dy_size_);

@@ -229,17 +245,19 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
output_size_list_.push_back(output_size_);

if (use_pad_ && !is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_),
"cudnnGetTensorSizeInBytes failed");

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, padded_descriptor_,
algo_, &workspace_size_),
"cudnnGetConvolutionBackwardDataWorkspaceSize failed");
workspace_size_list_.push_back(padded_size_);
} else {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetConvolutionBackwardDataWorkspaceSize(
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_, algo_, &workspace_size_),
"cudnnGetConvolutionBackwardDataWorkspaceSize failed");
}

@@ -270,6 +288,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
void SelectAlgorithm(cudnnTensorDescriptor_t dx_desc_real) {
if (group_ > 1 || CUDNN_MAJOR < 7) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_real,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, 0, &algo_),
"cudnnGetConvolutionBackwardDataAlgorithm failed");

@@ -278,6 +297,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
int returned_algo_count;
cudnnConvolutionBwdDataAlgoPerf_t perf_results;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_real,
requested_algo_count, &returned_algo_count, &perf_results),
"cudnnGetConvolutionBackwardDataAlgorithm_v7 failed");

@@ -306,12 +326,14 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
SetStrideA(dy_shape, strideAdy, 4, data_format_);
SetDimA(filter_shape, filterDimA, 4, data_format_);

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_desc_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(dy_desc_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
"cudnnSetTensorNdDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA),
kernel_node_, cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, compute_format_, nbDims, filterDimA),
"cudnnSetFilterNdDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dx_desc_, cudnn_data_type_, nbDims, dimA, strideAin),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(dx_desc_, cudnn_data_type_, nbDims, dimA, strideAin),
"cudnnSetTensorNdDescriptor failed");
}
void SetStrideAndDilation(const CNodePtr &kernel_node) {
@@ -50,6 +50,7 @@ class CtcLossGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (probs_shape.size() != 3) {

@@ -116,9 +117,9 @@ class CtcLossGpuKernel : public GpuKernel {
cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
CalculateMaxSequence(sequence_length, max_labels_length, batch, stream);
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemcpyAsync(&max_sequence, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
kernel_node_, cudaMemcpyAsync(&max_sequence, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
"cudaMemcpyAsync failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
if (max_time < max_sequence) {
MS_LOG(EXCEPTION) << "max_time should be greater than sequence length.";
}

@@ -128,9 +129,9 @@ class CtcLossGpuKernel : public GpuKernel {
CalculatePreLength(label_squence_length, precum_labels_length, cum_labels_length, max_labels_length, label_indices,
batch, label_size_ / sizeof(int), stream);
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemcpyAsync(&batch_label, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
kernel_node_, cudaMemcpyAsync(&batch_label, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
"cudaMemcpyAsync failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
if (batch != batch_label + 1) {
MS_LOG(EXCEPTION) << "label batch should be equal to input batch.";
}

@@ -141,9 +142,10 @@ class CtcLossGpuKernel : public GpuKernel {
batch, stream);
}
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemcpyAsync(&max_labels_length_host, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
"cudaMemcpyAsync failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
}

void LaunchSecondHalf(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,

@@ -175,7 +177,7 @@ class CtcLossGpuKernel : public GpuKernel {
CTCLoss(log_alpha_b, log_beta_b, softmax_probs, label_value_with_blank, batch, SOffSet, max_time, numclass,
sequence_length, label_squence_length, cum_labels_length, costs, grads, prob_num,
ignore_longer_outputs_than_inputs_, stream);
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
FreeMem(label_value_with_blank, log_alpha_b, log_beta_b);
}

@@ -197,39 +199,45 @@ class CtcLossGpuKernel : public GpuKernel {
}
void MemsetForWS(int *label_value_pcr, int *cum_labels_length, int *label_squence_length, T *costs, T *grads,
cudaStream_t stream) {
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(label_value_pcr, static_cast<int>(0), label_size_, stream),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMemsetAsync(label_value_pcr, static_cast<int>(0), label_size_, stream),
"cudaMemSet failed in CtcLossGpuKernel::Launch.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(cum_labels_length, static_cast<int>(0), squence_lengths_size_, stream),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemsetAsync(cum_labels_length, static_cast<int>(0), squence_lengths_size_, stream),
"cudaMemSet failed in CtcLossGpuKernel::Launch.");
CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemsetAsync(label_squence_length, static_cast<int>(0), squence_lengths_size_, stream),
kernel_node_, cudaMemsetAsync(label_squence_length, static_cast<int>(0), squence_lengths_size_, stream),
"cudaMemSet failed in CtcLossGpuKernel::Launch.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(costs, static_cast<T>(0), probs_dims_[1] * sizeof(T), stream),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemsetAsync(costs, static_cast<T>(0), probs_dims_[1] * sizeof(T), stream),
"cudaMemSet failed in CtcLossGpuKernel::Launch.");
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemsetAsync(grads, static_cast<T>(0), probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T), stream),
"cudaMemSet failed in CtcLossGpuKernel::Launch.");
}
void MemManageForCus(T **log_alpha_b, T **log_beta_b, int **label_value_with_blank, int *cum_labels_length,
int log_prob_size, int batch, cudaStream_t stream) {
int total_labels_size_host = 0;
CHECK_CUDA_RET_WITH_EXCEPT(cudaMalloc(reinterpret_cast<void **>(log_alpha_b), sizeof(T) * log_prob_size),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMalloc(reinterpret_cast<void **>(log_alpha_b), sizeof(T) * log_prob_size),
"cudaMalloc failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMalloc(reinterpret_cast<void **>(log_beta_b), sizeof(T) * log_prob_size),
"cudaMalloc failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&total_labels_size_host, cum_labels_length + batch - 1, sizeof(int),
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_, cudaMalloc(reinterpret_cast<void **>(log_beta_b), sizeof(T) * log_prob_size), "cudaMalloc failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(&total_labels_size_host, cum_labels_length + batch - 1, sizeof(int),
cudaMemcpyDeviceToHost, stream),
"cudaMemcpyAsync failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMalloc(reinterpret_cast<void **>(label_value_with_blank), sizeof(int) * (2 * total_labels_size_host + batch)),
"cudaMalloc failed.");
}

void FreeMem(int *label_value_with_blank, T *log_alpha_b, T *log_beta_b) {
CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(label_value_with_blank), "cudaFree failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(log_alpha_b), "cudaFree failed.");
CHECK_CUDA_RET_WITH_EXCEPT(cudaFree(log_beta_b), "cudaFree failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaFree(label_value_with_blank), "cudaFree failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaFree(log_alpha_b), "cudaFree failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaFree(log_beta_b), "cudaFree failed.");
}

std::vector<size_t> input_size_list_;
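One pattern repeats through the CtcLoss hunks above: every scalar copied back with cudaMemcpyAsync (max_sequence, batch_label, total_labels_size_host) is followed by cudaStreamSynchronize before the host value is read, because the async copy merely enqueues work on the stream. The pattern in isolation, with ReadDeviceScalar as a hypothetical helper rather than kernel code:

#include <cuda_runtime.h>

// Copy one device-resident int back to the host and wait for it to land;
// reading *host_out before the synchronize would race the enqueued copy.
bool ReadDeviceScalar(const int *device_scalar, int *host_out, cudaStream_t stream) {
  if (cudaMemcpyAsync(host_out, device_scalar, sizeof(int),
                      cudaMemcpyDeviceToHost, stream) != cudaSuccess) {
    return false;
  }
  return cudaStreamSynchronize(stream) == cudaSuccess;
}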
@@ -82,6 +82,7 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnBatchNormalizationForwardTrainingEx(handle_, mode_, bn_ops_, &alpha, &beta, x_desc_, x, z_desc_, z, y_desc_,
y, scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean,
runnig_variance, epsilon_, save_mean, save_variance, activation_desc_,

@@ -91,6 +92,7 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
MS_EXCEPTION_IF_NULL(kernel_node);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == kFusedBatchNormEx) {

@@ -141,15 +143,16 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_),
"Destroy para desc failed");
if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed");
}

if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyActivationDescriptor(activation_desc_),
"Destroy activation descriptor failed");
}
}

@@ -157,35 +160,41 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed");
if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&z_desc_), "Create z desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&z_desc_), "Create z desc failed");
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), "Create para desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_),
"Create para desc failed");

if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateActivationDescriptor(&activation_desc_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateActivationDescriptor(&activation_desc_),
"Create activation descriptor failed");
}
}

void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &input_x_size_), "Get input x size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &input_x_size_),
"Get input x size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size_),
"Get para size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(y_desc_, &output_size_), "Get output size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(y_desc_, &output_size_),
"Get output size failed");
if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(z_desc_, &input_z_size_), "Get input z size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(z_desc_, &input_z_size_),
"Get input z size failed");
}

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
handle_, mode_, bn_ops_, x_desc_, z_desc_, y_desc_, scale_bias_mean_var_desc_,
activation_desc_, &workspace_size_),
"cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize failed");

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
handle_, mode_, bn_ops_, activation_desc_, x_desc_, &reserve_size_),
"Get reserve size failed");
}

@@ -228,25 +237,28 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
}

CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
"Set x desc failed");

CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
kernel_node_, cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
"Set y desc failed");

if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(z_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
"Set z desc failed");
}

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(scale_bias_mean_var_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel, 1, 1),
"Set para desc failed");

if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetActivationDescriptor(activation_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0),
"cudnnSetActivationDescriptor failed");
}
|
||||
|
|
|
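Note: the CHECK_CUDNN_RET_WITH_EXCEPT / CHECK_CUDNN_RET_WITH_ERROR macro definitions are not part of this diff. As a hedged sketch of what the new leading kernel_node_ argument buys, something along these lines would attach the failing node's name to the cuDNN error string (the macro name, the node_name parameter, and the plain std::runtime_error are illustrative assumptions, not the repository's actual definition):

#include <cudnn.h>
#include <stdexcept>
#include <string>

// Illustrative only: run a cuDNN expression and, on failure, raise an error that carries
// both cudnnGetErrorString(status) and the kernel node's name for tracing.
#define CHECK_CUDNN_RET_WITH_EXCEPT_SKETCH(node_name, expression, message)                  \
  do {                                                                                      \
    cudnnStatus_t status = (expression);                                                    \
    if (status != CUDNN_STATUS_SUCCESS) {                                                   \
      throw std::runtime_error(std::string(message) + " | cudnn error: " +                  \
                               cudnnGetErrorString(status) + " | node: " + (node_name));    \
    }                                                                                       \
  } while (0)

With this shape, every call site in the hunks above only has to thread kernel_node_ through; the macro decides how to render the trace.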
@ -69,12 +69,14 @@ class FusedBatchNormGpuKernel : public GpuKernel {
auto save_mean = GetDeviceAddress<float>(outputs, 3);
auto save_variance = GetDeviceAddress<float>(outputs, 4);
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y,
scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean,
runnig_variance, epsilon_, save_mean, save_variance),
"Kernel launch failed");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnBatchNormalizationForwardInference(handle_, mode_, &alpha, &beta, x_desc_, x,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnBatchNormalizationForwardInference(handle_, mode_, &alpha, &beta, x_desc_, x,
y_desc_, y, scale_bias_mean_var_desc_, scale,
bias, runing_mean, runnig_variance, epsilon_),
"Kernel launch failed");

@ -82,6 +84,7 @@ class FusedBatchNormGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@ -117,14 +120,17 @@ class FusedBatchNormGpuKernel : public GpuKernel {
}

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch_, channel_, height_, width_),
"Set x desc failed");

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch_, channel_, height_, width_),
"Set y desc failed");

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(scale_bias_mean_var_desc_, cudnn_format, CUDNN_DATA_FLOAT, 1, channel_, 1, 1),
"Set para desc failed");

@ -134,27 +140,31 @@ class FusedBatchNormGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_),
"Destroy para desc failed");
}

protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), "Create para desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_),
"Create para desc failed");
}
void InitSizeLists() override {
size_t input_size = 0;
size_t para_size = 0;
size_t output_size = 0;
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &input_size), "Get input size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &input_size),
"Get input size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size),
"Get para size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(y_desc_, &output_size), "Get para size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(y_desc_, &output_size),
"Get para size failed");
}
input_size_list_.push_back(input_size);
input_size_list_.push_back(para_size);  // scale
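For reference, cudnnBatchNormalizationForwardTraining uses its exponentialAverageFactor argument (exp_avg_factor_ above) to update the running statistics in place; numerically the update is equivalent to this scalar sketch:

// running = (1 - factor) * running + factor * batch_stat, per cuDNN's documented semantics.
double UpdateRunningStat(double running, double batch_stat, double factor) {
  return (1.0 - factor) * running + factor * batch_stat;
}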
@ -92,7 +92,8 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel {
const float alpha_data_diff = 1;
const float alpha_param_diff = 1;
const float beta_param_diff = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnBatchNormalizationBackwardEx(
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnBatchNormalizationBackwardEx(
handle_, mode_, bn_ops_, &alpha_data_diff, &beta_data_diff_, &alpha_param_diff,
&beta_param_diff, x_desc_, x, y_desc_, y, dy_desc_, dy, dz_desc_, dz, dx_desc_, dx,
scale_bias_diff_desc_, scale, bias, dscale, dbias, epsilon_, save_mean, save_variance,

@ -102,6 +103,7 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
MS_EXCEPTION_IF_NULL(kernel_node);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == kFusedBatchNormGradEx) {

@ -154,28 +156,30 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel {
protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateActivationDescriptor(&activation_desc_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateActivationDescriptor(&activation_desc_),
"Create activation descriptor failed");
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed");

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed");
if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dz_desc_), "Create dz desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dz_desc_), "Create dz desc failed");
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_diff_desc_), "Create para desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_diff_desc_),
"Create para desc failed");
}

void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &x_size_), "Get x size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_diff_desc_, &para_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &x_size_), "Get x size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_diff_desc_, &para_size_),
"Get para size failed");

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetBatchNormalizationBackwardExWorkspaceSize(
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
handle_, mode_, bn_ops_, x_desc_, y_desc_, dy_desc_, dz_desc_, dx_desc_,
scale_bias_diff_desc_, activation_desc_, &workspace_size_),
"cudnnGetBatchNormalizationBackwardExWorkspaceSize failed");

@ -202,19 +206,20 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel {
workspace_size_list_.push_back(workspace_size_);
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyActivationDescriptor(activation_desc_),
"Destroy activation descriptor failed");
}
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");

CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed");
}
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_diff_desc_),
"Destroy para desc failed");
}

private:

@ -236,35 +241,39 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel {
}

CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
"Set x desc failed");

if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
"Set z desc failed");
}

CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensor4dDescriptor(dy_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
kernel_node_, cudnnSetTensor4dDescriptor(dy_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
"Set dy desc failed");

CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensor4dDescriptor(dx_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
kernel_node_, cudnnSetTensor4dDescriptor(dx_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
"Set dx desc failed");

if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(dz_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
"Set z desc failed");
}

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(scale_bias_diff_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel, 1, 1),
"Set para desc failed");

if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetActivationDescriptor(activation_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0),
"cudnnSetActivationDescriptor failed");
}
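DestroyResource() is declared noexcept, which is why teardown uses the _ERROR variant rather than _EXCEPT: it must not throw while descriptors are being released. A minimal sketch of such a non-throwing check, with std::cerr standing in for whatever logging the real macro uses (an assumption):

#include <cudnn.h>
#include <iostream>

// Illustrative only: log the failure (with node name) instead of throwing, safe in noexcept paths.
#define CHECK_CUDNN_RET_WITH_ERROR_SKETCH(node_name, expression, message)           \
  do {                                                                              \
    cudnnStatus_t status = (expression);                                            \
    if (status != CUDNN_STATUS_SUCCESS) {                                           \
      std::cerr << (message) << " | cudnn error: " << cudnnGetErrorString(status)   \
                << " | node: " << (node_name) << std::endl;                         \
    }                                                                               \
  } while (0)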
@ -67,6 +67,7 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
const float alpha_param_diff = 1;
const float beta_param_diff = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnBatchNormalizationBackward(handle_, mode_, &alpha_data_diff, &beta_data_diff, &alpha_param_diff,
&beta_param_diff, x_desc_, x, dy_desc_, dy, dx_desc_, dx, scale_bias_desc_, scale,
bn_scale, bn_bias, epsilon_, save_mean, save_variance),

@ -74,6 +75,7 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@ -101,15 +103,19 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
epsilon_ = GetAttr<float>(kernel_node, "epsilon");

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_),
"Set x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(dy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_),
"Set dy desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(dx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_, channel_, height_, width_),
"Set dx desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(scale_bias_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel_, 1, 1),
"Set para desc failed");

@ -118,27 +124,31 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_desc_),
"Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
}

protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_desc_), "Create para desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_desc_),
"Create para desc failed");
}

void InitSizeLists() override {
size_t input_size = 0;
size_t para_size = 0;
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_desc_, &input_size), "Get input size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(scale_bias_desc_, &para_size), "Get input size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &input_size),
"Get input size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_desc_, &para_size),
"Get input size failed");
}

input_size_list_.push_back(input_size);
@ -70,17 +70,18 @@ class Im2ColGpuFwdKernel : public GpuKernel {
old_width_ + pad_width_, pad_top_, pad_left_, pad_value_, padded_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnIm2Col(cudnn_handle_, padded_desc_, padded_addr, filter_desc_, conv_desc_, output_addr),
kernel_node_, cudnnIm2Col(cudnn_handle_, padded_desc_, padded_addr, filter_desc_, conv_desc_, output_addr),
"cudnnIm2ColForward failed");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnIm2Col(cudnn_handle_, input_desc_, input_addr, filter_desc_, conv_desc_, output_addr),
kernel_node_, cudnnIm2Col(cudnn_handle_, input_desc_, input_addr, filter_desc_, conv_desc_, output_addr),
"cudnnIm2ColForward failed");
}

return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
if (!CheckParam(kernel_node)) {
return false;

@ -98,7 +99,8 @@ class Im2ColGpuFwdKernel : public GpuKernel {
return true;
}
Set4DDesc(in_shape, filter_shape, output_shape);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionGroupCount(conv_desc_, 1), "cudnnSetConvGroupCount failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionGroupCount(conv_desc_, 1),
"cudnnSetConvGroupCount failed");
pad_height_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "pad"));
pad_width_ = pad_height_;
pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");

@ -111,12 +113,13 @@ class Im2ColGpuFwdKernel : public GpuKernel {
pad_width_ = 0;
}
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetConvolution2dDescriptor(conv_desc_, pad_height_, pad_width_, stride_[2], stride_[3], dilation_[2],
dilation_[3], CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");
}
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH),
"cudnnSetConvolutionMathType failed.")
}
InitSizeLists();

@ -124,32 +127,43 @@ class Im2ColGpuFwdKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyConvolutionDescriptor(conv_desc_),
"cudnnDestroyConvolutionDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(filter_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(padded_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(output_desc_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_desc_),
"cudnnDestroyTensorDescriptor failed");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&output_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&filter_desc_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateConvolutionDescriptor(&conv_desc_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&output_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&padded_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&filter_desc_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateConvolutionDescriptor(&conv_desc_),
"cudnnCreateConvolutionDescriptor failed");
}

void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(input_desc_, reinterpret_cast<size_t *>(&input_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(input_desc_, reinterpret_cast<size_t *>(&input_size_)),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(output_desc_, reinterpret_cast<size_t *>(&output_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(output_desc_, reinterpret_cast<size_t *>(&output_size_)),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_desc_, reinterpret_cast<size_t *>(&padded_size_)),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetTensorSizeInBytes(padded_desc_, reinterpret_cast<size_t *>(&padded_size_)),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);

@ -196,10 +210,12 @@ class Im2ColGpuFwdKernel : public GpuKernel {
use_pad_ = false;
}

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(padded_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, n_, c_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensor4dDescriptor(padded_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, n_, c_,
old_height_ + pad_height_, old_width_ + pad_width_),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetConvolution2dDescriptor(
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetConvolution2dDescriptor(
conv_desc_, use_pad_ ? 0 : pad_top_, use_pad_ ? 0 : pad_left_, stride_[2], stride_[3],
dilation_[2], dilation_[3], CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolution2dDescriptor failed");

@ -208,17 +224,20 @@ class Im2ColGpuFwdKernel : public GpuKernel {
void Set4DDesc(const std::vector<size_t> &in_shape, const std::vector<int> &filter_shape,
const std::vector<size_t> &output_shape) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(in_shape[0]),
SizeToInt(in_shape[1]), SizeToInt(in_shape[2]), SizeToInt(in_shape[3])),
"cudnnSetTensor4dDescriptor failed");

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 1,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 1,
SizeToInt(in_shape[1]), filter_shape[0], filter_shape[1]),
"cudnnSetFilter4dDescriptor failed");

auto out_H = output_shape[0] * output_shape[1] * output_shape[2];
auto out_W = output_shape[3] * output_shape[4] * output_shape[5];
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
SizeToInt(out_H), SizeToInt(out_W), 1, 1),
"cudnnSetTensor4dDescriptor failed");
}
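Standalone illustration of the status-checking pattern all of these macros wrap: every cuDNN call returns a cudnnStatus_t, and cudnnGetErrorString turns it into the readable text that ends up in the messages above. This compiles against cuDNN on its own:

#include <cudnn.h>
#include <cstdio>

int main() {
  cudnnHandle_t handle = nullptr;
  cudnnStatus_t status = cudnnCreate(&handle);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::printf("cudnnCreate failed: %s\n", cudnnGetErrorString(status));
    return 1;
  }
  cudnnTensorDescriptor_t desc = nullptr;
  status = cudnnCreateTensorDescriptor(&desc);
  std::printf("create tensor desc: %s\n", cudnnGetErrorString(status));
  // NCHW float tensor 1x3x224x224, the same 4-D setter the kernels above call.
  status = cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 3, 224, 224);
  std::printf("set tensor desc: %s\n", cudnnGetErrorString(status));
  cudnnDestroyTensorDescriptor(desc);
  cudnnDestroy(handle);
  return 0;
}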
@ -66,11 +66,13 @@ class L2NormalizeGpuKernel : public GpuKernel {
const float beta = 0;

if (all_match_) {
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(reduce_workspace_addr, input_addr, input_size_list_[0],
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(reduce_workspace_addr, input_addr, input_size_list_[0],
cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in L2Normalize::Launch.");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_, &alpha,
inputA_descriptor_, input_addr, &beta, outputC_descriptor_, reduce_workspace_addr),
"cudnnReduceTensor failed.");

@ -84,6 +86,7 @@ class L2NormalizeGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
if (!CheckIONumber(kernel_node)) {

@ -142,25 +145,26 @@ class L2NormalizeGpuKernel : public GpuKernel {
protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
"cudnnCreateReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&inputA_descriptor_),
"cudnnCreateTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&outputC_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&outputC_descriptor_),
"cudnnCreateTensorDescriptor failed.");
}
void InitSizeLists() override {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed.");
input_size_list_.push_back(input_size_);

output_size_list_.push_back(output_size_);

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(outputC_descriptor_, &workspace_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &workspace_size_),
"cudnnGetTensorSizeInBytes failed.");
workspace_size_list_.push_back(workspace_size_);

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_,
&workspace_size_),
"cudnnGetReductionWorkspaceSize failed.");

@ -184,15 +188,16 @@ class L2NormalizeGpuKernel : public GpuKernel {
return true;
}
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
"cudnnDestroyReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(inputA_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(outputC_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}
void InferArrayReduceType(const CNodePtr &kernel_node) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, CUDNN_REDUCE_TENSOR_NORM2, CUDNN_DATA_FLOAT, nan_prop_,
reduce_indices_, CUDNN_32BIT_INDICES),
"cudnnSetReduceTensorDescriptor failed");

@ -205,11 +210,12 @@ class L2NormalizeGpuKernel : public GpuKernel {

if (input_shape.size() <= split_dim) {
ShapeNdTo4d(input_shape, &inputA);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
inputA[0], inputA[1], inputA[2], inputA[3]),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_);
CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_, kernel_node_);
for (auto dim : input_shape) {
inputA.emplace_back(dim);
}

@ -219,11 +225,12 @@ class L2NormalizeGpuKernel : public GpuKernel {

if (outputC_shape.size() <= split_dim) {
ShapeNdTo4d(outputC_shape, &outputC);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
outputC[0], outputC[1], outputC[2], outputC[3]),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_);
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_);
for (auto dim : outputC_shape) {
outputC.emplace_back(dim);
}
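The else-branches above now pass kernel_node_ into the CudnnSetTensorNdDescriptor helper as well. Its body is not in this diff; the cuDNN work such a helper has to do for shapes deeper than 4-D looks roughly like the sketch below (packed row-major strides plus the real cudnnSetTensorNdDescriptor call; the function name and status-returning shape are assumptions, since the actual helper also threads the node into its check macro):

#include <cudnn.h>
#include <vector>

cudnnStatus_t SetTensorNdPacked(cudnnTensorDescriptor_t desc, cudnnDataType_t dtype,
                                const std::vector<size_t> &shape) {
  int nb_dims = static_cast<int>(shape.size());
  std::vector<int> dims(nb_dims), strides(nb_dims);
  int stride = 1;
  for (int i = nb_dims - 1; i >= 0; --i) {  // innermost dimension is contiguous
    dims[i] = static_cast<int>(shape[i]);
    strides[i] = stride;
    stride *= dims[i];
  }
  return cudnnSetTensorNdDescriptor(desc, dtype, nb_dims, dims.data(), strides.data());
}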
@ -71,11 +71,13 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
const float beta = 0;

if (all_match_) {
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(reduce_workspace_addr, x_addr, input_size_list_[0],
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(reduce_workspace_addr, x_addr, input_size_list_[0],
cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in L2Normalize::Launch.");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_list_[2],
&alpha, inputA_descriptor_, x_addr, &beta, outputC_descriptor_, reduce_workspace_addr),
"cudnnReduceTensor failed.");

@ -85,11 +87,13 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
BroadcastArith(output_shape_, output_shape_, output_shape_, BROADCAST_TYPE_MUL, y_addr, dy_addr, dx_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
if (all_match_) {
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(reduce_y_dy_workspace_addr, dx_addr, output_size_list_[0],
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(reduce_y_dy_workspace_addr, dx_addr, output_size_list_[0],
cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed in L2Normalize::Launch.");
} else {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnReduceTensor(cudnn_handle_, reduce_sum_tensor_descriptor_, nullptr, 0, workspace_y_dy_addr,
workspace_size_list_[3], &alpha, inputA_descriptor_, dx_addr, &beta, outputC_descriptor_,
reduce_y_dy_workspace_addr),

@ -124,6 +128,7 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
if (!CheckIONumber(kernel_node)) {

@ -187,13 +192,13 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
}
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
"cudnnCreateReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_sum_tensor_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_sum_tensor_descriptor_),
"cudnnCreateReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&inputA_descriptor_),
"cudnnCreateTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&outputC_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&outputC_descriptor_),
"cudnnCreateTensorDescriptor failed.");
}
void InitSizeLists() override {

@ -207,18 +212,20 @@ class L2NormalizeGradGpuKernel : public GpuKernel {

output_size_list_.push_back(output_size_);

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(outputC_descriptor_, &workspace_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &workspace_size_),
"cudnnGetTensorSizeInBytes failed.");
workspace_size_list_.push_back(workspace_size_);
workspace_size_list_.push_back(workspace_size_);

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_,
&workspace_size_),
"cudnnGetReductionWorkspaceSize failed.");
workspace_size_list_.push_back(workspace_size_);

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_sum_tensor_descriptor_, inputA_descriptor_,
outputC_descriptor_, &workspace_size_),
"cudnnGetReductionWorkspaceSize failed.");

@ -229,21 +236,23 @@ class L2NormalizeGradGpuKernel : public GpuKernel {

private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
"cudnnDestroyReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_sum_tensor_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_sum_tensor_descriptor_),
"cudnnDestroyReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(inputA_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(outputC_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}
void InferArrayReduceType(const CNodePtr &kernel_node) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, CUDNN_REDUCE_TENSOR_NORM2, CUDNN_DATA_FLOAT, nan_prop_,
reduce_indices_, CUDNN_32BIT_INDICES),
"cudnnSetReduceTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetReduceTensorDescriptor(reduce_sum_tensor_descriptor_, CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_FLOAT,
nan_prop_, reduce_indices_, CUDNN_32BIT_INDICES),
"cudnnSetReduceTensorDescriptor failed");

@ -256,11 +265,12 @@ class L2NormalizeGradGpuKernel : public GpuKernel {

if (input_shape.size() <= split_dim) {
ShapeNdTo4d(input_shape, &inputA);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
inputA[0], inputA[1], inputA[2], inputA[3]),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_);
CudnnSetTensorNdDescriptor(input_shape, inputA_descriptor_, data_type_, kernel_node_);
for (auto dim : input_shape) {
inputA.emplace_back(dim);
}

@ -270,11 +280,12 @@ class L2NormalizeGradGpuKernel : public GpuKernel {

if (outputC_shape.size() <= split_dim) {
ShapeNdTo4d(outputC_shape, &outputC);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensor4dDescriptor(outputC_descriptor_, CUDNN_TENSOR_NCHW, data_type_,
outputC[0], outputC[1], outputC[2], outputC[3]),
"cudnnSetTensor4dDescriptor failed");
} else {
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_);
CudnnSetTensorNdDescriptor(outputC_shape, outputC_descriptor_, data_type_, kernel_node_);
for (auto dim : outputC_shape) {
outputC.emplace_back(dim);
}
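Both L2Normalize kernels size their scratch buffers with cudnnGetReductionWorkspaceSize, once per reduce descriptor. A self-contained sketch of that query for the NORM2 reduction used here (error checking elided for brevity):

#include <cudnn.h>

size_t QueryNorm2Workspace(cudnnHandle_t handle, cudnnTensorDescriptor_t in_desc,
                           cudnnTensorDescriptor_t out_desc) {
  cudnnReduceTensorDescriptor_t reduce_desc = nullptr;
  cudnnCreateReduceTensorDescriptor(&reduce_desc);
  // NORM2 reduction, float accumulation, no index output, matching the setup above.
  cudnnSetReduceTensorDescriptor(reduce_desc, CUDNN_REDUCE_TENSOR_NORM2, CUDNN_DATA_FLOAT,
                                 CUDNN_NOT_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES,
                                 CUDNN_32BIT_INDICES);
  size_t bytes = 0;
  cudnnGetReductionWorkspaceSize(handle, reduce_desc, in_desc, out_desc, &bytes);
  cudnnDestroyReduceTensorDescriptor(reduce_desc);
  return bytes;
}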
@ -74,12 +74,13 @@ class LstmGpuKernel : public GpuKernel {

if (!states_init_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, states_addr, output_size_list_[4], 0),
kernel_node_, cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, states_addr, output_size_list_[4], 0),
"set dropout_desc failed");
states_init_ = true;
}

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnRNNForwardTraining(handle_, rnn_desc_, seq_len_, x_desc_.get(), x_addr, hx_desc_, hx_addr, cx_desc_, cx_addr,
w_desc_, w_addr, y_desc_.get(), y_addr, hy_desc_, hy_addr, cy_desc_, cy_addr,
workspace_addr, workspace_size_list_[0], reserved_addr, reserved_size_),

@ -88,6 +89,7 @@ class LstmGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

@ -108,33 +110,41 @@ class LstmGpuKernel : public GpuKernel {
cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD;
CreateTensorDescGrp();
int hx_dims[3]{num_layers_ * (bidirectional_ ? 2 : 1), batch_size_, hidden_size_};
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set hx_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(cx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(cx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set cx_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(hy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(hy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set hy_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(cy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(cy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set cy_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0),
"set dropout_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
input_mode, direction, rnn_mode, algo, cudnn_data_type_),
"set rnn_desc failed");
cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed");
auto weight_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNParamsSize(handle_, rnn_desc_, x_desc_[0], &weight_size_, cudnn_data_type_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetRNNParamsSize(handle_, rnn_desc_, x_desc_[0], &weight_size_, cudnn_data_type_),
"get weight_size_ failed");
if (weight_size != weight_size_) {
MS_LOG(EXCEPTION) << "weight size: " << weight_size << " error, expect: " << weight_size_ << " .";
}
int w_dims[3] = {SizeToInt(weight_size_ / 4), 1, 1};
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims),
"set w_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &reserved_size_),
kernel_node_, cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &reserved_size_),
"get reserve size failed");
InitSizeLists();
return true;

@ -147,47 +157,51 @@ class LstmGpuKernel : public GpuKernel {
y_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_);

for (size_t i = 0; i < IntToSize(seq_len_); ++i) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptorEx(x_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims), "set x_desc failed");
kernel_node_, cudnnSetTensorNdDescriptorEx(x_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims),
"set x_desc failed");

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), "set y_desc failed");
kernel_node_, cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims),
"set y_desc failed");
}
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyDropoutDescriptor(dropout_desc_),
"destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed");

for (size_t i = 0; i < IntToSize(seq_len_); ++i) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed");
}
}

protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&cx_desc_), "create cx_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&w_desc_), "create w_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&hy_desc_), "create hy_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&cy_desc_), "create cy_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateDropoutDescriptor(&dropout_desc_), "create dropout_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&cx_desc_), "create cx_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&w_desc_), "create w_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&hy_desc_), "create hy_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&cy_desc_), "create cy_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateDropoutDescriptor(&dropout_desc_),
"create dropout_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed");
}
void InitSizeLists() override {
size_t x_size = IntToSize(seq_len_ * batch_size_ * input_size_) * sizeof(T);

size_t h_size = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed");

input_size_list_.push_back(x_size);
input_size_list_.push_back(h_size);

@ -200,11 +214,13 @@ class LstmGpuKernel : public GpuKernel {
output_size_list_.push_back(h_size);
output_size_list_.push_back(reserved_size_);
size_t state_size = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDropoutGetStatesSize(handle_, &state_size), "get dropout states size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnDropoutGetStatesSize(handle_, &state_size),
"get dropout states size failed");
output_size_list_.push_back(state_size);

size_t workspace_size = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &workspace_size),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &workspace_size),
"get workspace size failed");
workspace_size_list_.push_back(workspace_size);
}
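One detail worth calling out in the LSTM hunks: cudnnGetRNNParamsSize reports the weight size in bytes, while the filter descriptor is specified in elements, hence the weight_size_ / 4 above (4 being sizeof(float) on the float path). A sketch of that conversion in isolation, for the float case only:

#include <cudnn.h>
#include <cstddef>

cudnnStatus_t SetRnnWeightDesc(cudnnFilterDescriptor_t w_desc, size_t weight_size_bytes) {
  // Bytes reported by cudnnGetRNNParamsSize become an element count for the descriptor.
  int w_dims[3] = {static_cast<int>(weight_size_bytes / sizeof(float)), 1, 1};
  return cudnnSetFilterNdDescriptor(w_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, w_dims);
}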
@ -79,19 +79,21 @@ class LstmGradDataGpuKernel : public GpuKernel {

if (!states_init_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnRestoreDropoutDescriptor(dropout_desc_, handle_, dropout_, states_addr, input_size_list_[8], 0),
"restore dropout state failed");
states_init_ = true;
}

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnRNNBackwardData(handle_, rnn_desc_, seq_len_, y_desc_.get(), y_addr, dy_desc_.get(), dy_addr, dhy_desc_,
dhy_addr, dcy_desc_, dcy_addr, w_desc_, w_addr, hx_desc_, hx_addr, cx_desc_, cx_addr,
dx_desc_.get(), dx_addr, dhx_desc_, dhx_addr, dcx_desc_, dcx_addr, workspace_addr,
workspace_size_list_[0], reserved_addr, reserved_size_),
"launch lstm back data kernel failed");

CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr)),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr)),
"stream synchronize failed.");
return true;
}

@ -104,6 +106,7 @@ class LstmGradDataGpuKernel : public GpuKernel {
dropout_ = GetAttr<float>(kernel_node, "dropout");
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);

@ -117,71 +120,84 @@ class LstmGradDataGpuKernel : public GpuKernel {
CreateTensorDescGrp();
int hx_dims[3]{num_layers_ * (bidirectional_ ? 2 : 1), batch_size_, hidden_size_};
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptorEx(dhy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set dhy_desc_ failed");
kernel_node_, cudnnSetTensorNdDescriptorEx(dhy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set dhy_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptorEx(dcy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set dcy_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
kernel_node_, cudnnSetTensorNdDescriptorEx(dcy_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set dcy_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set hx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(cx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(cx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set cx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptorEx(dhx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set dhx_desc_ failed");
kernel_node_, cudnnSetTensorNdDescriptorEx(dhx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set dhx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptorEx(dcx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims), "set dcx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0),
kernel_node_, cudnnSetTensorNdDescriptorEx(dcx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set dcx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0),
"set dropout_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
input_mode, direction, rnn_mode, algo, cudnn_data_type_),
"set rnn_desc failed");
cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed");
auto weight_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNParamsSize(handle_, rnn_desc_, dx_desc_[0], &weight_size_, cudnn_data_type_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetRNNParamsSize(handle_, rnn_desc_, dx_desc_[0], &weight_size_, cudnn_data_type_),
"get weight_size_ failed");
if (weight_size != weight_size_) {
MS_LOG(EXCEPTION) << "weight size: " << weight_size << " error, expect: " << weight_size_ << " .";
}
int w_dims[3] = {SizeToInt(weight_size_ / 4), 1, 1};
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetFilterNdDescriptor(w_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims),
"set w_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, dx_desc_.get(), &reserved_size_), "get size failed");
kernel_node_, cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, dx_desc_.get(), &reserved_size_),
"get size failed");
InitSizeLists();
return true;
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyDropoutDescriptor(dropout_desc_),
"destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed");
DestroyTensorDescGrp();
}

protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dhy_desc_), "create dhy_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dcy_desc_), "create dcy_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&cx_desc_), "create cx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&w_desc_), "create w_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dhx_desc_), "create dhx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dcx_desc_), "create dcx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateDropoutDescriptor(&dropout_desc_), "create dropout_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dhy_desc_), "create dhy_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dcy_desc_), "create dcy_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&cx_desc_), "create cx_desc_ failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&w_desc_), "create w_desc_ failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dhx_desc_), "create dhx_desc_ failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dcx_desc_), "create dcx_desc_ failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateDropoutDescriptor(&dropout_desc_),
|
||||
"create dropout_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed");
|
||||
}
|
||||
|
||||
void InitSizeLists() override {
|
||||
size_t y_size = IntToSize(seq_len_ * batch_size_ * hidden_size_ * (bidirectional_ ? 2 : 1)) * sizeof(T);
|
||||
|
||||
size_t h_size = 0;
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed");
|
||||
|
||||
input_size_list_.push_back(y_size);
|
||||
input_size_list_.push_back(y_size);
|
||||
|
@ -192,7 +208,8 @@ class LstmGradDataGpuKernel : public GpuKernel {
|
|||
input_size_list_.push_back(h_size);
|
||||
input_size_list_.push_back(reserved_size_);
|
||||
size_t state_size = 0;
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDropoutGetStatesSize(handle_, &state_size), "get dropout states size failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnDropoutGetStatesSize(handle_, &state_size),
|
||||
"get dropout states size failed");
|
||||
input_size_list_.push_back(state_size);
|
||||
|
||||
size_t x_size = IntToSize(seq_len_ * batch_size_ * input_size_) * sizeof(T);
|
||||
|
@ -201,7 +218,8 @@ class LstmGradDataGpuKernel : public GpuKernel {
|
|||
output_size_list_.push_back(h_size);
|
||||
|
||||
size_t workspace_size = 0;
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, dx_desc_.get(), &workspace_size),
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
|
||||
cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, dx_desc_.get(), &workspace_size),
|
||||
"get workspace size failed");
|
||||
workspace_size_list_.push_back(workspace_size);
|
||||
}
|
||||
|
@ -216,27 +234,28 @@ class LstmGradDataGpuKernel : public GpuKernel {
|
|||
dy_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_);
|
||||
|
||||
for (size_t i = 0; i < IntToSize(seq_len_); ++i) {
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_[i]), "create x_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_[i]), "create x_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(
|
||||
cudnnSetTensorNdDescriptorEx(dx_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims),
|
||||
kernel_node_, cudnnSetTensorNdDescriptorEx(dx_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims),
|
||||
"set dx_desc failed");
|
||||
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(
|
||||
cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), "set y_desc failed");
|
||||
kernel_node_, cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims),
|
||||
"set y_desc failed");
|
||||
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_desc_[i]), "create dy_desc_ failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_[i]), "create dy_desc_ failed");
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(
|
||||
cudnnSetTensorNdDescriptorEx(dy_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims),
|
||||
kernel_node_, cudnnSetTensorNdDescriptorEx(dy_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims),
|
||||
"set dy_desc_ failed");
|
||||
}
|
||||
}
|
||||
|
||||
void DestroyTensorDescGrp() {
|
||||
for (size_t i = 0; i < IntToSize(seq_len_); ++i) {
|
||||
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_[i]), "destroy dy_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_[i]), "destroy x_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_[i]), "destroy dy_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed");
|
||||
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_[i]), "destroy x_desc failed");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
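
The whole change follows one mechanical pattern: every CHECK_CUDNN_RET_WITH_* and CHECK_CUDA_RET_WITH_* call site gains the owning kernel_node_ as a new first argument, so a failing cuDNN/CUDA call can be traced back to the graph node that launched it. As a rough sketch of what the throwing variant of such a node-aware macro can look like — an illustration under assumptions, not the framework's actual definition; it assumes the node is a CNodePtr-like pointer exposing fullname_with_scope(), which MindSpore nodes provide:

// Hedged sketch of a node-aware check macro; the trace formatting is hypothetical.
#define CHECK_CUDNN_RET_WITH_EXCEPT(node, expression, message)                    \
  do {                                                                            \
    cudnnStatus_t status = (expression);                                          \
    if (status != CUDNN_STATUS_SUCCESS) {                                         \
      MS_LOG(EXCEPTION) << "cuDNN error: " << (message) << ", status: " << status \
                        << ", node: "                                             \
                        << ((node) == nullptr ? std::string("unknown")            \
                                              : (node)->fullname_with_scope());   \
    }                                                                             \
  } while (0)

MS_LOG(EXCEPTION) raises, which is why this family only appears on construction and Launch/Init paths.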
@@ -66,15 +66,18 @@ class LstmGradWeightGpuKernel : public GpuKernel {

if (!states_init_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnRestoreDropoutDescriptor(dropout_desc_, handle_, dropout_, states_addr, input_size_list_[4], 0),
"restore dropout state failed");
states_init_ = true;
}

CHECK_CUDA_RET_WITH_EXCEPT(
cudaMemsetAsync(dw_addr, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)), "cudaMemSet Failed");
kernel_node_, cudaMemsetAsync(dw_addr, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemSet Failed");

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnRNNBackwardWeights(handle_, rnn_desc_, seq_len_, x_desc_.get(), x_addr, hx_desc_, hx_addr, y_desc_.get(),
y_addr, workspace_addr, workspace_size_list_[0], dw_desc_, dw_addr, reserved_addr,
reserved_size_),

@@ -83,6 +86,7 @@ class LstmGradWeightGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

@@ -104,29 +108,34 @@ class LstmGradWeightGpuKernel : public GpuKernel {
CreateTensorDescGrp();
int hx_dims[3]{num_layers_ * (bidirectional_ ? 2 : 1), batch_size_, hidden_size_};

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptorEx(hx_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, hx_dims),
"set hx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0),
"set dropout_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
input_mode, direction, rnn_mode, algo, cudnn_data_type_),
"set rnn_desc failed");
cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed");

auto weight_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T);

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNParamsSize(handle_, rnn_desc_, x_desc_[0], &weight_size_, cudnn_data_type_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetRNNParamsSize(handle_, rnn_desc_, x_desc_[0], &weight_size_, cudnn_data_type_),
"get weight_size_ failed");
if (weight_size != weight_size_) {
MS_LOG(EXCEPTION) << "weight size: " << weight_size << " error, expect: " << weight_size_ << " .";
}
int w_dims[3] = {SizeToInt(weight_size_ / 4), 1, 1};
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetFilterNdDescriptor(dw_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetFilterNdDescriptor(dw_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, 3, w_dims),
"set dw_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &reserved_size_),
kernel_node_, cudnnGetRNNTrainingReserveSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &reserved_size_),
"get reserve size failed");
InitSizeLists();
return true;

@@ -135,16 +144,17 @@ class LstmGradWeightGpuKernel : public GpuKernel {
protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateFilterDescriptor(&dw_desc_), "create dw_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateDropoutDescriptor(&dropout_desc_), "create dropout_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&hx_desc_), "create hx_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateFilterDescriptor(&dw_desc_), "create dw_desc_ failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateDropoutDescriptor(&dropout_desc_),
"create dropout_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateRNNDescriptor(&rnn_desc_), "create rnn_desc failed");
}
void InitSizeLists() override {
size_t x_size = IntToSize(seq_len_ * batch_size_ * input_size_) * sizeof(T);

size_t h_size = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(hx_desc_, &h_size), "get h size failed");

size_t y_size = IntToSize(seq_len_ * batch_size_ * hidden_size_ * (bidirectional_ ? 2 : 1)) * sizeof(T);
input_size_list_.push_back(x_size);

@@ -152,21 +162,24 @@ class LstmGradWeightGpuKernel : public GpuKernel {
input_size_list_.push_back(y_size);
input_size_list_.push_back(reserved_size_);
size_t state_size = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDropoutGetStatesSize(handle_, &state_size), "get dropout states size failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnDropoutGetStatesSize(handle_, &state_size),
"get dropout states size failed");
input_size_list_.push_back(state_size);

output_size_list_.push_back(weight_size_);

size_t workspace_size = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &workspace_size),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnGetRNNWorkspaceSize(handle_, rnn_desc_, seq_len_, x_desc_.get(), &workspace_size),
"get workspace size failed");
workspace_size_list_.push_back(workspace_size);
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyDropoutDescriptor(dropout_desc_),
"destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed");
DestroyTensorDescGrp();
}

@@ -179,19 +192,21 @@ class LstmGradWeightGpuKernel : public GpuKernel {
y_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_);

for (size_t i = 0; i < IntToSize(seq_len_); ++i) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptorEx(x_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims), "set x_desc failed");
kernel_node_, cudnnSetTensorNdDescriptorEx(x_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, x_dims),
"set x_desc failed");

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_[i]), "create y_desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims), "set y_desc failed");
kernel_node_, cudnnSetTensorNdDescriptorEx(y_desc_[i], CUDNN_TENSOR_NCHW, cudnn_data_type_, 3, y_dims),
"set y_desc failed");
}
}
void DestroyTensorDescGrp() {
for (size_t i = 0; i < IntToSize(seq_len_); ++i) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed");
}
}
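
Worth noting in these hunks: Init and Launch paths use the _WITH_EXCEPT family, which raises, while DestroyResource() is declared noexcept and therefore uses the _WITH_ERROR family, which must only log. A sketch of the non-throwing counterpart, under the same assumptions as the macro sketch above (illustrative, not the framework's definition):

// Hedged sketch: same shape as the throwing macro, but logs at ERROR severity
// so it is safe on noexcept teardown paths.
#define CHECK_CUDNN_RET_WITH_ERROR(node, expression, message)                 \
  do {                                                                        \
    cudnnStatus_t status = (expression);                                      \
    if (status != CUDNN_STATUS_SUCCESS) {                                     \
      MS_LOG(ERROR) << "cuDNN error: " << (message) << ", status: " << status \
                    << ", node: "                                             \
                    << ((node) == nullptr ? std::string("unknown")            \
                                          : (node)->fullname_with_scope());   \
    }                                                                         \
  } while (0)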
@@ -66,12 +66,14 @@ class PoolingGpuFwdKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;

CHECK_CUDNN_RET_WITH_EXCEPT(cudnnPoolingForward(cudnn_handle_, pooling_descriptor_, &alpha, input_descriptor_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnPoolingForward(cudnn_handle_, pooling_descriptor_, &alpha, input_descriptor_,
input_addr, &beta, output_descriptor_, output_addr),
"cudnnPoolingForward failed");
return true;
}
bool Init(const CNodePtr &kernel_node) {
kernel_node_ = kernel_node;
InitResource();
if (!CheckParam(kernel_node)) {
return false;

@@ -102,10 +104,10 @@ class PoolingGpuFwdKernel : public GpuKernel {
SetDimA(output_shape, dimAout, 4, data_format_);
SetStrideA(output_shape, strideAout, 4, data_format_);
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptor(input_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin),
kernel_node_, cudnnSetTensorNdDescriptor(input_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptor(output_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout),
kernel_node_, cudnnSetTensorNdDescriptor(output_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout),
"cudnnSetTensor4dDescriptor failed");
SetPoolingMode(kernel_node);
SetPad(kernel_node);

@@ -114,27 +116,31 @@ class PoolingGpuFwdKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyPoolingDescriptor(pooling_descriptor_),
"cudnnDestroyPoolingDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(output_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_descriptor_),
"cudnnDestroyTensorDescriptor failed");
}

protected:
void InitResource() {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&output_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreatePoolingDescriptor(&pooling_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&output_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreatePoolingDescriptor(&pooling_descriptor_),
"cudnnCreatePoolingDescriptor failed");
}
void InitSizeLists() {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetTensorSizeInBytes(input_descriptor_, reinterpret_cast<size_t *>(&input_size_)),
kernel_node_, cudnnGetTensorSizeInBytes(input_descriptor_, reinterpret_cast<size_t *>(&input_size_)),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetTensorSizeInBytes(output_descriptor_, reinterpret_cast<size_t *>(&output_size_)),
kernel_node_, cudnnGetTensorSizeInBytes(output_descriptor_, reinterpret_cast<size_t *>(&output_size_)),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);

@@ -199,7 +205,8 @@ class PoolingGpuFwdKernel : public GpuKernel {
pad_height_ = 0;
pad_width_ = 0;
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN,
2, windowDimA, paddingA, strideA),
"cudnnSetPoolingNdDescriptor failed");
}
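
PoolingGpuFwdKernel fills the dim/stride arrays via the SetDimA/SetStrideA helpers before calling cudnnSetTensorNdDescriptor. For densely packed data, each dimension's stride is the product of all faster-varying dimensions; a small self-contained illustration of that computation (a hypothetical helper, not the framework's):

#include <vector>

// Packed strides for a dense tensor, e.g. NCHW dims [n, c, h, w] ->
// strides [c*h*w, h*w, w, 1], which is what cudnnSetTensorNdDescriptor
// expects for contiguous data.
std::vector<int> PackedStrides(const std::vector<int> &dims) {
  std::vector<int> strides(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}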
@@ -71,6 +71,7 @@ class PoolingGradGpuKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnPoolingBackward(cudnn_handle_, pooling_descriptor_, &alpha, y_descriptor_, y, dy_descriptor_, dy,
x_descriptor_, x_data, &beta, dx_descriptor_, dx),
"cudnnPoolingBackward failed");

@@ -108,6 +109,7 @@ class PoolingGradGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
if (!CheckParam(kernel_node)) {
return false;

@@ -124,14 +126,17 @@ class PoolingGradGpuKernel : public GpuKernel {
if (!InitShape(kernel_node, dimA, strideAin, dimAy, strideAiny, dimAdy, strideAdy, dimAout, strideAout)) {
return true;
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptor(dx_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout),
kernel_node_, cudnnSetTensorNdDescriptor(dx_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(x_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(x_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin),
"cudnnSetTensor4dDescriptor failed");
SetPoolingMode(kernel_node);
SetPad(kernel_node);

@@ -140,41 +145,49 @@ class PoolingGradGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyPoolingDescriptor(pooling_descriptor_),
"cudnnDestroyPoolingDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_descriptor_),
"cudnnDestroyTensorDescriptor failed");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_descriptor_), "cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreatePoolingDescriptor(&pooling_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreatePoolingDescriptor(&pooling_descriptor_),
"cudnnCreatePoolingDescriptor failed");
}
void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(y_descriptor_, &input_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(y_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dx_descriptor_, &output_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dx_descriptor_, &output_size_),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(dy_descriptor_, &input_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dy_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);

if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(x_descriptor_, &input_size_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);

@@ -234,7 +247,8 @@ class PoolingGradGpuKernel : public GpuKernel {
pad_width_ = 0;
}
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN,
2, windowDimA, paddingA, strideA),
"cudnnSetPoolingNdDescriptor failed");
}
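
The backward call above takes the forward output y in addition to dy and x: max pooling must recover which input element produced each output before it can route the gradient. A standalone sketch of the call shape, with all descriptors and device pointers assumed valid and set by the caller:

#include <cudnn.h>

// Sketch: thin wrapper around cudnnPoolingBackward with unit alpha/beta,
// mirroring the argument order used in the kernel above.
cudnnStatus_t PoolingBackward(cudnnHandle_t handle, cudnnPoolingDescriptor_t pool_desc,
                              cudnnTensorDescriptor_t y_desc, const void *y,
                              cudnnTensorDescriptor_t dy_desc, const void *dy,
                              cudnnTensorDescriptor_t x_desc, const void *x,
                              cudnnTensorDescriptor_t dx_desc, void *dx) {
  const float alpha = 1.0f, beta = 0.0f;
  return cudnnPoolingBackward(handle, pool_desc, &alpha, y_desc, y, dy_desc, dy,
                              x_desc, x, &beta, dx_desc, dx);
}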
@@ -65,6 +65,7 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, logits_descriptor_, logits_addr, &beta,
softmax_output_descriptor_, softmax_output_logits),
"cudnnSoftmaxForward failed.");

@@ -74,6 +75,7 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {

@@ -90,10 +92,12 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));

InferInputOutputSize(kernel_node);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
batch_size_, channel_size_, height_, width_),
"cudnnSetTensor4dDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(softmax_output_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_size_,
channel_size_, height_, width_),
"cudnnSetTensor4dDescriptor failed.");

@@ -102,18 +106,18 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(softmax_output_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(logits_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&logits_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&logits_descriptor_),
"cudnnCreateTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&softmax_output_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&softmax_output_descriptor_),
"cudnnCreateTensorDescriptor failed.");
}
void InitSizeLists() override {
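
cuDNN's softmax operates on 4-D NCHW tensors, so a [batch, classes] logits matrix is described with N = batch, C = classes and H = W = 1 — which, as an assumption, is how the batch_size_/channel_size_/height_/width_ fields above are populated for the common 2-D case. A minimal sketch:

#include <cudnn.h>

// Sketch: describe 2-D float logits [batch, classes] as a 4-D NCHW tensor
// so cudnnSoftmaxForward normalizes over the class (channel) dimension.
cudnnStatus_t DescribeLogits(cudnnTensorDescriptor_t desc, int batch, int classes) {
  return cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                                    batch, classes, 1, 1);
}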
@@ -63,7 +63,8 @@ class SoftmaxGpuKernel : public GpuKernel {
const float beta = 0;

if (axis_ == 1) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, input_descriptor_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, input_descriptor_,
input_addr, &beta, output_descriptor_, output_addr),
"cudnnSoftmaxForward failed");
} else {

@@ -72,19 +73,23 @@ class SoftmaxGpuKernel : public GpuKernel {
size_t *input_shape = GetDeviceAddress<size_t>(workspace, 2);
size_t *transpose_shape = GetDeviceAddress<size_t>(workspace, 3);
size_t *transpose_axis = GetDeviceAddress<size_t>(workspace, 4);
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync input_shape failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(transpose_shape, &transpose_shape_[0], workspace_size_,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(transpose_shape, &transpose_shape_[0], workspace_size_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync input_shape failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(transpose_axis, &transpose_axis_[0], workspace_size_,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(transpose_axis, &transpose_axis_[0], workspace_size_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync input_axis failed");
size_t size = input_size_ / sizeof(T);
CalTranspose(size, input_addr, input_shape, transpose_axis, shape_size_, transpose_input_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, input_descriptor_, transpose_input_addr, &beta,
output_descriptor_, transpose_output_addr),
"cudnnSoftmaxForward failed");

@@ -95,6 +100,7 @@ class SoftmaxGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@@ -129,10 +135,12 @@ class SoftmaxGpuKernel : public GpuKernel {
InitSizeByAxis(input_shape, axis[0]);
}
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(batch_size_),
SizeToInt(channel_size_), SizeToInt(height_), SizeToInt(width_)),
"set input_descriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(output_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(batch_size_),
SizeToInt(channel_size_), SizeToInt(height_), SizeToInt(width_)),
"set output_descriptor failed");

@@ -141,15 +149,19 @@ class SoftmaxGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(output_descriptor_),
"destroy output_descriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(input_descriptor_),
"destroy input_descriptor failed");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&input_descriptor_), "create input_descriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&output_descriptor_), "create output_descriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&input_descriptor_),
"create input_descriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&output_descriptor_),
"create output_descriptor failed");
}

void InitSizeLists() override {
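
The axis_ == 1 branch above exists because cuDNN's softmax only normalizes over the channel dimension. For any other axis the kernel transposes that axis into the channel slot, runs cudnnSoftmaxForward, then transposes the result back (the CalTranspose calls). A hypothetical helper showing how such a permutation can be built — illustrative only, the kernel's actual transpose_axis_ construction is not shown in this diff:

#include <cstddef>
#include <utility>
#include <vector>

// Build an identity permutation, then swap the softmax axis into position 1
// (the channel slot). Assumes 1 <= axis < rank.
std::vector<size_t> SoftmaxPermutation(size_t rank, size_t axis) {
  std::vector<size_t> perm(rank);
  for (size_t i = 0; i < rank; ++i) perm[i] = i;
  std::swap(perm[1], perm[axis]);
  return perm;
}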
@@ -70,17 +70,21 @@ class SoftmaxGradGpuKernel : public GpuKernel {
const float beta = 0;

if (axis_ == 1) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSoftmaxBackward(cudnn_handle_, algo_, mode_, &alpha, y_desc_, y_addr, y_desc_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSoftmaxBackward(cudnn_handle_, algo_, mode_, &alpha, y_desc_, y_addr, y_desc_,
dy_addr, &beta, y_desc_, dx_addr),
"cudnnSoftmaxBackward failed");
} else {
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync input_shape failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(transpose_shape, &transpose_shape_[0], workspace_size_,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(transpose_shape, &transpose_shape_[0], workspace_size_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync input_shape failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(transpose_axis, &transpose_axis_[0], workspace_size_,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(transpose_axis, &transpose_axis_[0], workspace_size_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync input_axis failed");
size_t size = input_size_ / sizeof(T);

@@ -88,7 +92,8 @@ class SoftmaxGradGpuKernel : public GpuKernel {
reinterpret_cast<cudaStream_t>(stream_ptr));
CalTranspose(size, dy_addr, input_shape, transpose_axis, shape_size_, transpose_dy_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSoftmaxBackward(cudnn_handle_, algo_, mode_, &alpha, y_desc_, transpose_y_addr,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSoftmaxBackward(cudnn_handle_, algo_, mode_, &alpha, y_desc_, transpose_y_addr,
y_desc_, transpose_dy_addr, &beta, y_desc_, transpose_dx_addr),
"cudnnSoftmaxBackward failed");
CalTranspose(size, transpose_dx_addr, transpose_shape, transpose_axis, shape_size_, dx_addr,

@@ -98,6 +103,7 @@ class SoftmaxGradGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);

@@ -135,6 +141,7 @@ class SoftmaxGradGpuKernel : public GpuKernel {
InitSizeByAxis(input_shape, axis[0]);
}
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(y_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(batch_size_),
SizeToInt(channel_size_), SizeToInt(height_), SizeToInt(width_)),
"set input_descriptor failed");

@@ -143,13 +150,13 @@ class SoftmaxGradGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&y_desc_), "create input_descriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "create input_descriptor failed");
}

void InitSizeLists() override {
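
Both softmax kernels stage host-side shape metadata into device workspace with cudaMemcpyAsync on the launch stream; because the copy and the consuming transpose kernel are enqueued on the same stream, no extra synchronization is needed between them. A self-contained sketch of that staging step (a hypothetical helper, not the framework's):

#include <cuda_runtime.h>
#include <vector>

// Enqueue a host-to-device copy of shape metadata on the given stream.
// Note: the host vector must remain alive until the copy has completed.
cudaError_t StageShape(size_t *dev_shape, const std::vector<size_t> &host_shape,
                       cudaStream_t stream) {
  return cudaMemcpyAsync(dev_shape, host_shape.data(),
                         host_shape.size() * sizeof(size_t),
                         cudaMemcpyHostToDevice, stream);
}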
@@ -64,6 +64,7 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSoftmaxForward(cudnn_handle_, algo_, mode_, &alpha, logits_descriptor_, logits_addr, &beta,
softmax_output_descriptor_, softmax_output_logits),
"cudnnSoftmaxForward failed.");

@@ -75,6 +76,7 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {

@@ -92,10 +94,12 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));

InferInputOutputSize(kernel_node);
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensor4dDescriptor(logits_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
batch_size_, channel_size_, height_, width_),
"cudnnSetTensor4dDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(softmax_output_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, batch_size_,
channel_size_, height_, width_),
"cudnnSetTensor4dDescriptor failed.");

@@ -104,18 +108,18 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(softmax_output_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_),
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(logits_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}

protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&logits_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&logits_descriptor_),
"cudnnCreateTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&softmax_output_descriptor_),
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&softmax_output_descriptor_),
"cudnnCreateTensorDescriptor failed.");
}
void InitSizeLists() override {
@@ -38,15 +38,18 @@ class AssignGpuKernel : public GpuKernel {
T *value = GetDeviceAddress<T>(inputs, 1);
T *output = GetDeviceAddress<T>(outputs, 0);
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemcpyAsync(var, value, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemxcpyAsync failed.");
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemcpyAsync(output, value, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemxcpyAsync failed.");
return true;
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
if (!CheckParam(kernel_node)) {
return false;
}
@@ -41,7 +41,8 @@ class GpuConvertToDynamicShapeGpuKernel : public GpuKernel {
T *output_device_address = GetDeviceAddress<T>(outputs, 0);
cuda_stream_ptr_ = stream_ptr;

CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output_device_address, input_device_address, input_size_ * sizeof(T),
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(output_device_address, input_device_address, input_size_ * sizeof(T),
cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Failed to copy gpu memory.");

@@ -49,7 +50,7 @@ class GpuConvertToDynamicShapeGpuKernel : public GpuKernel {
}

void PostExecute() override {
CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream_ptr_)),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream_ptr_)),
"cudaStreamSynchronized failed");

std::vector<TypeId> output_types = {AnfAlgo::GetOutputInferDataType(c_node_ptr_, 0)};

@@ -58,6 +59,7 @@ class GpuConvertToDynamicShapeGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_count = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_count != 1) {
MS_LOG(ERROR) << input_count << "inputs were provided, but GpuConvertToDynamicShapeGpuKernel exepects 1.";
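
GpuConvertToDynamicShapeGpuKernel copies asynchronously in Launch and only blocks in PostExecute, immediately before the host inspects device-produced results to rebuild the output's inferred shape. A sketch of that synchronization step, assuming the same opaque void* stream handle the kernels above use:

#include <cuda_runtime.h>

// Block the host until everything enqueued on the launch stream has finished,
// so device results can safely be read on the host afterwards.
cudaError_t WaitForStream(void *stream_ptr) {
  return cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr));
}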
@@ -68,10 +68,12 @@ class BatchNormFold2GradGpuKernel : public GpuKernel {

int32_t current_step_host[1];
size_t x_size = batch_size_ * channel_ * height_ * width_ * sizeof(T);
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(current_step_host, global_step, sizeof(int32_t), cudaMemcpyDeviceToHost,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(current_step_host, global_step, sizeof(int32_t), cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Failed to copy gpu memory.");
CHECK_CUDA_RET_WITH_ERROR(
kernel_node_,
cudaMemcpyAsync(d_x, dout, x_size, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Failed to copy gpu memory.");

@@ -90,8 +92,8 @@ class BatchNormFold2GradGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();

size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 8) {
MS_LOG(ERROR) << "Argument number is " << input_num << ", but BatchNormFold2GradGpuKernel needs 8.";
@@ -58,7 +58,8 @@ class BatchNormFoldGpuKernel : public GpuKernel {
auto variance = GetDeviceAddress<T>(inputs, 2);
int *current_step = GetDeviceAddress<int>(inputs, 3);
int current_step_host[1];
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memoy failed.");
if (x == nullptr) {

@@ -83,21 +84,24 @@ class BatchNormFoldGpuKernel : public GpuKernel {
auto running_std = GetDeviceAddress<T>(outputs, 3);
auto y = GetDeviceAddress<T>(workspace, 0);

CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(running_mean, mean, output_size_, cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(running_mean, mean, output_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Failed to copy gpu memory.");
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(running_std, variance, output_size_, cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(running_std, variance, output_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Failed to copy gpu memory.");
CalUpdateRunningStd(channel_, epsilon_, running_std, reinterpret_cast<cudaStream_t>(stream_ptr));
if (!is_training_ || current_step_host[0] >= freeze_bn_) {
CHECK_CUDA_RET_WITH_ERROR(cudaMemset(batch_mean, 0, output_size_), "Failed to set gpu memory.");
CHECK_CUDA_RET_WITH_ERROR(kernel_node_, cudaMemset(batch_mean, 0, output_size_), "Failed to set gpu memory.");
ThrustFillWith(batch_std, channel_, 1.f, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
const T alpha = 1;
const T beta = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnBatchNormalizationForwardTraining(
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnBatchNormalizationForwardTraining(
handle_, mode_, &alpha, &beta, x_desc_, x, x_desc_, y, scale_bias_mean_var_desc_,
mean, mean, exp_avg_factor_, mean, variance, epsilon_, batch_mean, batch_std),
"Failed to launch kernel.")

@@ -106,6 +110,7 @@ class BatchNormFoldGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
InitResource();
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {

@@ -141,10 +146,12 @@ class BatchNormFoldGpuKernel : public GpuKernel {

cudnnDataType_t cudnnDataType = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, cudnnDataType, batch_, channel_, height_, width_),
"Set x desc failed");

CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetTensor4dDescriptor(scale_bias_mean_var_desc_, CUDNN_TENSOR_NCHW, cudnnDataType, 1, channel_, 1, 1),
"Set para desc failed");

@@ -153,8 +160,9 @@ class BatchNormFoldGpuKernel : public GpuKernel {
}

void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_),
"Destroy para desc failed");
}

protected:

@@ -177,8 +185,9 @@ class BatchNormFoldGpuKernel : public GpuKernel {

void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_), "Create para desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_),
"Create para desc failed");
}

private:
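
BatchNormFoldGpuKernel freezes batch statistics once training stops or the step counter passes freeze_bn_: batch_mean is zeroed and batch_std filled with ones, so the fold relies on the running statistics only; otherwise it runs cudnnBatchNormalizationForwardTraining to produce fresh per-batch statistics. The gating condition, isolated as a trivial sketch:

// Sketch of the freeze decision above: frozen folding neutralizes per-batch
// statistics (mean = 0, std = 1) in favor of the running statistics.
bool UseFrozenStats(bool is_training, int current_step, int freeze_bn) {
  return !is_training || current_step >= freeze_bn;
}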
@@ -56,7 +56,8 @@ class BatchNormFoldGradGpuKernel : public GpuKernel {
T *batch_std = GetDeviceAddress<T>(inputs, 4);
int *current_step = GetDeviceAddress<int>(inputs, 5);
int current_step_host[1];
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(current_step_host, current_step, sizeof(int), cudaMemcpyDeviceToHost,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memoy failed.");
if (d_batch_mean == nullptr) {

@@ -95,6 +96,7 @@ class BatchNormFoldGradGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 6) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but BatchNormFoldGrad GpuKernel OP needs 6 input.";
@@ -42,6 +42,7 @@ const std::vector<size_t> &FakeQuantPerChannelGpuKernel::GetOutputSizeList() con
const std::vector<size_t> &FakeQuantPerChannelGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }

bool FakeQuantPerChannelGpuKernel::Init(const CNodePtr &kernel_node) {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 input.";

@@ -130,7 +131,8 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
if (global_step_ >= quant_delay_) {
CalFakeQuantize(input, output, input_min, input_max, nudge_min, nudge_max, scale, stream_ptr);
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed.");
}
@@ -39,6 +39,7 @@ const std::vector<size_t> &FakeQuantPerChannelGradGpuKernel::GetWorkspaceSizeLis
}

bool FakeQuantPerChannelGradGpuKernel::Init(const CNodePtr &kernel_node) {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuantGrad GpuKernel OP needs 4 output.";

@@ -123,7 +124,8 @@ bool FakeQuantPerChannelGradGpuKernel::Launch(const std::vector<AddressPtr> &inp
CalFakeQuantPerChannelGrad(input, gradient, output, total_size, num_channels_, nudge_min, nudge_max,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed.");
}
@@ -42,6 +42,7 @@ const std::vector<size_t> &FakeQuantPerLayerGpuKernel::GetOutputSizeList() const
const std::vector<size_t> &FakeQuantPerLayerGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }

bool FakeQuantPerLayerGpuKernel::Init(const CNodePtr &kernel_node) {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output.";

@@ -122,7 +123,8 @@ bool FakeQuantPerLayerGpuKernel::Launch(const std::vector<AddressPtr> &inputs, c
CalFakeQuantPerLayer(input, output, quant_num_, nudge_min, nudge_max, scale,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed");
}
@ -38,6 +38,7 @@ const std::vector<size_t> &FakeQuantPerLayerGradGpuKernel::GetOutputSizeList() c
const std::vector<size_t> &FakeQuantPerLayerGradGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }

bool FakeQuantPerLayerGradGpuKernel::Init(const CNodePtr &kernel_node) {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuantGrad GpuKernel OP needs 4 inputs.";

@ -120,7 +121,8 @@ bool FakeQuantPerLayerGradGpuKernel::Launch(const std::vector<AddressPtr> &input
CalFakeQuantPerLayerGrad(input, gradient, output, quant_num_, nudge_min, nudge_max,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice,
CHECK_CUDA_RET_WITH_ERROR(kernel_node_,
cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed.");
}

@ -47,7 +47,8 @@ class RandomCategoricalGpuKernel : public GpuKernel {
host_cdf[i] = GetDeviceAddress<double>(workspaces, i);
}
double **dev_cdf = GetDeviceAddress<double *>(workspaces, batch_size_);
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_cdf, // NOLINT
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(dev_cdf, // NOLINT
host_cdf.get(), sizeof(double *) * batch_size_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Random_categorical cudaMemcpyAsync dev_cdf failed");

@ -68,12 +69,14 @@ class RandomCategoricalGpuKernel : public GpuKernel {
for (int j = 0; j < num_samples_; j++) {
host_1d_rand[j] = dist(rng);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(host_rand[i], // NOLINT
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(host_rand[i], // NOLINT
host_1d_rand.get(), sizeof(double) * num_samples_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Random_categorical cudaMemcpyAsync host_1d_rand failed");
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(dev_rand, // NOLINT
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(dev_rand, // NOLINT
host_rand.get(), sizeof(double *) * batch_size_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Random_categorical cudaMemcpyAsync dev_rand failed");

@ -86,6 +89,7 @@ class RandomCategoricalGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but RandomCategorical needs 3 inputs.";

@ -47,7 +47,8 @@ class UniformCandidateSamplerGpuKernel : public GpuKernel {
if (remove_accidental_hits_) {
T *input = GetDeviceAddress<T>(inputs, 0);
array_input_ = std::vector<T>(input_size_, 0);
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&array_input_[0], input, input_size_ * sizeof(T),
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(&array_input_[0], input, input_size_ * sizeof(T),
cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync array_input_ failed");
for (const auto item : array_input_) {

@ -58,7 +59,8 @@ class UniformCandidateSamplerGpuKernel : public GpuKernel {
float prob = Probability();
size_t sampled_candidates_size = num_sampled_ * sizeof(T);
S value = ApproximateExpectedCount(prob, num_sampled_, counter);
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(sampled_candidates, &sampled_candidates_[0], sampled_candidates_size,
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(sampled_candidates, &sampled_candidates_[0], sampled_candidates_size,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync sampled_candidates failed");
CalUniformCandidateSampler(static_cast<int>(input_size_), num_sampled_, value, true_expected_count,

@ -67,6 +69,7 @@ class UniformCandidateSamplerGpuKernel : public GpuKernel {
}

bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but UniformCandidateSampler needs 1 input.";

@ -16,7 +16,7 @@

#include "runtime/device/gpu/blocking_queue.h"
#include <chrono>
#include "runtime/device/gpu/gpu_common.h"
#include "runtime/device/gpu/queue_common.h"
#include "utils/ms_utils.h"

namespace mindspore {

@ -22,6 +22,7 @@
#include <algorithm>
#include <map>
#include "utils/log_adapter.h"
#include "utils/trace_base.h"
#include "include/curand.h"

namespace mindspore {

@ -43,25 +44,34 @@ namespace gpu {
} \
}

#define CHECK_CUDA_RET_WITH_ERROR(expression, message) \
{ \
cudaError_t status = (expression); \
if (status != cudaSuccess) { \
MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \
<< cudaGetErrorString(status); \
} \
#define CHECK_CUDA_RET_WITH_ERROR(node, expression, message) \
{ \
cudaError_t status = (expression); \
if (status != cudaSuccess) { \
MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " << cudaGetErrorString(status) \
<< trace::DumpSourceLines(node); \
} \
}

#define CHECK_CUDA_RET_WITH_EXCEPT(expression, message) \
#define CHECK_CUDA_RET_WITH_EXCEPT(node, expression, message) \
{ \
cudaError_t status = (expression); \
if (status != cudaSuccess) { \
MS_LOG(EXCEPTION) << "CUDA Error: " << message << " | Error Number: " << status << " " \
<< cudaGetErrorString(status); \
<< cudaGetErrorString(status) << trace::DumpSourceLines(node); \
} \
}

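For reference, the exception variant now expands, roughly, to the block below. This hand-expansion is illustrative (do_copy() stands in for any CUDA call) and simply substitutes the macro arguments:

// Hand-expansion of:
//   CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, do_copy(), "Copy gpu memory failed.");
{
  cudaError_t status = (do_copy());
  if (status != cudaSuccess) {
    // MS_LOG(EXCEPTION) throws after logging, so execution never continues past a failure.
    MS_LOG(EXCEPTION) << "CUDA Error: " << "Copy gpu memory failed." << " | Error Number: " << status << " "
                      << cudaGetErrorString(status) << trace::DumpSourceLines(kernel_node_);
  }
}
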
#define CHECK_CUDNN_RET_WITH_EXCEPT(expression, message) \
#define CHECK_CUDNN_RET_WITH_EXCEPT(node, expression, message) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \
MS_LOG(EXCEPTION) << "cuDNN Error: " << message << " | Error Number: " << status << " " \
<< cudnnGetErrorString(status) << trace::DumpSourceLines(node); \
} \
}

#define CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(expression, message) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \

@ -70,7 +80,7 @@ namespace gpu {
} \
}

#define CHECK_CUDNN_RET_WITH_ERROR(expression, message) \
#define CHECK_CUDNN_RET_WITH_ERROR_NOTRACE(expression, message) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \

@ -79,7 +89,16 @@ namespace gpu {
} \
}

#define CHECK_CUBLAS_RET_WITH_EXCEPT(expression, message) \
#define CHECK_CUDNN_RET_WITH_ERROR(node, expression, message) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \
MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status << " " \
<< cudnnGetErrorString(status) << trace::DumpSourceLines(node); \
} \
}

#define CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(expression, message) \
{ \
cublasStatus_t status = (expression); \
if (status != CUBLAS_STATUS_SUCCESS) { \

@ -87,6 +106,15 @@ namespace gpu {
} \
}

#define CHECK_CUBLAS_RET_WITH_EXCEPT(node, expression, message) \
{ \
cublasStatus_t status = (expression); \
if (status != CUBLAS_STATUS_SUCCESS) { \
MS_LOG(EXCEPTION) << "cuBLAS Error: " << message << " | Error Number: " << status \
<< trace::DumpSourceLines(node); \
} \
}

#define CHECK_CUBLAS_RET_WITH_ERROR(expression, message) \
{ \
cublasStatus_t status = (expression); \

@ -95,7 +123,7 @@ namespace gpu {
} \
}

#define CHECK_CUSOLVER_RET_WITH_EXCEPT(expression, message) \
#define CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(expression, message) \
{ \
cusolverStatus_t status = (expression); \
if (status != CUSOLVER_STATUS_SUCCESS) { \

@ -103,6 +131,16 @@ namespace gpu {
} \
}

#define CHECK_CUSOLVER_RET_WITH_EXCEPT(node, expression, message) \
{ \
cusolverStatus_t status = (expression); \
if (status != CUSOLVER_STATUS_SUCCESS) { \
MS_LOG(EXCEPTION) << "cusolver Error: " << message << " | Error Number: " << status \
<< trace::DumpSourceLines(node); \
} \
}

#define CHECK_CUSOLVER_RET_WITH_ERROR(expression, message) \
{ \
cusolverStatus_t status = (expression); \

@ -111,12 +149,12 @@ namespace gpu {
} \
}

#define CHECK_NCCL_RET_WITH_EXCEPT(expression, message) \
{ \
int result = (expression); \
if (result != ncclSuccess) { \
MS_LOG(EXCEPTION) << "NCCL Error: " << message << " | Error Number: " << result; \
} \
#define CHECK_NCCL_RET_WITH_EXCEPT(node, expression, message) \
{ \
int result = (expression); \
if (result != ncclSuccess) { \
MS_LOG(EXCEPTION) << "NCCL Error: " << message << " | Error Number: " << result << trace::DumpSourceLines(node); \
} \
}

#define VARIABLE_NOT_USED(var) \

@ -26,14 +26,16 @@ namespace gpu {
void GPUDeviceManager::InitDevice() {
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::set_current_device(SizeToInt(cur_dev_id_)), "Failed to set current device id");
CHECK_OP_RET_WITH_EXCEPT(CreateStream(&default_stream_), "Failed to create CUDA stream.");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
"Failed to set stream for cuDNN handle.");
CHECK_CUBLAS_RET_WITH_EXCEPT(cublasCreate(&cublas_handle_), "Failed to create cuBLAS handle.");
CHECK_CUBLAS_RET_WITH_EXCEPT(cublasSetStream(cublas_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
"Failed to set stream for cuBLAS handle.");
CHECK_CUSOLVER_RET_WITH_EXCEPT(cusolverDnCreate(&cusolver_dn_handle_), "Failed to create cusolver dn handle.");
CHECK_CUSOLVER_RET_WITH_EXCEPT(
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
"Failed to set stream for cuDNN handle.");
CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(cublasCreate(&cublas_handle_), "Failed to create cuBLAS handle.");
CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(
cublasSetStream(cublas_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
"Failed to set stream for cuBLAS handle.");
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(cusolverDnCreate(&cusolver_dn_handle_),
"Failed to create cusolver dn handle.");
CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
cusolverDnSetStream(cusolver_dn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
"Failed to set stream for cusolver dn handle");
CHECK_OP_RET_WITH_EXCEPT(GPUMemoryAllocator::GetInstance().Init(), "Failed to Init gpu memory allocator")

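Device bring-up and teardown run before or outside any kernel graph, so these call sites move to the _NOTRACE variants instead of gaining a node argument. The split, side by side (desc_ below is a hypothetical descriptor member, used only for contrast):

// During device init there is no graph node, so no trace is requested:
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
// Inside a kernel, the cached node makes the same kind of check traceable:
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&desc_),
                            "cudnnCreateTensorDescriptor failed.");
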
@ -46,7 +48,7 @@ void GPUDeviceManager::ReleaseDevice() {
}
}
if (cudnn_handle_ != nullptr) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroy(cudnn_handle_), "Failed to destroy cuDNN handle");
CHECK_CUDNN_RET_WITH_ERROR_NOTRACE(cudnnDestroy(cudnn_handle_), "Failed to destroy cuDNN handle");
}
if (cublas_handle_ != nullptr) {
CHECK_CUBLAS_RET_WITH_ERROR(cublasDestroy(cublas_handle_), "Failed to destroy cuBLAS handle.");

@ -174,7 +174,8 @@ bool GenSendRecvCNodesForAllReduce(const std::shared_ptr<session::KernelGraph> &
MS_EXCEPTION_IF_NULL(*recv_node);

cudaEvent_t event = nullptr;
CHECK_CUDA_RET_WITH_EXCEPT(cudaEventCreate(&event, cudaEventDisableTiming), "Creating cuda event failed.");
CHECK_CUDA_RET_WITH_EXCEPT(*send_node, cudaEventCreate(&event, cudaEventDisableTiming),
"Creating cuda event failed.");
AnfAlgo::SetNodeAttr(kAttrRecordEvent, MakeValue(reinterpret_cast<uintptr_t>(event)), *send_node);
AnfAlgo::SetNodeAttr(kAttrWaitEvent, MakeValue(reinterpret_cast<uintptr_t>(event)), *recv_node);

@ -0,0 +1,42 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_QUEUE_COMMON_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_QUEUE_COMMON_H_

#include <iostream>
#include <vector>
#include <algorithm>
#include <map>
#include "utils/log_adapter.h"
#include "include/curand.h"

namespace mindspore {
namespace device {
namespace gpu {
#define CHECK_CUDA_RET_WITH_ERROR(expression, message) \
{ \
cudaError_t status = (expression); \
if (status != cudaSuccess) { \
MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \
<< cudaGetErrorString(status); \
} \
}
} // namespace gpu
} // namespace device
} // namespace mindspore

#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_QUEUE_COMMON_H_
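
The new queue_common.h keeps a trace-free, two-argument copy of CHECK_CUDA_RET_WITH_ERROR for the data-queue code, which has no kernel node to report; that is why blocking_queue.h above swaps its gpu_common.h include for this header. A hedged sketch of a queue-side call under that assumption (CopyHostDataToQueue is illustrative, not part of this commit):

// Illustrative only: queue code checks CUDA calls without a node argument.
void CopyHostDataToQueue(void *device_dst, const void *host_src, size_t len, cudaStream_t stream) {
  CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(device_dst, host_src, len, cudaMemcpyHostToDevice, stream),
                            "Memcpy to data queue failed");
}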