diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h
index c87082c9f2a..08b8abc2f85 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h
@@ -63,19 +63,21 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     if (!CheckParam(kernel_node)) {
       return false;
     }
-    axis_ = GetAttr<int>(kernel_node, "axis");
     if (axis_ < 0) {
-      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+      auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
       axis_ += SizeToInt(input_shape.size());
     }
+    auto origin_data_format = AnfAlgo::GetOriginDataFormat(kernel_node);
+    auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
+    axis_ = AxisTransform(origin_data_format, input_format, axis_);

     input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
     inputs_host_ = std::make_unique<T *[]>(input_num_);
     len_axis_ = std::make_unique<int[]>(input_num_);
     for (int i = 0; i < input_num_; i++) {
       size_t input_size = 1;
-      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
+      auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i);
       for (size_t j = 0; j < input_shape.size(); j++) {
         input_size *= input_shape[j];
       }
@@ -85,7 +87,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     workspace_size_list_.push_back(sizeof(T *) * input_num_);
     workspace_size_list_.push_back(sizeof(int) * input_num_);

-    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
     output_size_ = 1;
     for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
       output_size_ *= output_shape[i];
@@ -98,7 +100,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
       }
     }
     output_size_list_.push_back(output_size_ * sizeof(T));
-    InitSizeLists();
     return true;
   }

diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
index 61c18680d54..fe363145d01 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <map>
 #include "backend/kernel_compiler/kernel.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
 #include "runtime/device/gpu/gpu_device_manager.h"
@@ -31,6 +32,19 @@ using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

 namespace mindspore {
 namespace kernel {
+static std::map<int, int> kNCHWToNHWCAxisMap = {
+  {0, 0},
+  {1, 3},
+  {2, 1},
+  {3, 2},
+};
+static std::map<int, int> kNHWCToNCHWAxisMap = {
+  {0, 0},
+  {1, 2},
+  {2, 3},
+  {3, 1},
+};
+
 class GpuKernel : public KernelMod {
  public:
   virtual ~GpuKernel() = default;
@@ -74,6 +88,18 @@ class GpuKernel : public KernelMod {
     dst->push_back(src.size() == 0 ? 1 : SizeToInt(src[src.size() - 1]));
   }

+  int AxisTransform(const std::string &origin_data_format, const std::string &cal_format, int axis) {
+    if (((origin_data_format == kOpFormat_DEFAULT) || (origin_data_format == kOpFormat_NCHW)) &&
+        (cal_format == kOpFormat_NHWC)) {
+      return kNCHWToNHWCAxisMap[axis];
+    } else if (((cal_format == kOpFormat_DEFAULT) || (cal_format == kOpFormat_NCHW)) &&
+               (origin_data_format == kOpFormat_NHWC)) {
+      return kNHWCToNCHWAxisMap[axis];
+    } else {
+      return axis;
+    }
+  }
+
   // transpose shape: NCHW To NHWC
   void ShapeNCHW2NHWC(std::vector<size_t> *shape) {
     std::swap((*shape)[1], (*shape)[3]);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
index 49b6471e28f..e7471d8cb15 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
@@ -82,7 +82,7 @@ class AddNGpuFwdKernel : public GpuKernel {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output.";
       return false;
     }
-    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shape);
     if (is_null_input_) {
       MS_LOG(WARNING) << "AddNGpuFwdKernel input is null";
@@ -96,9 +96,16 @@ class AddNGpuFwdKernel : public GpuKernel {
     for (size_t i = 0; i < input_shape.size(); i++) {
       dimA[i] = SizeToInt(input_shape[i]);
     }
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
-                                                             SizeToInt(input_shape.size()), dimA),
-                                "cudnnSetTensorNdDescriptor failed");
+    auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
+    if (input_format == kOpFormat_NHWC) {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
+                                                               SizeToInt(input_shape.size()), dimA),
+                                  "cudnnSetTensorNdDescriptor failed");
+    } else {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
+                                                               SizeToInt(input_shape.size()), dimA),
+                                  "cudnnSetTensorNdDescriptor failed");
+    }
     InitSizeLists();
     return true;
   }
diff --git a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc
index f231f9cdc39..2649a43df22 100644
--- a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc
@@ -194,6 +194,12 @@ void UpdateKernelFormatInfo(const CNodePtr &kernel_node, const std::vector
 fullname_with_scope() << ", format: " << cal_format;
   auto inputs_format_position = iter->second.first;
+  // If input position is empty, then insert all the input positions, because the input numbers of this op are variable.
+  if (inputs_format_position.size() == 0) {
+    for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); input_index++) {
+      inputs_format_position.push_back(input_index);
+    }
+  }
   for (const auto &input_format_position : inputs_format_position) {
     if (input_format_position >= inputs_format->size()) {
       MS_LOG(EXCEPTION) << "The position [" << input_format_position << "] is out of range of the input size ["
diff --git a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.h b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.h
index c78fed7b9e1..3c8d3c69564 100644
--- a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.h
+++ b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.h
@@ -30,6 +30,7 @@ namespace mindspore {
 namespace device {
 namespace gpu {
 // map, used for getting the insert position of format transform.
+// If input position is empty, then insert all the input positions, because the input numbers of this op are variable.
 static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>>> kKernelFormatPositionMap = {
   {prim::kPrimConv2D->name(), {{0, 1}, {0}}},
   {prim::kPrimConv2DBackpropInput->name(), {{0, 1}, {0}}},
@@ -47,6 +48,8 @@
   {kFusedBatchNormGradEx, {{0, 1}, {0}}},
   {kFusedBatchNormGradExWithActivation, {{0, 1, 7}, {0}}},
   {kFusedBatchNormGradExWithAddAndActivation, {{0, 1, 7}, {0, 3}}},
+  {prim::kPrimConcat->name(), {{}, {0}}},
+  {prim::kPrimAddN->name(), {{}, {0}}},
 };

 void SetKernelInfo(const CNodePtr &apply_kernel_ptr);
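For context, the axis remapping that the new AxisTransform helper performs can be illustrated outside the patch. The sketch below is a standalone approximation, not MindSpore code: it hard-codes the two lookup tables that mirror kNCHWToNHWCAxisMap / kNHWCToNCHWAxisMap from gpu_kernel.h, uses plain "NCHW"/"NHWC" strings in place of the kOpFormat_* constants, and omits the patch's handling of kOpFormat_DEFAULT (which the real helper treats the same as NCHW).

#include <iostream>
#include <map>
#include <string>

// Mirrors the kNCHWToNHWCAxisMap / kNHWCToNCHWAxisMap tables from the patch:
// the same tensor dimension lives at a different index in the other layout.
static const std::map<int, int> kNchwToNhwc = {{0, 0}, {1, 3}, {2, 1}, {3, 2}};
static const std::map<int, int> kNhwcToNchw = {{0, 0}, {1, 2}, {2, 3}, {3, 1}};

// Simplified stand-in for GpuKernel::AxisTransform: remap an axis attribute
// expressed in the op's original data format into the calculation format.
int AxisTransform(const std::string &origin_format, const std::string &cal_format, int axis) {
  if (origin_format == "NCHW" && cal_format == "NHWC") {
    return kNchwToNhwc.at(axis);
  }
  if (origin_format == "NHWC" && cal_format == "NCHW") {
    return kNhwcToNchw.at(axis);
  }
  return axis;  // formats agree, nothing to remap
}

int main() {
  // A Concat along the channel axis (1 in NCHW) must concatenate along
  // axis 3 once the inputs are laid out as NHWC on the device.
  std::cout << AxisTransform("NCHW", "NHWC", 1) << "\n";  // prints 3
  std::cout << AxisTransform("NHWC", "NCHW", 3) << "\n";  // prints 1
  return 0;
}

This is also why Concat and AddN are registered in kKernelFormatPositionMap with an empty input-position list: their input count is variable, so the patch expands the empty list to every input position before inserting format transforms.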