forked from mindspore-Ecosystem/mindspore
gpu GoogleNet performance optimize
This commit is contained in:
parent 5613116caf
commit ff6b64a598
@@ -63,19 +63,21 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     if (!CheckParam(kernel_node)) {
       return false;
     }

     axis_ = GetAttr<int>(kernel_node, "axis");
     if (axis_ < 0) {
-      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+      auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
       axis_ += SizeToInt(input_shape.size());
     }
+    auto origin_data_format = AnfAlgo::GetOriginDataFormat(kernel_node);
+    auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
+    axis_ = AxisTransform(origin_data_format, input_format, axis_);

     input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
     inputs_host_ = std::make_unique<T *[]>(input_num_);
     len_axis_ = std::make_unique<int[]>(input_num_);
     for (int i = 0; i < input_num_; i++) {
       size_t input_size = 1;
-      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
+      auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i);
       for (size_t j = 0; j < input_shape.size(); j++) {
         input_size *= input_shape[j];
       }
@@ -85,7 +87,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     workspace_size_list_.push_back(sizeof(T *) * input_num_);
     workspace_size_list_.push_back(sizeof(int) * input_num_);

-    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
     output_size_ = 1;
     for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
       output_size_ *= output_shape[i];
@@ -98,7 +100,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
       }
     }
     output_size_list_.push_back(output_size_ * sizeof(T));

     InitSizeLists();
     return true;
   }
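Note on the ConcatV2 change above: once the runtime lays float16 data out in NHWC, the kernel sizes its buffers from the device shapes and concatenates along the transformed axis. The following is only an illustrative standalone sketch of that remapping for a channel-wise concat; the shapes and the hard-coded device axis below are hypothetical, not the kernel's code.

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical NHWC device shapes of two inputs concatenated along what was
  // axis 1 (channels) in the original NCHW graph; the NCHW->NHWC map sends 1 to 3.
  std::vector<size_t> a = {32, 56, 56, 64};   // N, H, W, C
  std::vector<size_t> b = {32, 56, 56, 128};  // N, H, W, C
  int device_axis = 3;

  std::vector<size_t> out = a;
  out[device_axis] += b[device_axis];  // concat only grows the transformed axis

  size_t output_size = 1;
  for (size_t d : out) output_size *= d;  // element count, analogous to output_size_
  std::printf("output shape: %zux%zux%zux%zu, elements: %zu\n", out[0], out[1], out[2], out[3], output_size);
  return 0;
}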
@@ -22,6 +22,7 @@
 #include <string>
 #include <vector>
 #include <utility>
+#include <map>
 #include "backend/kernel_compiler/kernel.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
 #include "runtime/device/gpu/gpu_device_manager.h"
@@ -31,6 +32,19 @@ using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

 namespace mindspore {
 namespace kernel {
+static std::map<int, int> kNCHWToNHWCAxisMap = {
+  {0, 0},
+  {1, 3},
+  {2, 1},
+  {3, 2},
+};
+static std::map<int, int> kNHWCToNCHWAxisMap = {
+  {0, 0},
+  {1, 2},
+  {2, 3},
+  {3, 1},
+};
+
 class GpuKernel : public KernelMod {
  public:
   virtual ~GpuKernel() = default;
@@ -74,6 +88,18 @@ class GpuKernel : public KernelMod {
     dst->push_back(src.size() == 0 ? 1 : SizeToInt(src[src.size() - 1]));
   }

+  int AxisTransform(const std::string &origin_data_format, const std::string &cal_format, int axis) {
+    if (((origin_data_format == kOpFormat_DEFAULT) || (origin_data_format == kOpFormat_NCHW)) &&
+        (cal_format == kOpFormat_NHWC)) {
+      return kNCHWToNHWCAxisMap[axis];
+    } else if (((cal_format == kOpFormat_DEFAULT) || (cal_format == kOpFormat_NCHW)) &&
+               (origin_data_format == kOpFormat_NHWC)) {
+      return kNHWCToNCHWAxisMap[axis];
+    } else {
+      return axis;
+    }
+  }
+
   // transpose shape: NCHW To NHWC
   void ShapeNCHW2NHWC(std::vector<size_t> *shape) {
     std::swap((*shape)[1], (*shape)[3]);
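A self-contained sketch of the axis translation these maps implement. The bare "NCHW", "NHWC", and "DefaultFormat" strings below stand in for the kOpFormat_* constants used above; the function name is local to the sketch.

#include <cassert>
#include <map>
#include <string>

static std::map<int, int> kNCHWToNHWC = {{0, 0}, {1, 3}, {2, 1}, {3, 2}};
static std::map<int, int> kNHWCToNCHW = {{0, 0}, {1, 2}, {2, 3}, {3, 1}};

// Mirrors AxisTransform above: translate an axis index between the graph's
// original data format and the format the kernel actually computes in.
int AxisTransformSketch(const std::string &origin, const std::string &cal, int axis) {
  if ((origin == "NCHW" || origin == "DefaultFormat") && cal == "NHWC") return kNCHWToNHWC[axis];
  if ((cal == "NCHW" || cal == "DefaultFormat") && origin == "NHWC") return kNHWCToNCHW[axis];
  return axis;  // same layout family on both sides: nothing to translate
}

int main() {
  assert(AxisTransformSketch("NCHW", "NHWC", 1) == 3);  // channels move to the last axis
  assert(AxisTransformSketch("NHWC", "NCHW", 3) == 1);
  assert(AxisTransformSketch("NCHW", "NCHW", 2) == 2);
  return 0;
}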
@@ -82,7 +82,7 @@ class AddNGpuFwdKernel : public GpuKernel {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output.";
       return false;
     }
-    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shape);
     if (is_null_input_) {
       MS_LOG(WARNING) << "AddNGpuFwdKernel input is null";
@@ -96,9 +96,16 @@ class AddNGpuFwdKernel : public GpuKernel {
     for (size_t i = 0; i < input_shape.size(); i++) {
       dimA[i] = SizeToInt(input_shape[i]);
     }
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
-                                                             SizeToInt(input_shape.size()), dimA),
-                                "cudnnSetTensorNdDescriptor failed");
+    auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
+    if (input_format == kOpFormat_NHWC) {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
+                                                               SizeToInt(input_shape.size()), dimA),
+                                  "cudnnSetTensorNdDescriptor failed");
+    } else {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
+                                                               SizeToInt(input_shape.size()), dimA),
+                                  "cudnnSetTensorNdDescriptor failed");
+    }
     InitSizeLists();
     return true;
   }
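The two descriptor calls differ only in the cuDNN tensor format flag. If they ever needed to be folded together, the format choice could be isolated in a tiny helper; this is only a sketch of that idea, not MindSpore code, and the "NHWC" literal stands in for kOpFormat_NHWC.

#include <cudnn.h>
#include <string>

// Pick the cuDNN tensor format from the kernel's input format string,
// mirroring the branch added above; anything other than NHWC falls back to NCHW.
inline cudnnTensorFormat_t ToCudnnFormat(const std::string &input_format) {
  return input_format == "NHWC" ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW;
}

With such a helper, a single CHECK_CUDNN_RET_WITH_EXCEPT call taking ToCudnnFormat(input_format) as the format argument would cover both branches.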
@@ -194,6 +194,12 @@ void UpdateKernelFormatInfo(const CNodePtr &kernel_node, const std::vector<TypeI
   auto cal_format = (inputs_type[0] == kNumberTypeFloat16) ? kOpFormat_NHWC : kOpFormat_NCHW;
   MS_LOG(DEBUG) << "Kernel node: " << kernel_node->fullname_with_scope() << ", format: " << cal_format;
   auto inputs_format_position = iter->second.first;
+  // If input position is empty, then insert all the input positions, because the input numbers of this op are variable.
+  if (inputs_format_position.size() == 0) {
+    for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); input_index++) {
+      inputs_format_position.push_back(input_index);
+    }
+  }
   for (const auto &input_format_position : inputs_format_position) {
     if (input_format_position >= inputs_format->size()) {
       MS_LOG(EXCEPTION) << "The position [" << input_format_position << "] is out of range of the input size ["
@@ -30,6 +30,7 @@ namespace mindspore {
 namespace device {
 namespace gpu {
 // map<opName, (inputFormatPosition, outputFormatPosition)>, used for getting the insert position of format transform.
+// If input position is empty, then insert all the input positions, because the input numbers of this op are variable.
 static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>>> kKernelFormatPositionMap = {
   {prim::kPrimConv2D->name(), {{0, 1}, {0}}},
   {prim::kPrimConv2DBackpropInput->name(), {{0, 1}, {0}}},
@@ -47,6 +48,8 @@ static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>>
   {kFusedBatchNormGradEx, {{0, 1}, {0}}},
   {kFusedBatchNormGradExWithActivation, {{0, 1, 7}, {0}}},
   {kFusedBatchNormGradExWithAddAndActivation, {{0, 1, 7}, {0, 3}}},
+  {prim::kPrimConcat->name(), {{}, {0}}},
+  {prim::kPrimAddN->name(), {{}, {0}}},
 };

 void SetKernelInfo(const CNodePtr &apply_kernel_ptr);
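The empty input-position lists registered here for Concat and AddN rely on the expansion added to UpdateKernelFormatInfo above: an empty list means "consider every input for a format transform", since these ops take a variable number of inputs. A minimal sketch of that convention with a stand-in map; the op names, counts, and helper name below are illustrative only.

#include <cstddef>
#include <map>
#include <string>
#include <vector>

// Expand an op's registered input-format positions: an empty entry means all inputs.
std::vector<size_t> InputFormatPositions(const std::string &op, size_t input_num,
                                         const std::map<std::string, std::vector<size_t>> &position_map) {
  std::vector<size_t> positions = position_map.at(op);
  if (positions.empty()) {
    for (size_t i = 0; i < input_num; i++) {
      positions.push_back(i);  // variable-input ops (Concat, AddN): every input gets considered
    }
  }
  return positions;
}

int main() {
  std::map<std::string, std::vector<size_t>> position_map = {{"Conv2D", {0, 1}}, {"Concat", {}}, {"AddN", {}}};
  auto conv_positions = InputFormatPositions("Conv2D", 2, position_map);    // {0, 1}: fixed positions
  auto concat_positions = InputFormatPositions("Concat", 4, position_map);  // {0, 1, 2, 3}: all inputs
  return (conv_positions.size() == 2 && concat_positions.size() == 4) ? 0 : 1;
}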