!35500 AvgPool3DGrad derived from NativeGpuKernelMod

Merge pull request !35500 from 范吉斌/fix_avgpool3d
This commit is contained in:
i-robot 2022-06-23 01:21:59 +00:00 committed by Gitee
commit f8aa543cb3
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
18 changed files with 1130 additions and 445 deletions

View File

@@ -867,7 +867,7 @@ const std::set<std::string> DynamicShapeConstInputToAttrCPU = {
const std::set<std::string> DynamicShapeConstInputToAttrGPU = {
kCastOpName, kExpandDimsOpName, kReshapeOpName, kEmbeddingLookupOpName, kTransposeOpName,
kReduceSumOpName, kReduceMinOpName, kReduceMeanOpName, kReduceMaxOpName, kReduceAllOpName,
kReduceAnyOpName, kConcatOpName, kScatterNdOpName};
kReduceAnyOpName, kConcatOpName, kScatterNdOpName, kGatherV2OpName, kAvgPool3DGradOpName};
// The map of ref relationships between a kernel's outputs and inputs.
// The key is the output index, and the value is the input index that will be used as the reference for that output.

View File

@@ -0,0 +1,69 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/avg_pool3d_helper_impl.cuh"
#include "include/cuda_fp16.h"
template <typename T>
__global__ void RealKernelSize(const size_t size, T *kernel, const int64_t kernel_size, const int64_t shape_d,
const int64_t shape_h, const int64_t shape_w, const int64_t kernel_d,
const int64_t kernel_h, const int64_t kernel_w, const int64_t edge_kernel_d,
const int64_t edge_kernel_h, const int64_t edge_kernel_w) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
const int64_t d_max = shape_d - 1;
const int64_t h_max = shape_h - 1;
const int64_t w_max = shape_w - 1;
for (int64_t d = 0; d < shape_d; ++d) {
for (int64_t h = 0; h < shape_h; ++h) {
for (int64_t w = 0; w < shape_w; ++w) {
const int64_t valid_d = ((d == d_max) ? edge_kernel_d : kernel_d);
const int64_t valid_h = ((h == h_max) ? edge_kernel_h : kernel_h);
const int64_t valid_w = ((w == w_max) ? edge_kernel_w : kernel_w);
const int64_t cur_kernel_size = valid_d * valid_h * valid_w;
if (cur_kernel_size != kernel_size) {
const int64_t index = pos * shape_d * shape_h * shape_w + d * shape_h * shape_w + h * shape_w + w;
kernel[index] =
kernel[index] * static_cast<T>(static_cast<float>(cur_kernel_size) / static_cast<float>(kernel_size));
}
}
}
}
}
}
template <typename T>
void CalRealKernelSize(const std::vector<int64_t> &input_shape, const std::vector<int64_t> &kernel_size,
const std::vector<int64_t> &edge_kernel_size, T *kernel, const uint32_t &device_id,
cudaStream_t cuda_stream) {
const int64_t kernel_prod = kernel_size[0] * kernel_size[1] * kernel_size[2];
const int64_t nc_size = input_shape[0] * input_shape[1];
RealKernelSize<<<CUDA_BLOCKS(device_id, nc_size), CUDA_THREADS(device_id), 0, cuda_stream>>>(
nc_size, kernel, kernel_prod, input_shape[2], input_shape[3], input_shape[4], kernel_size[0], kernel_size[1],
kernel_size[2], edge_kernel_size[0], edge_kernel_size[1], edge_kernel_size[2]);
}
template CUDA_LIB_EXPORT void CalRealKernelSize<double>(const std::vector<int64_t> &input_shape,
const std::vector<int64_t> &kernel_size,
const std::vector<int64_t> &edge_kernel_size, double *kernel,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalRealKernelSize<float>(const std::vector<int64_t> &input_shape,
const std::vector<int64_t> &kernel_size,
const std::vector<int64_t> &edge_kernel_size, float *kernel,
const uint32_t &device_id, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void CalRealKernelSize<half>(const std::vector<int64_t> &input_shape,
const std::vector<int64_t> &kernel_size,
const std::vector<int64_t> &edge_kernel_size, half *kernel,
const uint32_t &device_id, cudaStream_t cuda_stream);
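For reference, a host-side sketch of what RealKernelSize computes (illustrative only, not part of the patch; the NC x D x H x W layout and the edge_kernel_* values follow the launcher above): every output position whose ceil-mode window is clipped at the D/H/W boundary has its divisor factor scaled by valid_window / full_window.

#include <cstdint>
#include <vector>

// CPU reference of the GPU kernel above: rescale kernel[idx] for clipped boundary windows.
void RealKernelSizeReference(std::vector<float> *kernel, int64_t nc, int64_t d, int64_t h, int64_t w,
                             int64_t kd, int64_t kh, int64_t kw,
                             int64_t edge_kd, int64_t edge_kh, int64_t edge_kw) {
  const int64_t full = kd * kh * kw;
  for (int64_t n = 0; n < nc; ++n) {
    for (int64_t i = 0; i < d; ++i) {
      for (int64_t j = 0; j < h; ++j) {
        for (int64_t k = 0; k < w; ++k) {
          // Only the last position along each axis uses the clipped (edge) kernel extent.
          const int64_t valid =
              (i == d - 1 ? edge_kd : kd) * (j == h - 1 ? edge_kh : kh) * (k == w - 1 ? edge_kw : kw);
          if (valid != full) {
            const int64_t idx = ((n * d + i) * h + j) * w + k;
            (*kernel)[idx] *= static_cast<float>(valid) / static_cast<float>(full);
          }
        }
      }
    }
  }
}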

View File

@@ -0,0 +1,27 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_AVG_POOL3D_HELPER_IMPL_CUH_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_AVG_POOL3D_HELPER_IMPL_CUH_
#include <cuda_runtime.h>
#include <vector>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
template <typename T>
CUDA_LIB_EXPORT void CalRealKernelSize(const std::vector<int64_t> &input_shape, const std::vector<int64_t> &kernel_size,
const std::vector<int64_t> &edge_kernel_size, T *kernel,
const uint32_t &device_id, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_AVG_POOL3D_HELPER_IMPL_CUH_

View File

@@ -251,6 +251,7 @@ class DeprecatedNativeGpuKernelMod : public NativeGpuKernelMod {
}
return GetValue<T>(attr);
}
template <typename T>
inline T GetAttrWithDefault(const CNodePtr &kernel_node, const std::string &key, const T &value) const {
const PrimitivePtr &prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);

View File

@@ -30,6 +30,8 @@ MS_REG_GPU_KERNEL_ONE(MaxPool3D, KernelAttr().AddInputAttr(kNumberTypeFloat32).A
PoolingFwdGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(MaxPool3D, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
PoolingFwdGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(AvgPool3D, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
PoolingFwdGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(AvgPool3D, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
PoolingFwdGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(AvgPool3D, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),

View File

@@ -20,13 +20,18 @@
#include <vector>
#include <string>
#include <algorithm>
#include <functional>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/avg_pool3d_helper_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
#include "plugin/device/gpu/kernel/kernel_constants.h"
namespace mindspore {
namespace kernel {
constexpr auto kAvgPool = "AvgPool";
constexpr auto kAvgPool3D = "AvgPool3D";
template <typename T>
class PoolingFwdGpuKernelMod : public DeprecatedNativeGpuKernelMod {
public:
@@ -49,8 +54,10 @@ class PoolingFwdGpuKernelMod : public DeprecatedNativeGpuKernelMod {
pad_left_(0),
n_(0),
c_(0),
divisor_override_(0),
pad_value_(0),
is_null_input_(false),
ceil_mode_(false),
kernel_name_("Pooling"),
input_size_(0),
output_size_(0),
@@ -71,13 +78,38 @@ class PoolingFwdGpuKernelMod : public DeprecatedNativeGpuKernelMod {
cudnnPoolingForward(cudnn_handle_, pooling_descriptor_, &alpha, input_descriptor_,
input_addr, &beta, output_descriptor_, output_addr),
"cudnnPoolingForward failed");
if (divisor_override_ != 0) {
T *work_addr = GetDeviceAddress<T>(workspace, 0);
size_t output_num = output_size_ / sizeof(T);
int64_t size = std::accumulate(kernel_size_.begin(), kernel_size_.end(), 1, std::multiplies<int64_t>());
T divisor = static_cast<T>(LongToFloat(size) / LongToFloat(divisor_override_));
std::vector<T> divisor_value(output_num, divisor);
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(work_addr, divisor_value.data(), output_size_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync failed.");
if (ceil_mode_) {
CalRealKernelSize(output_shape_exclude_nc_, kernel_size_, edge_kernel_, work_addr, 0,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
ElewiseArith(output_num, BROADCAST_TYPE_MUL, output_addr, work_addr, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}
bool Init(const CNodePtr &kernel_node) {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
InitResource();
(void)CheckParam(kernel_node);
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
if (kernel_name_ == kAvgPool3D) {
divisor_override_ = GetAttr<int64_t>(kernel_node, "divisor_override");
ceil_mode_ = GetAttr<bool>(kernel_node, "ceil_mode");
}
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
data_format_ = AnfAlgo::GetInputFormat(kernel_node, 0);
auto format_attr = GetAttr<std::string>(kernel_node, "format");
@@ -123,6 +155,7 @@ class PoolingFwdGpuKernelMod : public DeprecatedNativeGpuKernelMod {
} else if (dim == kDim3DShapeSize) {
SetPad3D(kernel_node);
}
edge_kernel_ = GetEdgeKernelSize(kernel_node);
InitSizeLists();
return true;
}
@@ -157,6 +190,7 @@ class PoolingFwdGpuKernelMod : public DeprecatedNativeGpuKernelMod {
}
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
workspace_size_list_.push_back(output_size_);
}
private:
@@ -169,14 +203,22 @@ class PoolingFwdGpuKernelMod : public DeprecatedNativeGpuKernelMod {
void SetPoolingMode(const CNodePtr &kernel_node) {
mode_ = common::AnfAlgo::GetCNodeName(kernel_node);
if (mode_ == "AvgPool" || mode_ == "AvgPool3D") {
pooling_mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
bool include = false;
if (prim->HasAttr("count_include_pad")) {
include = GetAttr<bool>(kernel_node, "count_include_pad");
}
if (mode_ == kAvgPool || mode_ == kAvgPool3D) {
pooling_mode_ =
include ? CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
pad_value_ = 0.0;
} else {
pooling_mode_ = CUDNN_POOLING_MAX;
pad_value_ = kSignedMinFloat;
}
}
void SetPad(const CNodePtr &kernel_node) {
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
@@ -281,12 +323,66 @@ class PoolingFwdGpuKernelMod : public DeprecatedNativeGpuKernelMod {
"cudnnSetPoolingNdDescriptor failed");
}
std::vector<int64_t> GetEdgeKernelSize(const CNodePtr &kernel_node) {
if (!ceil_mode_ && divisor_override_ == 0) {
return {};
}
const size_t k3dSizeLowerLimit = 5;
const size_t kIdxD = 2;
const size_t kIdxH = 3;
const size_t kIdxW = 4;
const size_t kScale = 2;
std::vector<int64_t> edge_kernel;
std::vector<int64_t> kernel_size = GetAttr<std::vector<int64_t>>(kernel_node, "kernel_size");
std::vector<int64_t> strides = GetAttr<std::vector<int64_t>>(kernel_node, "strides");
std::vector<int64_t> pad = GetAttr<std::vector<int64_t>>(kernel_node, "pad_list");
if (kernel_size.size() != k3dSizeLowerLimit) {
MS_LOG(EXCEPTION) << "kernel_size must be " << k3dSizeLowerLimit << "D, but got " << kernel_size.size();
}
if (strides.size() != k3dSizeLowerLimit) {
MS_LOG(EXCEPTION) << "strides must be " << k3dSizeLowerLimit << "D, but got " << strides.size();
}
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
kernel_size_ = {kernel_size[kIdxD], kernel_size[kIdxH], kernel_size[kIdxW]};
std::vector<int64_t> stride = {strides[kIdxD], strides[kIdxH], strides[kIdxW]};
std::vector<int64_t> shape_exclude_nc = {SizeToLong(input_shape[kIdxD]), SizeToLong(input_shape[kIdxH]),
SizeToLong(input_shape[kIdxW])};
(void)std::transform(output_shape.begin(), output_shape.end(), std::back_inserter(output_shape_exclude_nc_),
SizeToLong);
const size_t dim = shape_exclude_nc.size();
if (pad.size() != dim * kScale) {
MS_LOG(EXCEPTION) << "pad_list must be " << (dim * kScale) << "D, but got " << pad.size() << "D!";
}
for (size_t i = 0; i < dim; ++i) {
size_t l_index = kScale * i;
size_t r_index = kScale * i + 1;
int64_t len = shape_exclude_nc[i] + pad[l_index] + pad[r_index] - kernel_size_[i];
int64_t padding_iv = FloatToLong(std::ceil(LongToFloat(len) / LongToFloat(stride[i]))) * stride[i] - len;
int64_t padding_r = pad[r_index] + padding_iv;
if (padding_r > pad[r_index] && padding_r < kernel_size_[i]) {
edge_kernel.push_back(kernel_size_[i] - padding_iv);
} else {
edge_kernel.push_back(kernel_size_[i]);
}
}
return edge_kernel;
}
cudnnHandle_t cudnn_handle_;
cudnnTensorDescriptor_t input_descriptor_;
cudnnTensorDescriptor_t output_descriptor_;
cudnnPoolingDescriptor_t pooling_descriptor_;
cudnnPoolingMode_t pooling_mode_ = CUDNN_POOLING_MAX;
std::vector<int> stride_;
std::vector<int64_t> kernel_size_;
std::vector<int64_t> shape_exclude_nc_;
std::vector<int64_t> edge_kernel_;
std::vector<int64_t> output_shape_exclude_nc_;
std::string mode_;
std::string pad_mode_;
std::string data_format_ = kOpFormat_NCHW;
@@ -304,8 +400,10 @@ class PoolingFwdGpuKernelMod : public DeprecatedNativeGpuKernelMod {
int pad_left_;
int n_;
int c_;
int64_t divisor_override_;
float pad_value_;
bool is_null_input_;
bool ceil_mode_;
std::string kernel_name_;
size_t input_size_;
size_t output_size_;
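For reference, the elementwise rescaling used by the divisor_override path is just algebra on the window sum: sum / divisor_override == (sum / kernel_volume) * (kernel_volume / divisor_override). A minimal standalone check (illustrative only, not part of the patch):

#include <cassert>
#include <cmath>

int main() {
  const double sum = 42.0;            // sum of the elements in one pooling window
  const double kernel_volume = 8.0;   // e.g. a 2x2x2 window
  const double divisor_override = 5.0;
  const double averaged = sum / kernel_volume;                             // average over the full window (padding-free case)
  const double rescaled = averaged * (kernel_volume / divisor_override);   // what the kernel multiplies in
  assert(std::abs(rescaled - sum / divisor_override) < 1e-12);
  return 0;
}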

View File

@@ -15,54 +15,511 @@
*/
#include "plugin/device/gpu/kernel/nn/pooling_grad_gpu_kernel.h"
#include <functional>
#include <memory>
#include "mindspore/core/ops/grad/pool_grad.h"
#include "mindspore/core/ops/grad/avg_pool_3d_grad.h"
#include "mindspore/core/ops/grad/max_pool_3d_grad.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/avg_pool3d_helper_impl.cuh"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(MaxPoolGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
PoolingGradGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(MaxPoolGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
PoolingGradGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(MaxPool3DGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
PoolingGradGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(MaxPool3DGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
PoolingGradGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(AvgPoolGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
PoolingGradGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(AvgPoolGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
PoolingGradGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(AvgPool3DGrad, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
PoolingGradGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(AvgPool3DGrad, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
PoolingGradGpuKernelMod, half)
constexpr auto kMaxPoolGrad = "MaxPoolGrad";
constexpr auto kMaxPool3DGrad = "MaxPool3DGrad";
constexpr auto kAvgPoolGrad = "AvgPoolGrad";
constexpr auto kAvgPool3DGrad = "AvgPool3DGrad";
constexpr size_t kInputNum = 3;
constexpr size_t kAvgPool3DGradInputNum = 1;
bool PoolingGradGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
kernel_name_ = base_operator->name();
auto pool_grad_ptr = std::make_shared<ops::PoolGrad>(base_operator->GetPrim());
format_attr_ = pool_grad_ptr->get_format();
pad_mode_ = pool_grad_ptr->get_pad_mode();
stride_me_ = pool_grad_ptr->get_strides();
window_me_ = pool_grad_ptr->get_kernel_size();
if (kernel_name_ == kMaxPool3DGrad) {
auto kernel_ptr = std::make_shared<ops::MaxPool3DGrad>(base_operator->GetPrim());
pad_list_ = kernel_ptr->get_pad_list();
} else if (kernel_name_ == kAvgPool3DGrad) {
auto kernel_ptr = std::make_shared<ops::AvgPool3DGrad>(base_operator->GetPrim());
pad_list_ = kernel_ptr->get_pad_list();
divisor_override_ = kernel_ptr->get_divisor_override();
ceil_mode_ = kernel_ptr->get_ceil_mode();
include_ = kernel_ptr->get_count_include_pad();
}
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(inputs.at(kIndex0)->GetDtype()));
SetPoolingMode();
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!is_match) {
MS_LOG(ERROR) << "For '" << kernel_name_ << "' does not support this kernel type: " << kernel_attr;
return false;
}
kernel_func_ = kernel_attr_map_.at(kernel_name_)[index].second;
return true;
}
int PoolingGradGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &) {
int ret = KernelMod::Resize(base_operator, inputs, outputs);
if (ret != KRET_OK) {
return ret;
}
size_t input_num = inputs.size();
size_t expect_input_num = (kernel_name_ == kAvgPool3DGrad) ? kAvgPool3DGradInputNum : kInputNum;
if (input_num != expect_input_num) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs must be " << expect_input_num
<< ", but got " << input_num;
}
input_shape_ = inputs.at(kIndex0)->GetShapeVector();
output_shape_ = outputs.at(kIndex0)->GetShapeVector();
int nbDims = SizeToInt(input_shape_.size());
int dimA[kPoolingNbDims];
int strideAin[kPoolingNbDims];
int dimAy[kPoolingNbDims];
int strideAiny[kPoolingNbDims];
int dimAdy[kPoolingNbDims];
int strideAdy[kPoolingNbDims];
int dimAout[kPoolingNbDims];
int strideAout[kPoolingNbDims];
if (!InitShape(inputs, outputs, dimA, strideAin, dimAy, strideAiny, dimAdy, strideAdy, dimAout, strideAout, nbDims)) {
return ret;
}
if (nbDims == kDim2DShapeSize) {
SetPad();
} else if (nbDims == kDim3DShapeSize) {
SetPad3D();
}
std::string err_msg = "For '" + kernel_name_ + "', cudnnSetTensor4dDescriptor failed";
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
cudnnSetTensorNdDescriptor(dx_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
cudnnSetTensorNdDescriptor(x_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin), err_msg);
edge_kernel_ = GetEdgeKernelSize();
InitSizeLists();
return ret;
}
template <typename T>
bool PoolingGradGpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
T *x_data = nullptr;
T *y = nullptr;
T *dy = nullptr;
T *dx = nullptr;
if (kernel_name_ == kAvgPool3DGrad) {
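// AvgPool3DGrad receives only dy as a real input; x and y are dummy workspace buffers,
// relying on average-pooling backward not reading the forward activations' values
// (their descriptors are only needed for shape/stride information).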
dy = GetDeviceAddress<T>(inputs, kIndex0);
dx = GetDeviceAddress<T>(outputs, kIndex0);
x_data = GetDeviceAddress<T>(workspace, kIndex0);
y = GetDeviceAddress<T>(workspace, kIndex1);
} else {
x_data = GetDeviceAddress<T>(inputs, kIndex0);
y = GetDeviceAddress<T>(inputs, kIndex1);
dy = GetDeviceAddress<T>(inputs, kIndex2);
dx = GetDeviceAddress<T>(outputs, kIndex0);
}
const float alpha = 1;
const float beta = 0;
if (divisor_override_ != 0) {
T *work_addr = GetDeviceAddress<T>(workspace, kIndex2);
T *dy_work_addr = GetDeviceAddress<T>(workspace, kIndex3);
size_t output_num = input_size_ / sizeof(T);
int64_t size = std::accumulate(kernel_size_.begin(), kernel_size_.end(), 1, std::multiplies<int64_t>());
T divisor = static_cast<T>(LongToFloat(size) / LongToFloat(divisor_override_));
std::vector<T> divisor_value(output_num, divisor);
std::string err_msg = "For '" + kernel_name_ + "', cudaMemcpyAsync failed.";
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
cudaMemcpyAsync(work_addr, divisor_value.data(), input_size_, cudaMemcpyHostToDevice,
reinterpret_cast<cudaStream_t>(cuda_stream_)),
err_msg);
CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(cudaMemcpyAsync(dy_work_addr, dy, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(cuda_stream_)),
err_msg);
if (ceil_mode_) {
CalRealKernelSize(input_shape_, kernel_size_, edge_kernel_, work_addr, device_id_,
reinterpret_cast<cudaStream_t>(cuda_stream_));
}
ElewiseArith(output_num, BROADCAST_TYPE_MUL, dy_work_addr, work_addr, dy_work_addr,
reinterpret_cast<cudaStream_t>(cuda_stream_));
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
cudnnPoolingBackward(cudnn_handle_, pooling_descriptor_, &alpha, y_descriptor_, y, dy_descriptor_, dy_work_addr,
x_descriptor_, x_data, &beta, dx_descriptor_, dx),
"For '" + kernel_name_ + "', cudnnPoolingBackward failed");
return true;
}
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
cudnnPoolingBackward(cudnn_handle_, pooling_descriptor_, &alpha, y_descriptor_, y, dy_descriptor_, dy,
x_descriptor_, x_data, &beta, dx_descriptor_, dx),
"For '" + kernel_name_ + "', cudnnPoolingBackward failed");
return true;
}
bool PoolingGradGpuKernelMod::InitShape(const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs, int *dimA, int *strideAin,
int *dimAy, int *strideAiny, int *dimAdy, int *strideAdy, int *dimAout,
int *strideAout, int nbDims) {
ShapeVector dout_shape, input_mask, output_shape, input_shape;
if (kernel_name_ == kAvgPool3DGrad) {
dout_shape = inputs.at(kIndex0)->GetShapeVector();
output_shape = outputs.at(kIndex0)->GetShapeVector();
input_mask = dout_shape;
input_shape = output_shape;
} else {
input_shape = inputs.at(kIndex0)->GetShapeVector();
input_mask = inputs.at(kIndex1)->GetShapeVector();
dout_shape = inputs.at(kIndex2)->GetShapeVector();
output_shape = outputs.at(kIndex0)->GetShapeVector();
}
is_null_input_ =
CHECK_SHAPE_NULL(input_shape, kernel_name_, "input") || CHECK_SHAPE_NULL(input_mask, kernel_name_, "mask") ||
CHECK_SHAPE_NULL(dout_shape, kernel_name_, "dout") || CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
if (is_null_input_) {
InitSizeLists();
return false;
}
auto data_format = GetFormatFromEnumToStr(inputs.at(kIndex0)->GetFormat());
if (Anyone(format_attr_, Format::NHWC, Format::NDHWC)) {
data_format = GetFormatFromEnumToStr(format_attr_);
}
CheckTensorSize({input_shape, input_mask, dout_shape, output_shape});
if (nbDims == kDim2DShapeSize) {
SetNCHW(input_shape, &n_, &c_, &old_height_, &old_width_, data_format);
} else if (nbDims == kDim3DShapeSize) {
SetNCDHW(input_shape, &n_, &c_, &old_depth_, &old_height_, &old_width_, data_format);
}
SetDimA(input_shape, dimA, nbDims, data_format);
SetStrideA(input_shape, strideAin, nbDims, data_format);
SetDimA(input_mask, dimAy, nbDims, data_format);
SetStrideA(input_mask, strideAiny, nbDims, data_format);
SetDimA(dout_shape, dimAdy, nbDims, data_format);
SetStrideA(dout_shape, strideAdy, nbDims, data_format);
SetDimA(output_shape, dimAout, nbDims, data_format);
SetStrideA(output_shape, strideAout, nbDims, data_format);
return true;
}
void PoolingGradGpuKernelMod::DestroyResource() noexcept {
std::string err_msg = "For '" + kernel_name_ + "', cudnnDestroyPoolingDescriptor failed";
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnDestroyPoolingDescriptor(pooling_descriptor_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnDestroyTensorDescriptor(dx_descriptor_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnDestroyTensorDescriptor(x_descriptor_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnDestroyTensorDescriptor(dy_descriptor_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnDestroyTensorDescriptor(y_descriptor_), err_msg);
}
void PoolingGradGpuKernelMod::InitResource() {
pooling_mode_ = CUDNN_POOLING_MAX;
cudnn_data_type_ = CUDNN_DATA_FLOAT;
compute_format_ = CUDNN_TENSOR_NCHW;
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
std::string err_msg = "For '" + kernel_name_ + "', cudnnCreateTensorDescriptor failed";
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreateTensorDescriptor(&y_descriptor_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreateTensorDescriptor(&dy_descriptor_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreateTensorDescriptor(&x_descriptor_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreateTensorDescriptor(&dx_descriptor_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreatePoolingDescriptor(&pooling_descriptor_), err_msg);
}
void PoolingGradGpuKernelMod::InitSizeLists() {
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
std::string err_msg = "For '" + kernel_name_ + "', cudnnGetTensorSizeInBytes failed";
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnGetTensorSizeInBytes(x_descriptor_, &input_size_), err_msg);
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnGetTensorSizeInBytes(dx_descriptor_, &output_size_), err_msg);
}
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
if (kernel_name_ == kAvgPool3DGrad) {
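// AvgPool3DGrad workspace layout: [0] placeholder for x, [1] placeholder for y;
// with divisor_override also [2] the per-element divisor buffer and [3] a rescaled copy of dy
// (see LaunchKernel above).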
workspace_size_list_.push_back(output_size_);
workspace_size_list_.push_back(input_size_);
if (divisor_override_ != 0) {
workspace_size_list_.push_back(input_size_);
workspace_size_list_.push_back(input_size_);
}
}
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnGetTensorSizeInBytes(y_descriptor_, &input_size_), err_msg);
}
input_size_list_.push_back(input_size_);
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnGetTensorSizeInBytes(dy_descriptor_, &input_size_), err_msg);
}
input_size_list_.push_back(input_size_);
}
void PoolingGradGpuKernelMod::SetPad() {
std::vector<int> window;
std::vector<int> stride;
(void)std::transform(stride_me_.begin(), stride_me_.end(), std::back_inserter(stride),
[](const int64_t &value) { return static_cast<int>(value); });
(void)std::transform(window_me_.begin(), window_me_.end(), std::back_inserter(window),
[](const int64_t &value) { return static_cast<int>(value); });
const size_t kSizeLowerLimit = 4;
const size_t kIdxH = 2;
const size_t kIdxW = 3;
if (window.size() < kSizeLowerLimit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'kernel_size' cannot be less than 4, but got "
<< window.size();
}
int window_height = window[kIdxH];
int window_width = window[kIdxW];
if (stride.size() < kSizeLowerLimit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'strides' cannot be less than 4, but got "
<< stride.size();
}
int stride_h = stride[kIdxH];
int stride_w = stride[kIdxW];
if (format_attr_ == Format::NHWC) {
const size_t kNHWCIdxH = 1;
const size_t kNHWCIdxW = 2;
window_height = window[kNHWCIdxH];
window_width = window[kNHWCIdxW];
stride_h = stride[kNHWCIdxH];
stride_w = stride[kNHWCIdxW];
}
const size_t k2dDim = 2;
int windowDimA[k2dDim] = {window_height, window_width};
int paddingA[k2dDim] = {0, 0};
int strideA[k2dDim] = {stride_h, stride_w};
if (pad_mode_ == PadMode::SAME) {
pad_height_ = GetPad(old_height_, window_height, stride_h);
pad_width_ = GetPad(old_width_, window_width, stride_w);
const int kSymCoef = 2;
pad_top_ = pad_height_ / kSymCoef;
pad_left_ = pad_width_ / kSymCoef;
paddingA[kIndex0] = pad_top_;
paddingA[kIndex1] = pad_left_;
} else {
if (pad_mode_ == PadMode::VALID) {
pad_height_ = 0;
pad_width_ = 0;
}
}
std::string err_msg = "For '" + kernel_name_ + "', cudnnSetPoolingNdDescriptor failed";
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN, k2dDim, windowDimA,
paddingA, strideA),
err_msg);
}
void PoolingGradGpuKernelMod::SetPad3D() {
const int kPadListSize = 6;
const int kPadScale = 2;
std::vector<int> window;
std::vector<int> stride;
(void)std::transform(stride_me_.begin(), stride_me_.end(), std::back_inserter(stride),
[](const int64_t &value) { return static_cast<int>(value); });
(void)std::transform(window_me_.begin(), window_me_.end(), std::back_inserter(window),
[](const int64_t &value) { return static_cast<int>(value); });
const size_t k3dSizeLowerLimit = 5;
const size_t kIdxD = 2;
const size_t kIdxH = 3;
const size_t kIdxW = 4;
if (window.size() < k3dSizeLowerLimit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'kernel_size' cannot be less than 5, but got "
<< window.size();
}
int window_depth = window[kIdxD];
int window_height = window[kIdxH];
int window_width = window[kIdxW];
if (stride.size() < k3dSizeLowerLimit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'strides' cannot be less than 5, but got "
<< stride.size();
}
int stride_d = stride[kIdxD];
int stride_h = stride[kIdxH];
int stride_w = stride[kIdxW];
if (format_attr_ == Format::NDHWC) {
const size_t kNDHWCIdxD = 1;
const size_t kNDHWCIdxH = 2;
const size_t kNDHWCIdxW = 3;
window_depth = window[kNDHWCIdxD];
window_height = window[kNDHWCIdxH];
window_width = window[kNDHWCIdxW];
stride_d = stride[kNDHWCIdxD];
stride_h = stride[kNDHWCIdxH];
stride_w = stride[kNDHWCIdxW];
}
const size_t k3dDimSize = 3;
int windowDimA[k3dDimSize] = {window_depth, window_height, window_width};
int paddingA[k3dDimSize] = {0, 0, 0};
int strideA[k3dDimSize] = {stride_d, stride_h, stride_w};
if (pad_mode_ == PadMode::SAME) {
pad_depth_ = GetPad(old_depth_, window_depth, stride_d);
pad_height_ = GetPad(old_height_, window_height, stride_h);
pad_width_ = GetPad(old_width_, window_width, stride_w);
const int kSymCoef = 2;
pad_front_ = pad_depth_ / kSymCoef;
pad_top_ = pad_height_ / kSymCoef;
pad_left_ = pad_width_ / kSymCoef;
paddingA[kIndex0] = pad_front_;
paddingA[kIndex1] = pad_top_;
paddingA[kIndex2] = pad_left_;
} else if (pad_mode_ == PadMode::VALID) {
pad_depth_ = 0;
pad_height_ = 0;
pad_width_ = 0;
} else {
if (pad_list_.size() != kPadListSize) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'pad_list' must be 6, but got "
<< pad_list_.size();
}
for (size_t idx = 0; idx < k3dDimSize; idx++) {
paddingA[idx] = pad_list_[idx * kPadScale];
}
}
std::string err_msg = "For '" + kernel_name_ + "', cudnnSetPoolingNdDescriptor failed";
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN, k3dDimSize, windowDimA,
paddingA, strideA),
err_msg);
}
void PoolingGradGpuKernelMod::SetPoolingMode() {
if (kernel_name_ == kAvgPool3DGrad) {
pooling_mode_ =
include_ ? CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
pad_value_ = 0.0;
} else if (kernel_name_ == kAvgPoolGrad) {
pooling_mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
pad_value_ = 0.0;
} else {
pooling_mode_ = CUDNN_POOLING_MAX;
pad_value_ = kSignedMinFloat;
}
}
std::vector<int64_t> PoolingGradGpuKernelMod::GetEdgeKernelSize() {
if (!ceil_mode_ && divisor_override_ == 0) {
return {};
}
const size_t k3dSizeLowerLimit = 5;
const size_t kIdxD = 2;
const size_t kIdxH = 3;
const size_t kIdxW = 4;
const size_t kScale = 2;
std::vector<int64_t> edge_kernel;
if (window_me_.size() != k3dSizeLowerLimit) {
MS_LOG(EXCEPTION) << "kernel_size must be " << k3dSizeLowerLimit << "D, but got " << window_me_.size();
}
if (stride_me_.size() != k3dSizeLowerLimit) {
MS_LOG(EXCEPTION) << "strides must be " << k3dSizeLowerLimit << "D, but got " << stride_me_.size();
}
kernel_size_ = {window_me_[kIdxD], window_me_[kIdxH], window_me_[kIdxW]};
std::vector<int64_t> stride = {stride_me_[kIdxD], stride_me_[kIdxH], stride_me_[kIdxW]};
std::vector<int64_t> shape_exclude_nc = {output_shape_[kIdxD], output_shape_[kIdxH], output_shape_[kIdxW]};
const size_t kDim = shape_exclude_nc.size();
if (pad_list_.size() != kDim * kScale) {
MS_LOG(EXCEPTION) << "pad_list must be " << (kDim * kScale) << "D, but got " << pad_list_.size() << "D!";
}
for (size_t i = 0; i < kDim; ++i) {
size_t l_index = kScale * i;
size_t r_index = kScale * i + 1;
int64_t len = shape_exclude_nc[i] + pad_list_[l_index] + pad_list_[r_index] - kernel_size_[i];
int64_t padding_iv = FloatToLong(std::ceil(LongToFloat(len) / LongToFloat(stride[i]))) * stride[i] - len;
int64_t padding_r = pad_list_[r_index] + padding_iv;
if (padding_r > pad_list_[r_index] && padding_r < kernel_size_[i]) {
edge_kernel.push_back(kernel_size_[i] - padding_iv);
} else {
edge_kernel.push_back(kernel_size_[i]);
}
}
return edge_kernel;
}
std::map<std::string, std::vector<std::pair<KernelAttr, PoolingGradGpuKernelMod::PoolingGradFunc>>>
PoolingGradGpuKernelMod::kernel_attr_map_ = {
{kMaxPoolGrad,
{{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&PoolingGradGpuKernelMod::LaunchKernel<float>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&PoolingGradGpuKernelMod::LaunchKernel<half>}}},
{kMaxPool3DGrad,
{{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&PoolingGradGpuKernelMod::LaunchKernel<float>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&PoolingGradGpuKernelMod::LaunchKernel<half>}}},
{kAvgPoolGrad,
{{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
&PoolingGradGpuKernelMod::LaunchKernel<float>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
&PoolingGradGpuKernelMod::LaunchKernel<half>}}},
{kAvgPool3DGrad,
{{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
&PoolingGradGpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
&PoolingGradGpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
&PoolingGradGpuKernelMod::LaunchKernel<half>}}}};
std::vector<KernelAttr> PoolingGradGpuKernelMod::GetOpSupport() {
auto iter = kernel_attr_map_.find(kernel_name_);
if (iter == kernel_attr_map_.end()) {
MS_LOG(ERROR) << "For 'PoolingGradGpuKernelMod', the kernel name must be in " << kernel::Map2Str(kernel_attr_map_)
<< ", but got " << kernel_name_;
return std::vector<KernelAttr>{};
}
std::vector<KernelAttr> support_list;
(void)std::transform(iter->second.begin(), iter->second.end(), std::back_inserter(support_list),
[](const std::pair<KernelAttr, PoolingGradFunc> &item) { return item.first; });
return support_list;
}
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeGpuKernelMod, MaxPoolGrad,
[]() { return std::make_shared<PoolingGradGpuKernelMod>(kMaxPoolGrad); });
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeGpuKernelMod, MaxPool3DGrad,
[]() { return std::make_shared<PoolingGradGpuKernelMod>(kMaxPool3DGrad); });
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeGpuKernelMod, AvgPoolGrad,
[]() { return std::make_shared<PoolingGradGpuKernelMod>(kAvgPoolGrad); });
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeGpuKernelMod, AvgPool3DGrad,
[]() { return std::make_shared<PoolingGradGpuKernelMod>(kAvgPool3DGrad); });
} // namespace kernel
} // namespace mindspore
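As a worked example of the edge-kernel computation in GetEdgeKernelSize (illustrative, not part of the patch): with input length 6, kernel 3, stride 2 and zero padding, ceil mode makes the last window hang one element past the input, so only two of its three taps are valid and the edge kernel entry becomes 2.

#include <cmath>
#include <cstdint>
#include <iostream>

// Recomputes one edge_kernel entry exactly as GetEdgeKernelSize does for a single axis.
int64_t EdgeKernel(int64_t length, int64_t kernel, int64_t stride, int64_t pad_l, int64_t pad_r) {
  const int64_t len = length + pad_l + pad_r - kernel;
  const int64_t padding_iv =
      static_cast<int64_t>(std::ceil(static_cast<double>(len) / static_cast<double>(stride))) * stride - len;
  const int64_t padding_r = pad_r + padding_iv;
  return (padding_r > pad_r && padding_r < kernel) ? kernel - padding_iv : kernel;
}

int main() {
  std::cout << EdgeKernel(6, 3, 2, 0, 0) << std::endl;  // prints 2
  return 0;
}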

View File

@@ -18,368 +18,62 @@
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_POOLING_GRAD_GPU_KERNEL_H_
#include <vector>
#include <map>
#include <utility>
#include <string>
#include <algorithm>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/pad_impl.cuh"
#include "plugin/device/gpu/kernel/kernel_constants.h"
#include "utils/check_convert_utils.h"
namespace mindspore {
namespace kernel {
constexpr size_t kInputNum = 3;
constexpr size_t kAvgPool3DGradInputNum = 1;
template <typename T>
class PoolingGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
class PoolingGradGpuKernelMod : public NativeGpuKernelMod {
public:
PoolingGradGpuKernelMod()
: cudnn_handle_(nullptr),
pooling_descriptor_(nullptr),
y_descriptor_(nullptr),
dy_descriptor_(nullptr),
x_descriptor_(nullptr),
dx_descriptor_(nullptr),
pooling_mode_(CUDNN_POOLING_MAX),
cudnn_data_type_(CUDNN_DATA_FLOAT),
compute_format_(CUDNN_TENSOR_NCHW),
old_depth_(0),
old_height_(0),
old_width_(0),
pad_depth_(0),
pad_height_(0),
pad_width_(0),
pad_front_(0),
pad_top_(0),
pad_left_(0),
n_(0),
c_(0),
pad_value_(0),
is_null_input_(false),
kernel_name_("PoolingGrad"),
input_size_(0),
output_size_(0),
workspace_size_(0) {}
explicit PoolingGradGpuKernelMod(const std::string &kernel_name) : kernel_name_(kernel_name) { InitResource(); }
~PoolingGradGpuKernelMod() override { DestroyResource(); }
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
const std::vector<AddressPtr> &outputs, void *cuda_stream) override {
if (is_null_input_) {
return true;
}
T *x_data = nullptr;
T *y = nullptr;
T *dy = nullptr;
T *dx = nullptr;
if (kernel_name_ == kAvgPool3DGradOpName) {
dy = GetDeviceAddress<T>(inputs, 0);
dx = GetDeviceAddress<T>(outputs, 0);
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMalloc(reinterpret_cast<void **>(&x_data), outputs[0]->size),
"cudaMalloc failed.");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMalloc(reinterpret_cast<void **>(&y), inputs[0]->size),
"cudaMalloc failed.");
} else {
x_data = GetDeviceAddress<T>(inputs, 0);
y = GetDeviceAddress<T>(inputs, 1);
dy = GetDeviceAddress<T>(inputs, 2);
dx = GetDeviceAddress<T>(outputs, 0);
}
const float alpha = 1;
const float beta = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnPoolingBackward(cudnn_handle_, pooling_descriptor_, &alpha, y_descriptor_, y, dy_descriptor_, dy,
x_descriptor_, x_data, &beta, dx_descriptor_, dx),
"cudnnPoolingBackward failed");
return true;
cuda_stream_ = cuda_stream;
return kernel_func_(this, inputs, workspace, outputs);
}
bool InitShape(const CNodePtr &kernel_node, int *dimA, int *strideAin, int *dimAy, int *strideAiny, int *dimAdy,
int *strideAdy, int *dimAout, int *strideAout, int nbDims) {
ShapeVector dout_shape, input_mask, output_shape, input_shape;
if (kernel_name_ == kAvgPool3DGradOpName) {
dout_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
input_mask = dout_shape;
input_shape = output_shape;
} else {
const size_t kDoutIdx = 2;
input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
input_mask = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
dout_shape = AnfAlgo::GetInputDeviceShape(kernel_node, kDoutIdx);
output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
}
int Resize(
const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost = std::map<uint32_t, tensor::TensorPtr>()) override;
auto data_format = AnfAlgo::GetInputFormat(kernel_node, 0);
format_attr_ = GetAttr<std::string>(kernel_node, "format");
if (Anyone(format_attr_, kOpFormat_NHWC, kOpFormat_NDHWC)) {
data_format = format_attr_;
}
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
is_null_input_ =
CHECK_SHAPE_NULL(input_shape, kernel_name_, "input") || CHECK_SHAPE_NULL(input_mask, kernel_name_, "mask") ||
CHECK_SHAPE_NULL(dout_shape, kernel_name_, "dout") || CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
if (is_null_input_ || AnfAlgo::IsShapesDynamic({input_shape, output_shape, input_mask, dout_shape})) {
InitSizeLists();
return true;
}
CheckTensorSize({input_shape, input_mask, dout_shape, output_shape});
if (nbDims == kDim2DShapeSize) {
SetNCHW(input_shape, &n_, &c_, &old_height_, &old_width_, data_format);
} else if (nbDims == kDim3DShapeSize) {
SetNCDHW(input_shape, &n_, &c_, &old_depth_, &old_height_, &old_width_, data_format);
}
SetDimA(input_shape, dimA, nbDims, data_format);
SetStrideA(input_shape, strideAin, nbDims, data_format);
SetDimA(input_mask, dimAy, nbDims, data_format);
SetStrideA(input_mask, strideAiny, nbDims, data_format);
SetDimA(dout_shape, dimAdy, nbDims, data_format);
SetStrideA(dout_shape, strideAdy, nbDims, data_format);
SetDimA(output_shape, dimAout, nbDims, data_format);
SetStrideA(output_shape, strideAout, nbDims, data_format);
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
InitResource();
(void)CheckParam(kernel_node);
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
int nbDims = SizeToInt(input_shape.size());
int dimA[kPoolingNbDims];
int strideAin[kPoolingNbDims];
int dimAy[kPoolingNbDims];
int strideAiny[kPoolingNbDims];
int dimAdy[kPoolingNbDims];
int strideAdy[kPoolingNbDims];
int dimAout[kPoolingNbDims];
int strideAout[kPoolingNbDims];
if (!InitShape(kernel_node, dimA, strideAin, dimAy, strideAiny, dimAdy, strideAdy, dimAout, strideAout, nbDims)) {
return true;
}
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_, cudnnSetTensorNdDescriptor(dx_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout),
"cudnnSetTensor4dDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetTensorNdDescriptor(x_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin),
"cudnnSetTensor4dDescriptor failed");
SetPoolingMode(kernel_node);
if (nbDims == kDim2DShapeSize) {
SetPad(kernel_node);
} else if (nbDims == kDim3DShapeSize) {
SetPad3D(kernel_node);
}
InitSizeLists();
return true;
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyPoolingDescriptor(pooling_descriptor_),
"cudnnDestroyPoolingDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_descriptor_),
"cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_descriptor_),
"cudnnDestroyTensorDescriptor failed");
}
void DestroyResource() noexcept override;
protected:
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_descriptor_),
"cudnnCreateTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreatePoolingDescriptor(&pooling_descriptor_),
"cudnnCreatePoolingDescriptor failed");
}
void InitSizeLists() override {
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dx_descriptor_, &output_size_),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(y_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);
if (!is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(dy_descriptor_, &input_size_),
"cudnnGetTensorSizeInBytes failed");
}
input_size_list_.push_back(input_size_);
}
std::vector<KernelAttr> GetOpSupport() override;
void InitResource() override;
void InitSizeLists();
private:
void CheckParam(const CNodePtr &kernel_node) {
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
size_t expect_input_num = (kernel_name_ == kAvgPool3DGradOpName) ? kAvgPool3DGradInputNum : kInputNum;
if (input_num != expect_input_num) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs must be " << expect_input_num
<< ", but got " << input_num;
}
}
void SetPad(const CNodePtr &kernel_node) {
pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
std::vector<int64_t> stride_me = GetAttr<std::vector<int64_t>>(kernel_node, "strides");
std::vector<int> window;
std::vector<int64_t> window_me = GetAttr<std::vector<int64_t>>(kernel_node, "kernel_size");
(void)std::transform(stride_me.begin(), stride_me.end(), std::back_inserter(stride_),
[](const int64_t &value) { return static_cast<int>(value); });
(void)std::transform(window_me.begin(), window_me.end(), std::back_inserter(window),
[](const int64_t &value) { return static_cast<int>(value); });
const size_t kSizeLowerLimit = 4;
const size_t kIdxH = 2;
const size_t kIdxW = 3;
if (window.size() < kSizeLowerLimit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'kernel_size' cannot be less than 4, but got "
<< window.size();
}
int window_height = window[kIdxH];
int window_width = window[kIdxW];
if (stride_.size() < kSizeLowerLimit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'strides' cannot be less than 4, but got "
<< stride_.size();
}
int stride_h = stride_[kIdxH];
int stride_w = stride_[kIdxW];
if (format_attr_ == kOpFormat_NHWC) {
const size_t kNHWCIdxH = 1;
const size_t kNHWCIdxW = 2;
window_height = window[kNHWCIdxH];
window_width = window[kNHWCIdxW];
stride_h = stride_[kNHWCIdxH];
stride_w = stride_[kNHWCIdxW];
}
const size_t k2dDim = 2;
int windowDimA[k2dDim] = {window_height, window_width};
int paddingA[k2dDim] = {0, 0};
int strideA[k2dDim] = {stride_h, stride_w};
if (kSamePadModeUpperCase == pad_mode_ || kSamePadModeLowerCase == pad_mode_) {
pad_height_ = GetPad(old_height_, window_height, stride_h);
pad_width_ = GetPad(old_width_, window_width, stride_w);
const int kSymCoef = 2;
pad_top_ = pad_height_ / kSymCoef;
pad_left_ = pad_width_ / kSymCoef;
paddingA[0] = pad_top_;
paddingA[1] = pad_left_;
} else {
if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
pad_height_ = 0;
pad_width_ = 0;
}
}
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN,
2, windowDimA, paddingA, strideA),
"cudnnSetPoolingNdDescriptor failed");
}
void SetPad3D(const CNodePtr &kernel_node) {
const int kPadListSize = 6;
const int kPadScale = 2;
pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
std::vector<int64_t> stride_me = GetAttr<std::vector<int64_t>>(kernel_node, "strides");
std::vector<int> window;
std::vector<int64_t> window_me = GetAttr<std::vector<int64_t>>(kernel_node, "kernel_size");
(void)std::transform(stride_me.begin(), stride_me.end(), std::back_inserter(stride_),
[](const int64_t &value) { return static_cast<int>(value); });
(void)std::transform(window_me.begin(), window_me.end(), std::back_inserter(window),
[](const int64_t &value) { return static_cast<int>(value); });
const size_t k3dSizeLowerLimit = 5;
const size_t kIdxD = 2;
const size_t kIdxH = 3;
const size_t kIdxW = 4;
if (window.size() < k3dSizeLowerLimit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'kernel_size' cannot be less than 5, but got "
<< window.size();
}
int window_depth = window[kIdxD];
int window_height = window[kIdxH];
int window_width = window[kIdxW];
if (stride_.size() < k3dSizeLowerLimit) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'strides' cannot be less than 5, but got "
<< stride_.size();
}
int stride_d = stride_[kIdxD];
int stride_h = stride_[kIdxH];
int stride_w = stride_[kIdxW];
if (format_attr_ == kOpFormat_NDHWC) {
const size_t kNDHWCIdxD = 1;
const size_t kNDHWCIdxH = 2;
const size_t kNDHWCIdxW = 3;
window_depth = window[kNDHWCIdxD];
window_height = window[kNDHWCIdxH];
window_width = window[kNDHWCIdxW];
stride_d = stride_[kNDHWCIdxD];
stride_h = stride_[kNDHWCIdxH];
stride_w = stride_[kNDHWCIdxW];
}
const size_t k3dDimSize = 3;
int windowDimA[k3dDimSize] = {window_depth, window_height, window_width};
int paddingA[k3dDimSize] = {0, 0, 0};
int strideA[k3dDimSize] = {stride_d, stride_h, stride_w};
if (kSamePadModeUpperCase == pad_mode_ || kSamePadModeLowerCase == pad_mode_) {
pad_depth_ = GetPad(old_depth_, window_depth, stride_d);
pad_height_ = GetPad(old_height_, window_height, stride_h);
pad_width_ = GetPad(old_width_, window_width, stride_w);
const int kSymCoef = 2;
pad_front_ = pad_depth_ / kSymCoef;
pad_top_ = pad_height_ / kSymCoef;
pad_left_ = pad_width_ / kSymCoef;
paddingA[0] = pad_front_;
paddingA[1] = pad_top_;
const size_t kPadLeftIdx = 2;
paddingA[kPadLeftIdx] = pad_left_;
} else if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
pad_depth_ = 0;
pad_height_ = 0;
pad_width_ = 0;
} else {
const std::vector<int64_t> &pad_list = GetAttr<std::vector<int64_t>>(kernel_node, "pad_list");
if (pad_list.size() != kPadListSize) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the length of 'pad_list' must be 6, but got "
<< pad_list.size();
}
for (size_t idx = 0; idx < k3dDimSize; idx++) {
paddingA[idx] = pad_list[idx * kPadScale];
}
}
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN,
k3dDimSize, windowDimA, paddingA, strideA),
"cudnnSetPoolingNdDescriptor failed");
}
void SetPoolingMode(const CNodePtr &kernel_node) {
if (kernel_name_ == kAvgPoolGradOpName || kernel_name_ == kAvgPool3DGradOpName) {
pooling_mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
pad_value_ = 0.0;
} else {
pooling_mode_ = CUDNN_POOLING_MAX;
pad_value_ = kSignedMinFloat;
}
}
template <typename T>
bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs);
bool InitShape(const std::vector<KernelTensorPtr> &inputs, const std::vector<KernelTensorPtr> &outputs, int *dimA,
int *strideAin, int *dimAy, int *strideAiny, int *dimAdy, int *strideAdy, int *dimAout,
int *strideAout, int nbDims);
void SetPad();
void SetPad3D();
void SetPoolingMode();
std::vector<int64_t> GetEdgeKernelSize();
using PoolingGradFunc =
std::function<bool(PoolingGradGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
PoolingGradFunc kernel_func_;
static std::map<std::string, std::vector<std::pair<KernelAttr, PoolingGradGpuKernelMod::PoolingGradFunc>>>
kernel_attr_map_;
cudnnHandle_t cudnn_handle_;
cudnnPoolingDescriptor_t pooling_descriptor_;
@@ -387,30 +81,42 @@ class PoolingGradGpuKernelMod : public DeprecatedNativeGpuKernelMod {
cudnnTensorDescriptor_t dy_descriptor_;
cudnnTensorDescriptor_t x_descriptor_;
cudnnTensorDescriptor_t dx_descriptor_;
cudnnPoolingMode_t pooling_mode_ = CUDNN_POOLING_MAX;
std::vector<int> stride_;
std::string pad_mode_;
std::string format_attr_ = kOpFormat_NCHW;
cudnnDataType_t cudnn_data_type_;
cudnnTensorFormat_t compute_format_;
int old_depth_;
int old_height_;
int old_width_;
int pad_depth_;
int pad_height_;
int pad_width_;
int pad_front_;
int pad_top_;
int pad_left_;
int n_;
int c_;
float pad_value_;
bool is_null_input_;
cudnnPoolingMode_t pooling_mode_;
std::vector<int> stride_;
std::vector<int64_t> stride_me_;
std::vector<int64_t> window_me_;
std::vector<int64_t> input_shape_;
std::vector<int64_t> output_shape_;
std::vector<int64_t> edge_kernel_;
std::vector<int64_t> kernel_size_;
std::vector<int64_t> pad_list_;
std::string kernel_name_;
size_t input_size_;
size_t output_size_;
size_t workspace_size_;
PadMode pad_mode_;
Format format_attr_ = Format::NCHW;
int old_depth_{0};
int old_height_{0};
int old_width_{0};
int pad_depth_{0};
int pad_height_{0};
int pad_width_{0};
int pad_front_{0};
int pad_top_{0};
int pad_left_{0};
int n_{0};
int c_{0};
float pad_value_{0.0};
bool is_null_input_{false};
bool include_{false};
bool ceil_mode_{false};
int64_t divisor_override_{0};
size_t input_size_{0};
size_t output_size_{0};
size_t workspace_size_{0};
void *cuda_stream_{nullptr};
};
} // namespace kernel
} // namespace mindspore

View File

@@ -533,6 +533,7 @@ GVAR_DEF(PrimitivePtr, kPrimPSROIPoolingGrad, std::make_shared<Primitive>("PSROI
GVAR_DEF(PrimitivePtr, kPrimROIPooling, std::make_shared<Primitive>("ROIPooling"));
GVAR_DEF(PrimitivePtr, kPrimMaxPool, std::make_shared<Primitive>("MaxPool"));
GVAR_DEF(PrimitivePtr, kPrimMaxPoolGrad, std::make_shared<Primitive>("MaxPoolGrad"));
GVAR_DEF(PrimitivePtr, kPrimMaxPool3DGrad, std::make_shared<Primitive>("MaxPool3DGrad"));
GVAR_DEF(PrimitivePtr, kPrimMaxPoolV1, std::make_shared<Primitive>("MaxPoolV1"));
GVAR_DEF(PrimitivePtr, kPrimMaxPoolGradV1, std::make_shared<Primitive>("MaxPoolGradV1"));
GVAR_DEF(PrimitivePtr, kPrimMaxPoolWithArgmax, std::make_shared<Primitive>("MaxPoolWithArgmax"));

View File

@@ -30,6 +30,8 @@ namespace {
abstract::ShapePtr GatherInferShape(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
const std::string &op_name = primitive->name();
const int64_t input_num = 2;
(void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, input_num, op_name);
abstract::AbstractTensorPtr indices =
CheckAndConvertUtils::CheckArgs<abstract::AbstractTensor>(op_name, input_args, 1);
abstract::AbstractTensorPtr params =
@@ -42,7 +44,11 @@ abstract::ShapePtr GatherInferShape(const PrimitivePtr &primitive, const std::ve
std::any_of(params->shape()->shape().begin(), params->shape()->shape().end(), [](int64_t s) { return s < 0; });
int64_t axis_val = 0;
// 3rd input is a Tensor when Gather is a dynamic shape operator
if (input_args[kInputIndex2]->isa<abstract::AbstractTensor>()) {
if (SizeToLong(input_args.size()) == input_num) {
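// With two inputs, the constant axis has been folded into the primitive's "axis" attribute
// (kGatherV2OpName is added to DynamicShapeConstInputToAttrGPU in this change), so read it from there.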
auto axis_attr = primitive->GetAttr("axis");
MS_EXCEPTION_IF_NULL(axis_attr);
axis_val = GetValue<int64_t>(axis_attr);
} else if (input_args[kInputIndex2]->isa<abstract::AbstractTensor>()) {
auto axis = input_args[kInputIndex2]->cast<abstract::AbstractTensorPtr>();
MS_EXCEPTION_IF_NULL(axis);
auto axis_value_ptr = axis->BuildValue();
@@ -94,15 +100,17 @@ abstract::ShapePtr GatherInferShape(const PrimitivePtr &primitive, const std::ve
TypePtr GatherInferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
const std::string &op_name = primitive->name();
constexpr size_t input_num = 3;
abstract::CheckArgsSize(op_name, input_args, input_num);
constexpr int64_t input_num = 2;
(void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, input_num, op_name);
std::set<TypePtr> valid_params_types = {kTensorType};
(void)CheckAndConvertUtils::CheckSubClass("params", input_args[kInputIndex0]->BuildType(), valid_params_types,
op_name);
std::set<TypePtr> int_types = {kInt8, kInt16, kInt32, kInt64};
(void)CheckAndConvertUtils::CheckTensorTypeValid("indices", input_args[kInputIndex1]->BuildType(), int_types,
op_name);
(void)CheckAndConvertUtils::CheckTypeValid("axis", input_args[kInputIndex2]->BuildType(), int_types, op_name);
if (SizeToLong(input_args.size()) > input_num) {
(void)CheckAndConvertUtils::CheckTypeValid("axis", input_args[kInputIndex2]->BuildType(), int_types, op_name);
}
abstract::AbstractTensorPtr params =
CheckAndConvertUtils::CheckArgs<abstract::AbstractTensor>(op_name, input_args, 0);
@@ -113,7 +121,7 @@ TypePtr GatherInferType(const PrimitivePtr &primitive, const std::vector<Abstrac
AbstractBasePtr GatherInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
const int64_t kInputsNum = 3;
const int64_t kInputsNum = 2;
CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, kInputsNum, primitive->name());
auto infer_type = GatherInferType(primitive, input_args);
auto infer_shape = GatherInferShape(primitive, input_args);
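
With this change, Gather's infer functions accept either two or three inputs: when the axis input has been folded into an attribute, only params and indices reach the infer and the axis is read from the primitive's "axis" attribute; otherwise it still comes from the third input. A minimal Python sketch of that dispatch order (hypothetical helper, not MindSpore API):

def resolve_gather_axis(prim_attrs, input_args):
    """Sketch of the axis-resolution order in the updated GatherInferShape.

    prim_attrs: dict of primitive attributes (may contain 'axis').
    input_args: abstract arguments; a third element, if present, carries the
    axis as a tensor or scalar value.  Hypothetical, for illustration only.
    """
    if len(input_args) == 2:        # axis was converted into an attribute
        return prim_attrs["axis"]
    return input_args[2]            # axis still arrives as the third input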

View File

@ -22,28 +22,76 @@
#include "utils/check_convert_utils.h"
#include "abstract/ops/primitive_infer_map.h"
#include "mindapi/src/helper.h"
#include "utils/ms_context.h"
namespace mindspore {
namespace ops {
namespace {
void AvgPool3DGrad::Init(const std::vector<int64_t> &kernel_size, const std::vector<int64_t> &strides,
const PadMode &pad_mode, const std::vector<int64_t> &pad_list, bool ceil_mode,
bool count_include_pad, int64_t divisor_override, const Format &format) {
set_kernel_size(kernel_size);
set_strides(strides);
set_pad_mode(pad_mode);
set_pad_list(pad_list);
set_ceil_mode(ceil_mode);
set_count_include_pad(count_include_pad);
set_divisor_override(divisor_override);
set_format(format);
}
void AvgPool3DGrad::set_pad_list(const std::vector<int64_t> &pad_list) {
const int64_t pad_size = 4;
(void)CheckAndConvertUtils::CheckInteger(kPadList, SizeToLong(pad_list.size()), kEqual, pad_size, name());
(void)AddAttr(kPadList, api::MakeValue(pad_list));
}
void AvgPool3DGrad::set_ceil_mode(bool ceil_mode) { (void)AddAttr(kCeilMode, api::MakeValue(ceil_mode)); }
void AvgPool3DGrad::set_count_include_pad(bool count_include_pad) {
(void)AddAttr(kCountIncludePad, api::MakeValue(count_include_pad));
}
void AvgPool3DGrad::set_divisor_override(int64_t divisor_override) {
(void)AddAttr(kDivisorOverride, api::MakeValue(divisor_override));
}
std::vector<int64_t> AvgPool3DGrad::get_pad_list() const {
auto value_ptr = GetAttr(kPadList);
return GetValue<std::vector<int64_t>>(value_ptr);
}
bool AvgPool3DGrad::get_ceil_mode() const { return GetValue<bool>(GetAttr(kCeilMode)); }
bool AvgPool3DGrad::get_count_include_pad() const { return GetValue<bool>(GetAttr(kCountIncludePad)); }
int64_t AvgPool3DGrad::get_divisor_override() const { return GetValue<int64_t>(GetAttr(kDivisorOverride)); }
abstract::ShapePtr AvgPool3DGradInferShape(const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto op_name = primitive->name();
const int64_t input_num = 2;
(void)CheckAndConvertUtils::CheckInteger("input size", SizeToLong(input_args.size()), kEqual, input_num, op_name);
const int64_t input_num = 1;
(void)CheckAndConvertUtils::CheckInteger("input size", SizeToLong(input_args.size()), kGreaterEqual, input_num,
op_name);
for (const auto &item : input_args) {
MS_EXCEPTION_IF_NULL(item);
}
auto grad_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[1]->GetShapeTrack())[kShape];
size_t grad_index = input_args.size() - 1;
auto grad_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[grad_index]->GetShapeTrack())[kShape];
constexpr int64_t k5DInputDims = 5;
(void)CheckAndConvertUtils::CheckInteger("grad_rank", SizeToLong(grad_shape.size()), kEqual, k5DInputDims, op_name);
std::vector<int64_t> origin_input_size;
if (input_args[0]->isa<abstract::AbstractTuple>()) { // origin_size is tuple
origin_input_size = GetValue<std::vector<int64_t>>(input_args[0]->BuildValue());
if (SizeToLong(input_args.size()) == input_num) {
auto shape_attr = primitive->GetAttr("origin_input_shape");
MS_EXCEPTION_IF_NULL(shape_attr);
origin_input_size = GetValue<ShapeVector>(shape_attr);
} else {
MS_LOG(EXCEPTION) << "For '" << op_name << "', the first input data size must be a tuple, but got: "
<< input_args[0]->BuildShape()->ToString() << ".";
if (input_args[0]->isa<abstract::AbstractTuple>()) { // origin_size is tuple
origin_input_size = GetValue<std::vector<int64_t>>(input_args[0]->BuildValue());
} else {
MS_LOG(EXCEPTION) << "For '" << op_name << "', the first input data size must be a tuple, but got: "
<< input_args[0]->BuildShape()->ToString() << ".";
}
}
return std::make_shared<abstract::Shape>(origin_input_size);
}
@ -51,18 +99,25 @@ abstract::ShapePtr AvgPool3DGradInferShape(const PrimitivePtr &primitive,
TypePtr AvgPool3DGradInferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto op_name = primitive->name();
const int64_t input_num = 2;
(void)CheckAndConvertUtils::CheckInteger("input size", SizeToLong(input_args.size()), kEqual, input_num, op_name);
const int64_t input_num = 1;
(void)CheckAndConvertUtils::CheckInteger("input size", SizeToLong(input_args.size()), kGreaterEqual, input_num,
op_name);
for (const auto &item : input_args) {
MS_EXCEPTION_IF_NULL(item);
}
auto grad_dtype = input_args[1]->BuildType();
const std::set<TypePtr> valid_types = {kFloat16, kFloat32};
size_t grad_index = input_args.size() - 1;
auto grad_dtype = input_args[grad_index]->BuildType();
std::set<TypePtr> valid_types = {kFloat16, kFloat32};
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
bool is_gpu = (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
if (is_gpu) {
valid_types = {kFloat16, kFloat32, kFloat64};
}
return CheckAndConvertUtils::CheckTensorTypeValid("grad", grad_dtype, valid_types, op_name);
}
} // namespace
MIND_API_OPERATOR_IMPL(AvgPool3DGrad, BaseOperator);
MIND_API_OPERATOR_IMPL(AvgPool3DGrad, PoolGrad);
AbstractBasePtr AvgPool3DGradInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
auto res = std::make_shared<abstract::AbstractTensor>(AvgPool3DGradInferType(primitive, input_args),
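
The reworked infer treats the gradient as the last input and resolves the output shape from either the "origin_input_shape" attribute (single-input form) or the leading shape tuple. A minimal Python sketch of that selection (hypothetical helper, not MindSpore API):

def resolve_avg_pool3d_grad_origin_shape(prim_attrs, input_args):
    """Sketch of the shape selection in the updated AvgPool3DGradInferShape.

    The gradient is always input_args[-1]; the output shape comes from the
    'origin_input_shape' attribute when only the gradient is passed, and from
    the leading origin-size tuple otherwise.  Hypothetical, illustration only.
    """
    grad = input_args[-1]
    if len(input_args) == 1:
        return prim_attrs["origin_input_shape"], grad
    if not isinstance(input_args[0], tuple):
        raise TypeError("the first input must be the origin-size tuple")
    return input_args[0], grad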

View File

@ -21,17 +21,33 @@
#include <vector>
#include <string>
#include <memory>
#include "ops/base_operator.h"
#include "ops/grad/pool_grad.h"
#include "mindapi/base/types.h"
#include "mindapi/base/format.h"
namespace mindspore {
namespace ops {
constexpr auto kNameAvgPool3DGrad = "AvgPool3DGrad";
class MIND_API AvgPool3DGrad : public BaseOperator {
class MIND_API AvgPool3DGrad : public PoolGrad {
public:
MIND_API_BASE_MEMBER(AvgPool3DGrad);
AvgPool3DGrad() : BaseOperator(kNameAvgPool3DGrad) { InitIOName({"origin_input_size", "grad"}, {"output"}); }
AvgPool3DGrad() : PoolGrad(kNameAvgPool3DGrad) { InitIOName({"origin_input_size", "grad"}, {"output"}); }
void Init(const std::vector<int64_t> &kernel_size = {1, 1, 1, 1, 1},
const std::vector<int64_t> &strides = {1, 1, 1, 1, 1}, const PadMode &pad_mode = VALID,
const std::vector<int64_t> &pad_list = {0, 0, 0, 0, 0, 0}, bool ceil_mode = false,
bool count_include_pad = true, int64_t divisor_override = 0, const Format &format = NCHW);
void set_pad_list(const std::vector<int64_t> &pad_list);
void set_ceil_mode(bool ceil_mode);
void set_count_include_pad(bool count_include_pad);
void set_divisor_override(int64_t divisor_override);
std::vector<int64_t> get_pad_list() const;
bool get_ceil_mode() const;
bool get_count_include_pad() const;
int64_t get_divisor_override() const;
};
abstract::AbstractBasePtr AvgPool3DGradInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
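
For quick reference, the defaults declared by the new Init signature above, written out as a plain Python mapping (symbolic strings stand in for the PadMode/Format enum values; illustrative only):

# Default attribute values of AvgPool3DGrad::Init, per the header above.
AVG_POOL3D_GRAD_INIT_DEFAULTS = {
    "kernel_size": [1, 1, 1, 1, 1],
    "strides": [1, 1, 1, 1, 1],
    "pad_mode": "VALID",
    "pad_list": [0, 0, 0, 0, 0, 0],
    "ceil_mode": False,
    "count_include_pad": True,
    "divisor_override": 0,
    "format": "NCHW",
}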

View File

@ -0,0 +1,86 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ops/grad/max_pool_3d_grad.h"
#include <memory>
#include <set>
#include <vector>
#include "ops/op_utils.h"
#include "utils/check_convert_utils.h"
#include "abstract/ops/primitive_infer_map.h"
#include "mindapi/src/helper.h"
namespace mindspore {
namespace ops {
void MaxPool3DGrad::Init(const std::vector<int64_t> &kernel_size, const std::vector<int64_t> &strides,
const PadMode &pad_mode, const std::vector<int64_t> &pad_list, const Format &format) {
set_kernel_size(kernel_size);
set_strides(strides);
set_pad_mode(pad_mode);
set_pad_list(pad_list);
set_format(format);
}
void MaxPool3DGrad::set_pad_list(const std::vector<int64_t> &pad_list) {
const int64_t pad_size = 4;
(void)CheckAndConvertUtils::CheckInteger(kPadList, SizeToLong(pad_list.size()), kEqual, pad_size, name());
(void)AddAttr(kPadList, api::MakeValue(pad_list));
}
std::vector<int64_t> MaxPool3DGrad::get_pad_list() const {
auto value_ptr = GetAttr(kPadList);
return GetValue<std::vector<int64_t>>(value_ptr);
}
abstract::ShapePtr MaxPool3DGradInferShape(const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto op_name = primitive->name();
const int64_t input_num = 3;
(void)CheckAndConvertUtils::CheckInteger("input size", SizeToLong(input_args.size()), kEqual, input_num, op_name);
for (const auto &item : input_args) {
MS_EXCEPTION_IF_NULL(item);
}
auto x_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->GetShapeTrack())[kShape];
constexpr int64_t k5DInputDims = 5;
(void)CheckAndConvertUtils::CheckInteger("input_rank", SizeToLong(x_shape.size()), kEqual, k5DInputDims, op_name);
return std::make_shared<abstract::Shape>(x_shape);
}
TypePtr MaxPool3DGradInferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto op_name = primitive->name();
const int64_t input_num = 3;
(void)CheckAndConvertUtils::CheckInteger("input size", SizeToLong(input_args.size()), kEqual, input_num, op_name);
for (const auto &item : input_args) {
MS_EXCEPTION_IF_NULL(item);
}
auto x_dtype = input_args[0]->BuildType();
const std::set<TypePtr> valid_types = {kFloat16, kFloat32};
return CheckAndConvertUtils::CheckTensorTypeValid("input", x_dtype, valid_types, op_name);
}
MIND_API_OPERATOR_IMPL(MaxPool3DGrad, PoolGrad);
AbstractBasePtr MaxPool3DGradInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
auto res = std::make_shared<abstract::AbstractTensor>(MaxPool3DGradInferType(primitive, input_args),
MaxPool3DGradInferShape(primitive, input_args)->shape());
return res;
}
REGISTER_PRIMITIVE_EVAL_IMPL(MaxPool3DGrad, prim::kPrimMaxPool3DGrad, MaxPool3DGradInfer, nullptr, true);
} // namespace ops
} // namespace mindspore
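
The two infer helpers above pin down a simple contract: exactly three inputs (x_origin, out_origin, grad), a 5-D x_origin whose shape becomes the output shape, and float16/float32 data. A small Python sketch of that contract (hypothetical helper, not MindSpore code):

def check_max_pool3d_grad_contract(x_shape, x_dtype):
    """Sketch of MaxPool3DGradInferShape/InferType: 5-D input, fp16/fp32 only,
    output shape equal to x_origin's shape.  Hypothetical, illustration only."""
    if len(x_shape) != 5:
        raise ValueError("input_rank must equal 5")
    if x_dtype not in ("float16", "float32"):
        raise TypeError("input must be float16 or float32")
    return list(x_shape)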

View File

@ -0,0 +1,47 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CORE_OPS_MAX_POOL_3D_GRAD_H_
#define MINDSPORE_CORE_OPS_MAX_POOL_3D_GRAD_H_
#include <map>
#include <vector>
#include <string>
#include <memory>
#include "ops/grad/pool_grad.h"
#include "mindapi/base/types.h"
namespace mindspore {
namespace ops {
constexpr auto kNameMaxPool3DGrad = "MaxPool3DGrad";
class MIND_API MaxPool3DGrad : public PoolGrad {
public:
MIND_API_BASE_MEMBER(MaxPool3DGrad);
MaxPool3DGrad() : PoolGrad(kNameMaxPool3DGrad) { InitIOName({"x_origin", "out_origin", "grad"}, {"output"}); }
void Init(const std::vector<int64_t> &kernel_size = {1, 1, 1, 1, 1},
const std::vector<int64_t> &strides = {1, 1, 1, 1, 1}, const PadMode &pad_mode = VALID,
const std::vector<int64_t> &pad_list = {0, 0, 0, 0, 0, 0}, const Format &format = NCHW);
void set_pad_list(const std::vector<int64_t> &pad_list);
std::vector<int64_t> get_pad_list() const;
};
abstract::AbstractBasePtr MaxPool3DGradInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<abstract::AbstractBasePtr> &input_args);
using PrimMaxPool3DGradPtr = std::shared_ptr<MaxPool3DGrad>;
} // namespace ops
} // namespace mindspore
#endif // MINDSPORE_CORE_OPS_MAX_POOL_3D_GRAD_H_

View File

@ -15,11 +15,26 @@
*/
#include "ops/grad/pool_grad.h"
#include <cctype>
#include "ops/op_utils.h"
#include "mindapi/src/helper.h"
namespace mindspore {
namespace ops {
static std::map<std::string, int64_t> pad_map = {
{"CALCULATED", PadMode::PAD},
{"PAD", PadMode::PAD},
{"SAME", PadMode::SAME},
{"VALID", PadMode::VALID},
};
static std::map<std::string, int64_t> dataformat_map = {
{"NCHW", Format::NCHW},
{"NHWC", Format::NHWC},
{"NCDHW", Format::NCDHW},
{"NDHWC", Format::NDHWC},
};
MIND_API_OPERATOR_IMPL(PoolGrad, BaseOperator);
std::vector<int64_t> PoolGrad::_grad_check_vector(const std::string &arg_name, std::vector<int64_t> arg_val,
const std::string &op_name) {
@ -91,13 +106,31 @@ std::vector<int64_t> PoolGrad::get_strides() const {
PadMode PoolGrad::get_pad_mode() const {
auto value_ptr = GetAttr(kPadMode);
MS_EXCEPTION_IF_NULL(value_ptr);
return PadMode(GetValue<int64_t>(value_ptr));
if (!value_ptr->isa<mindspore::api::StringImm>()) {
return PadMode(GetValue<int64_t>(value_ptr));
}
auto attr_value_str = GetValue<std::string>(value_ptr);
std::transform(attr_value_str.begin(), attr_value_str.end(), attr_value_str.begin(), toupper);
auto iter = pad_map.find(attr_value_str);
if (iter == pad_map.end()) {
MS_LOG(EXCEPTION) << "Invalid pad mode " << attr_value_str << " use CALCULATED, PAD, VALID or SAME";
}
return PadMode(iter->second);
}
Format PoolGrad::get_format() const {
auto value_ptr = GetAttr(kFormat);
MS_EXCEPTION_IF_NULL(value_ptr);
return Format(GetValue<int64_t>(value_ptr));
if (!value_ptr->isa<mindspore::api::StringImm>()) {
return Format(GetValue<int64_t>(value_ptr));
}
auto attr_value_str = GetValue<std::string>(value_ptr);
std::transform(attr_value_str.begin(), attr_value_str.end(), attr_value_str.begin(), toupper);
auto iter = dataformat_map.find(attr_value_str);
if (iter == dataformat_map.end()) {
MS_LOG(EXCEPTION) << "Invalid format " << attr_value_str << " use NCHW, NHWC NCDHW or NDHWC";
}
return Format(iter->second);
}
REGISTER_PRIMITIVE_C(kNamePoolGrad, PoolGrad);
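
With this change PoolGrad::get_pad_mode and PoolGrad::get_format accept attributes stored either as enum integers or as strings. A small Python sketch of the same normalization (the tables mirror pad_map and dataformat_map above; symbolic names stand in for the C++ enum values, so this is illustrative only):

PAD_MAP = {"CALCULATED": "PAD", "PAD": "PAD", "SAME": "SAME", "VALID": "VALID"}
FORMAT_MAP = {"NCHW": "NCHW", "NHWC": "NHWC", "NCDHW": "NCDHW", "NDHWC": "NDHWC"}

def normalize_pool_grad_attr(value, table, what):
    """Integers pass through as enum values; strings are upper-cased and mapped."""
    if isinstance(value, int):
        return value
    key = str(value).upper()
    if key not in table:
        raise ValueError("Invalid {} {}, use one of {}".format(what, value, sorted(table)))
    return table[key]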

View File

@ -130,6 +130,34 @@ def get_avg_pool_grad_vmap_rule(prim, axis_size):
    return vmap_rule


@vmap_rules_getters.register(G.AvgPool3DGrad)
def get_avg_pool3d_grad_vmap_rule(prim, axis_size):
    """VmapRule for `AvgPool3DGrad`."""
    cdhw_reverse_index = -4

    def vmap_rule(shape_bdim, dy_bdim):
        is_all_none, result = vmap_general_preprocess(prim, shape_bdim, dy_bdim)
        if is_all_none:
            return result

        shape, shape_dim = shape_bdim
        dy, dy_dim = dy_bdim
        if shape_dim is not None:
            _raise_value_error("The source axis of 'origin_input_shape' in 'AvgPool3DGrad' must be None, "
                               "but got {}.".format(shape_dim))
        dy = _bdim_at_front(dy, dy_dim, axis_size)
        dy_shape = F.shape(dy)
        dy = F.reshape(dy, (-1,) + dy_shape[cdhw_reverse_index:])
        input_shape = (F.shape(dy)[0],) + shape[cdhw_reverse_index:]
        out = prim(input_shape, dy)
        out_shape = F.shape(out)
        return_shape = dy_shape[:cdhw_reverse_index] + out_shape[cdhw_reverse_index:]
        out = F.reshape(out, return_shape)
        return (out, 0)

    return vmap_rule
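
To make the reshape bookkeeping in get_avg_pool3d_grad_vmap_rule concrete, here is a small Python sketch with hypothetical sizes (the names B, N and origin_cdhw are illustrative only, not part of the change): a batched dout of shape (B, N, C, D, H, W) is flattened so the extra vmap axis is merged into N, the un-batched AvgPool3DGrad runs once, and the result is reshaped so the vmap axis reappears in front.

# Illustrative sketch only; plain tuple arithmetic, no MindSpore required.
B, N, C, D, H, W = 2, 1, 3, 2, 3, 4
dy_shape = (B, N, C, D, H, W)
flat_dy_shape = (-1,) + dy_shape[-4:]          # (-1, C, D, H, W): vmap axis merged into N
origin_cdhw = (3, 3, 4, 5)                     # per-sample (C, D_in, H_in, W_in) of the forward input
input_shape = (B * N,) + origin_cdhw           # shape handed to the un-batched primitive
return_shape = dy_shape[:-4] + origin_cdhw     # (B, N, C, D_in, H_in, W_in) after the final reshape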


@vmap_rules_getters.register(G.CdistGrad)
def get_cdist_grad_vmap_rule(prim, axis_size):
    """VmapRule for `cdist grad` operation."""

View File

@ -1046,12 +1046,13 @@ def _get_max_pool3d_grad_pads_by_pad_mode(input_shape, kernel_size, strides, pad
    return pads


class MaxPool3DGrad(PrimitiveWithInfer):
class MaxPool3DGrad(Primitive):
    """Gradients of the max pool3d operation."""

    @prim_attr_register
    def __init__(self, kernel_size=(1, 1, 1, 1, 1), strides=(1, 1, 1, 1, 1),
                 pad_mode='VALID', pad_list=0, data_format="NCDHW"):
        self.init_prim_io_names(inputs=['x_origin', 'out_origin', 'grad'], outputs=['output'])
        validator.check_value_type('kernel_size', kernel_size, [int, tuple], self.name)
        validator.check_value_type('strides', strides, [int, tuple], self.name)
        validator.check_value_type('pad_mode', pad_mode, [str], self.name)
@ -1080,15 +1081,6 @@ class MaxPool3DGrad(PrimitiveWithInfer):
            validator.check_non_negative_int(item, 'pad_list item', self.name)
        self.add_prim_attr("pad_list", self.pad_list)

    def infer_shape(self, x_shape, y_shape, grad_shape):
        validator.check_equal_int(len(x_shape), 5, "x rank", self.name)
        return x_shape

    def infer_dtype(self, x_dtype, y_dtype, grad_dtype):
        args = {'x_dtype': x_dtype, 'y_dtype': y_dtype, 'grad_dtype': grad_dtype}
        validator.check_tensors_dtypes_same_and_valid(args, [mstype.float16, mstype.float32], self.name)
        return x_dtype


class MaxPool3DGradGrad(PrimitiveWithInfer):
    """Gradients of the max pool3d grad operation."""

View File

@ -341,3 +341,62 @@ def test_avgpool_grad_vmap():
    nest_vmap = vmap(vmap(net, in_axes=in_axes, out_axes=0), in_axes=in_axes, out_axes=0)
    out = nest_vmap(x, sens)
    assert out[0].shape == (6, 3, 1, 1, 6, 6)


class DynamicShapeAvgPool3DGrad(nn.Cell):
    def __init__(self, net, axis=0):
        super(DynamicShapeAvgPool3DGrad, self).__init__()
        self.net = net
        self.unique = P.Unique()
        self.gather = P.Gather()
        self.axis = axis

    def construct(self, x_shape, sens, indices):
        unique_indices, _ = self.unique(indices)
        sens = self.gather(sens, unique_indices, self.axis)
        return self.net(x_shape, sens)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_avgpool3d_grad_dynamic_shape():
    """
    Feature: AvgPool3dGrad dynamic test.
    Description: Run unique and gather ops before AvgPool3dGrad.
    Expectation: success.
    """
    x_shape = (1, 3, 2, 3, 4)
    x = Tensor(np.arange(reduce(lambda x, y: x * y, x_shape))).reshape(x_shape).astype(np.float32)
    avgpool = AvgPool(dim=3, kernel_size=2, strides=1, pad_mode='VALID')
    expect_output = np.array([[[[[8.5, 9.5, 10.5],
                                 [12.5, 13.5, 14.5]]],
                               [[[32.5, 33.5, 34.5],
                                 [36.5, 37.5, 38.5]]],
                               [[[56.5, 57.5, 58.5],
                                 [60.5, 61.5, 62.5]]]]]).astype(np.float32)
    avgpool_grad = AvgPoolGrad(avgpool)
    net = DynamicShapeAvgPool3DGrad(avgpool_grad)
    sens = Tensor(expect_output) + 1
    indices = Tensor(np.array([0]).astype(np.int32))
    actual_grad = net(x, sens, indices)
    expect_grad = np.array([[[[[1.1875, 2.5, 2.75, 1.4375],
                               [2.875, 6., 6.5, 3.375],
                               [1.6875, 3.5, 3.75, 1.9375]],
                              [[1.1875, 2.5, 2.75, 1.4375],
                               [2.875, 6., 6.5, 3.375],
                               [1.6875, 3.5, 3.75, 1.9375]]],
                             [[[4.1875, 8.5, 8.75, 4.4375],
                               [8.875, 18., 18.5, 9.375],
                               [4.6875, 9.5, 9.75, 4.9375]],
                              [[4.1875, 8.5, 8.75, 4.4375],
                               [8.875, 18., 18.5, 9.375],
                               [4.6875, 9.5, 9.75, 4.9375]]],
                             [[[7.1875, 14.5, 14.75, 7.4375],
                               [14.875, 30., 30.5, 15.375],
                               [7.6875, 15.5, 15.75, 7.9375]],
                              [[7.1875, 14.5, 14.75, 7.4375],
                               [14.875, 30., 30.5, 15.375],
                               [7.6875, 15.5, 15.75, 7.9375]]]]]).astype(np.float32)
    assert np.allclose(actual_grad[0].asnumpy(), expect_grad)