!30589 [DynamicShape][GPU] Add dynamic shape support to some ops for the new network

Merge pull request !30589 from hanhuifeng/dcn_dyn_ops_2_new
This commit is contained in:
i-robot 2022-03-01 06:42:59 +00:00 committed by Gitee
commit 9a5268f2be
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
14 changed files with 477 additions and 133 deletions

@ -56,5 +56,22 @@ MS_REG_GPU_KERNEL_ONE(ReduceProd, KernelAttr().AddInputAttr(kNumberTypeFloat32).
ArrayReduceGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(ReduceProd, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
ArrayReduceGpuKernelMod, double)
// dynamic
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
ArrayReduceGpuKernelMod, double, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
ArrayReduceGpuKernelMod, float, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
ArrayReduceGpuKernelMod, half, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
ArrayReduceGpuKernelMod, bool, int64_t)
} // namespace kernel
} // namespace mindspore
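
The registrations above give ReduceSum a second int64 input that carries the reduction axis as a tensor instead of a compile-time attribute. As a rough front-end illustration, a cell along the lines of the ReduceSumNet test added at the end of this PR would dispatch to these kernels; the class name here is illustrative and, as in the tests, the graph is expected to run through the dynamic-shape sink path:

import mindspore.nn as nn
import mindspore.ops as ops

class DynAxisReduceSumNet(nn.Cell):
    """Sketch: ReduceSum whose axis arrives as an int64 tensor input."""
    def __init__(self):
        super(DynAxisReduceSumNet, self).__init__()
        self.reduce = ops.ReduceSum()

    def construct(self, x, axis):
        # 'axis' is a Tensor, so on GPU the kernel is selected through the
        # MS_REG_GPU_KERNEL_TWO(ReduceSum, ...) registrations above.
        return self.reduce(x, axis)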

@ -32,7 +32,7 @@ const std::map<std::string, cudnnReduceTensorOp_t> kReduceTypeMap = {
{"ReduceAny", CUDNN_REDUCE_TENSOR_MAX}, {"ReduceAll", CUDNN_REDUCE_TENSOR_MUL},
{"ReduceProd", CUDNN_REDUCE_TENSOR_MUL},
};
template <typename T>
template <typename T, typename S = int64_t>
class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
public:
ArrayReduceGpuKernelMod() { ResetResource(); }
@ -43,6 +43,9 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
if (is_null_input_) {
return true;
}
if (is_dynamic_axis_ && !get_dynamic_axis_value_) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', fail to get value of the axis when axis is dynamic!";
}
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
T *workspace_addr = GetPossiblyNullDeviceAddress<T>(workspace, 0);
@ -87,8 +90,13 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
}
data_type_ = GetCudnnDataType(type_name);
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
constexpr size_t kDynamicAxisInputNum = 2;
if (input_num != 1 && input_num != kDynamicAxisInputNum) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1 or " << kDynamicAxisInputNum
<< ", but got " << input_num;
}
if (input_num == kDynamicAxisInputNum) {
is_dynamic_axis_ = true;
}
size_t output_num = common::AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
@ -96,29 +104,28 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
}
int input_dim_length = SizeToInt(AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0).size());
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
if (prim->GetAttr("axis")->isa<ValueTuple>() || prim->GetAttr("axis")->isa<ValueList>()) {
std::vector<int> attr_axis;
std::vector<int64_t> attr_axis_me = GetAttr<std::vector<int64_t>>(kernel_node, "axis");
(void)std::transform(attr_axis_me.begin(), attr_axis_me.end(), std::back_inserter(attr_axis),
[](const int64_t &value) { return static_cast<int>(value); });
if (attr_axis.empty()) {
axis_.push_back(-1);
} else {
for (auto axis : attr_axis) {
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
}
std::sort(axis_.begin(), axis_.end());
auto multiple_pos = std::unique(axis_.begin(), axis_.end());
axis_.erase(multiple_pos, axis_.end());
std::vector<int64_t> attr_axis;
if (is_dynamic_axis_) {
get_dynamic_axis_value_ = GetDynamicAttrIntValue(kernel_node, kAxisIndex_, &attr_axis);
if (!get_dynamic_axis_value_) {
InitSizeLists();
return true;
}
} else if (prim->GetAttr("axis")->isa<Int64Imm>()) {
int axis = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
dynamic_axis_size_ = attr_axis.size();
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute 'axis' type is invalid.";
attr_axis = GetAxisValue(kernel_node);
}
if (attr_axis.empty()) {
axis_.push_back(-1);
} else {
for (auto axis : attr_axis) {
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
}
std::sort(axis_.begin(), axis_.end());
auto multiple_pos = std::unique(axis_.begin(), axis_.end());
axis_.erase(multiple_pos, axis_.end());
}
keep_dims_ = GetAttr<bool>(kernel_node, "keep_dims");
auto inputA_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
@ -156,6 +163,9 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
dynamic_axis_size_ = 0;
is_dynamic_axis_ = false;
get_dynamic_axis_value_ = false;
}
void DestroyResource() noexcept override {
@ -182,6 +192,10 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
"cudnnGetTensorSizeInBytes failed.");
input_size_list_.push_back(input_size_);
if (is_dynamic_axis_) {
input_size_list_.push_back(dynamic_axis_size_ * sizeof(S));
}
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_),
"cudnnGetTensorSizeInBytes failed.");
output_size_list_.push_back(output_size_);
@ -279,6 +293,21 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
return;
}
std::vector<int64_t> GetAxisValue(const CNodePtr &kernel_node) {
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
std::vector<int64_t> attr_axis_me;
if (prim->GetAttr("axis")->isa<ValueTuple>() || prim->GetAttr("axis")->isa<ValueList>()) {
attr_axis_me = GetAttr<std::vector<int64_t>>(kernel_node, "axis");
} else if (prim->GetAttr("axis")->isa<Int64Imm>()) {
auto axis = GetAttr<int64_t>(kernel_node, "axis");
attr_axis_me.push_back(axis);
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute 'axis' type is invalid.";
}
return attr_axis_me;
}
cudnnHandle_t cudnn_handle_;
cudnnReduceTensorOp_t reduce_tensor_op_;
cudnnDataType_t data_type_;
@ -295,7 +324,10 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
size_t input_size_;
size_t output_size_;
size_t workspace_size_;
std::string kernel_name_;
size_t dynamic_axis_size_;
bool is_dynamic_axis_;
bool get_dynamic_axis_value_;
static constexpr size_t kAxisIndex_{1};
};
} // namespace kernel
} // namespace mindspore

@ -33,18 +33,18 @@ namespace kernel {
REG_SLICE_GPU_DTYPES(REG_SLICE_GPU)
#define REG_DYNAMIC_SLICE_GPU_ATTR(T0_MS_DTYPE, T0_DTYPE, T1_MS_DTYPE) \
MS_REG_GPU_KERNEL_ONE(Slice, \
KernelAttr() \
.AddInputAttr(T0_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddOutputAttr(T0_MS_DTYPE), \
SliceFwdGpuKernelMod, T0_DTYPE)
#define REG_DYNAMIC_SLICE_GPU_ATTR(T0_MS_DTYPE, T0_DTYPE, T1_MS_DTYPE, T1_DTYPE) \
MS_REG_GPU_KERNEL_TWO(Slice, \
KernelAttr() \
.AddInputAttr(T0_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddOutputAttr(T0_MS_DTYPE), \
SliceFwdGpuKernelMod, T0_DTYPE, T1_DTYPE)
#define REG_DYNAMIC_SLICE_GPU(MS_DTYPE, DTYPE) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt32) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt64)
#define REG_DYNAMIC_SLICE_GPU(MS_DTYPE, DTYPE) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt32, int32_t) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt64, int64_t)
REG_SLICE_GPU_DTYPES(REG_DYNAMIC_SLICE_GPU)
} // namespace kernel
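
With the second template argument added, the dynamic Slice registrations also record whether begin/size arrive as int32 or int64 tensors. A minimal front-end sketch, assuming the standard ops.Slice primitive (the class name is illustrative; tensor-valued begin/size are only resolvable once InferShape has filled depend_tensor_map_, as in the dynamic-shape tests of this PR):

import mindspore.nn as nn
import mindspore.ops as ops

class DynSliceNet(nn.Cell):
    """Sketch: Slice whose begin and size arrive as int32/int64 tensors."""
    def __init__(self):
        super(DynSliceNet, self).__init__()
        self.slice = ops.Slice()

    def construct(self, x, begin, size):
        # Tensor-valued begin/size match the extra T1_MS_DTYPE inputs added by
        # REG_DYNAMIC_SLICE_GPU_ATTR; their host values are read back through
        # GetDynamicAttrIntValue after InferShape has run.
        return self.slice(x, begin, size)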

@ -40,7 +40,7 @@ constexpr auto kIdx4 = 4;
constexpr auto kIdx5 = 5;
constexpr auto kIdx6 = 6;
template <typename T>
template <typename T, typename S = int64_t>
class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
public:
SliceFwdGpuKernelMod() { kernel_name_ = "Slice"; }
@ -106,7 +106,10 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
(void)CheckParam(kernel_node);
if (is_dynamic_attr_ && !get_dynamic_attr_value_) {
InitSizeLists();
return true;
}
auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
auto out_shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, 0);
is_null_input_ =
@ -118,27 +121,15 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
(void)std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(input_shape_),
[](const int64_t &e) { return static_cast<int32_t>(e); });
size_t input_size = sizeof(T);
input_size_ = sizeof(T);
for (size_t x : input_shape) {
input_size *= x;
}
input_size_list_.push_back(input_size);
if (is_dynamic_attr_) {
std::vector<size_t> dynamic_attr_indexs = {kBeginIndex_, kSizeIndex_};
for (size_t index : dynamic_attr_indexs) {
input_size = sizeof(T);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
input_size_ *= x;
}
size_t output_size = sizeof(T);
output_size_ = sizeof(T);
for (size_t x : out_shape) {
output_size *= x;
output_size_ *= x;
}
output_size_list_.push_back(output_size);
// transpose begin and size for NHWC data
auto data_format = AnfAlgo::GetInputFormat(kernel_node, 0);
@ -166,6 +157,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
begin_.clear();
size_.clear();
input_shape_.clear();
input_size_ = 0;
output_size_ = 0;
is_null_input_ = false;
kernel_name_ = "Slice";
is_dynamic_attr_ = false;
@ -173,7 +166,29 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
}
protected:
void InitSizeLists() override {}
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
if (is_dynamic_attr_) {
InitDynamicAttrSizeLists(kernel_node_.lock());
}
}
inline void InitDynamicAttrSizeLists(const CNodePtr &kernel_node) {
if (get_dynamic_attr_value_) {
std::vector<size_t> dynamic_attr_indexs = {kBeginIndex_, kSizeIndex_};
for (size_t index : dynamic_attr_indexs) {
size_t input_size = sizeof(S);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
} else {
input_size_list_.push_back(0);
input_size_list_.push_back(0);
}
}
private:
void CheckParam(const CNodePtr &kernel_node) {
@ -205,13 +220,13 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
size = GetAttr<std::vector<int64_t>>(kernel_node, "size");
begin = GetAttr<std::vector<int64_t>>(kernel_node, "begin");
} else {
// The value of dynamic attr can only be obtained after the InferShape() of dynamic kernel is executed
if (depend_tensor_map_.empty()) {
if (GetDynamicAttrIntValue(kernel_node, kBeginIndex_, &begin) &&
GetDynamicAttrIntValue(kernel_node, kSizeIndex_, &size)) {
get_dynamic_attr_value_ = true;
}
if (!get_dynamic_attr_value_) {
return;
}
begin = GetDynamicAttrIntValue(kernel_node, kBeginIndex_);
size = GetDynamicAttrIntValue(kernel_node, kSizeIndex_);
get_dynamic_attr_value_ = true;
}
if (size.size() != input_shape.size() || begin.size() != input_shape.size()) {
@ -220,9 +235,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
<< "of size: " << size.size() << ", the dimension of begin: " << begin.size()
<< ", the dimension of input_x: " << input_shape.size();
}
const int64_t NEG_ONE = -1;
for (size_t i = 0; i < input_shape.size(); i++) {
if (size[i] == NEG_ONE) {
if (size[i] == -1) {
size[i] = input_shape[i] - begin[i];
}
if (input_shape[i] <= 0 || size[i] <= 0) {
@ -245,6 +259,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
std::vector<int32_t> size_;
std::vector<int32_t> input_shape_;
size_t input_size_{0};
size_t output_size_{0};
bool is_null_input_{false};
bool is_dynamic_attr_{false};
bool get_dynamic_attr_value_{false};

@ -346,9 +346,12 @@ class NativeGpuKernelMod : public GpuKernelMod {
return std::equal(s1.begin(), s1.end(), s2_trans.begin(), s2_trans.end());
}
inline std::vector<int64_t> GetDynamicAttrIntValue(const CNodePtr &kernel_node, const size_t input_index) {
inline bool GetDynamicAttrIntValue(const CNodePtr &kernel_node, const size_t input_index,
std::vector<int64_t> *attr_value) {
// The value of dynamic attr can only be obtained after the InferShape() is executed
if (depend_tensor_map_.empty()) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the depend_tensor_map is empty!";
MS_LOG(DEBUG) << "For '" << kernel_name_ << "', the depend_tensor_map is currently empty";
return false;
}
auto depend_iter = depend_tensor_map_.find(input_index);
if (depend_iter == depend_tensor_map_.end()) {
@ -366,7 +369,8 @@ class NativeGpuKernelMod : public GpuKernelMod {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the format of the " << input_index
<< "th input currently should be the default format and does not support " << data_format;
}
return GetTensorIntValue(input_tensor, input_index);
*attr_value = GetTensorIntValue(input_tensor, input_index);
return true;
}
};
} // namespace kernel

@ -33,30 +33,7 @@ constexpr size_t NO_CUDNN_BATCHNORM_OPS_BN_INPUT_NUM = 8;
template <typename T>
class BatchNormGradGpuKernelMod : public NativeGpuKernelMod {
public:
BatchNormGradGpuKernelMod()
: batch_(0),
channel_(0),
height_(0),
width_(0),
x_size_(0),
para_size_(0),
workspace_size_(0),
reserve_size_(0),
mode_(CUDNN_BATCHNORM_SPATIAL),
bn_ops_(CUDNN_BATCHNORM_OPS_BN),
epsilon_(10e-5),
is_train_(false),
is_null_input_(false),
x_desc_(nullptr),
y_desc_(nullptr),
dy_desc_(nullptr),
dx_desc_(nullptr),
dz_desc_(nullptr),
scale_bias_diff_desc_(nullptr),
activation_desc_(nullptr),
handle_(nullptr),
cudnn_data_type_(CUDNN_DATA_FLOAT),
beta_data_diff_(0) {}
BatchNormGradGpuKernelMod() { ResetResource(); }
~BatchNormGradGpuKernelMod() override { DestroyResource(); }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
@ -171,6 +148,33 @@ class BatchNormGradGpuKernelMod : public NativeGpuKernelMod {
return true;
}
void ResetResource() noexcept override {
ResetSizeLists();
batch_ = 0;
channel_ = 0;
height_ = 0;
width_ = 0;
x_size_ = 0;
para_size_ = 0;
workspace_size_ = 0;
reserve_size_ = 0;
mode_ = CUDNN_BATCHNORM_SPATIAL;
bn_ops_ = CUDNN_BATCHNORM_OPS_BN;
epsilon_ = 10e-5;
is_train_ = false;
is_null_input_ = false;
x_desc_ = nullptr;
y_desc_ = nullptr;
dy_desc_ = nullptr;
dx_desc_ = nullptr;
dz_desc_ = nullptr;
scale_bias_diff_desc_ = nullptr;
activation_desc_ = nullptr;
handle_ = nullptr;
cudnn_data_type_ = CUDNN_DATA_FLOAT;
beta_data_diff_ = 0;
}
protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();

@ -18,6 +18,7 @@
namespace mindspore {
namespace kernel {
namespace {
// float64
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double)
@ -25,6 +26,7 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat64).Add
FlattenFwdGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double)
// float32
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float)
@ -32,6 +34,7 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat32).Add
FlattenFwdGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float)
// float16
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half)
@ -39,6 +42,7 @@ MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat16).
FlattenFwdGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half)
// int64
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t)
@ -46,13 +50,15 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOu
FlattenFwdGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t)
// int32
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
// int16
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
FlattenFwdGpuKernelMod, int16_t)
@ -107,5 +113,71 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeBool).AddOut
FlattenFwdGpuKernelMod, bool)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
FlattenFwdGpuKernelMod, bool)
} // namespace
// dynamic kernel:
namespace {
// float64
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double, int64_t)
// float32
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float, int64_t)
// float16
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half, int64_t)
// int64
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t, int64_t)
// int32
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int32_t, int64_t)
// int16
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
FlattenFwdGpuKernelMod, int16_t, int64_t)
// int8
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt8),
FlattenFwdGpuKernelMod, char, int64_t)
// uint64
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt64),
FlattenFwdGpuKernelMod, uint64_t, int64_t)
// uint32
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt32),
FlattenFwdGpuKernelMod, uint, int64_t)
// uint16
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt16),
FlattenFwdGpuKernelMod, uint16_t, int64_t)
// uint8
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt8),
FlattenFwdGpuKernelMod, uchar, int64_t)
// bool
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
FlattenFwdGpuKernelMod, bool, int64_t)
} // namespace
} // namespace kernel
} // namespace mindspore
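
The dynamic Reshape registrations above accept the target shape as an int64 tensor input. The ReshapeNet test added at the end of this PR exercises exactly this path; condensed here as a sketch for reference (standard MindSpore ops API assumed):

import mindspore.nn as nn
import mindspore.ops as ops

class ReshapeNet(nn.Cell):
    """Sketch: Reshape whose target shape is computed at run time."""
    def construct(self, x, y):
        # DynamicShape yields y's shape as an int64 tensor, so Reshape resolves
        # to the MS_REG_GPU_KERNEL_TWO(Reshape, ...) registrations above.
        shape_of_y = ops.DynamicShape()(y)
        return ops.Reshape()(x, shape_of_y)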

@ -25,7 +25,7 @@
namespace mindspore {
namespace kernel {
template <typename T>
template <typename T, typename S = int64_t>
class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
public:
FlattenFwdGpuKernelMod() : input_size_(0), is_null_input_(false) {}
@ -48,6 +48,7 @@ class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
auto shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
kernel_node_ = kernel_node;
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
@ -75,6 +76,21 @@ class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(input_size_);
InitDynamicAttrSizeLists(kernel_node_.lock());
}
inline void InitDynamicAttrSizeLists(const CNodePtr &kernel_node) {
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num == 1) {
return;
}
for (size_t index = 1; index < input_num; ++index) {
size_t input_size = sizeof(S);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
}
private:

@ -49,6 +49,8 @@ class DynamicBroadcastGradientArgsGpuKernelMod : public NativeGpuKernelMod {
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_, cudaMemcpyAsync(&x1_value[0], s1_addr, input_size_list_[1], cudaMemcpyDeviceToHost, cuda_stream),
"DynamicBroadcastGradientArgs copy s1 value failed");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(cuda_stream),
"DynamicBroadcastGradientArgs cudaStreamSynchronized failed");
auto grad_reduce_idx = CalOut({x0_value, x1_value});
r0_size_ = SetOuputValue(r0_addr, grad_reduce_idx[0], x0_value.size(), cuda_stream);
r1_size_ = SetOuputValue(r1_addr, grad_reduce_idx[1], x1_value.size(), cuda_stream);

@ -84,7 +84,8 @@ std::set<int64_t> GetDependsFormMap(const CNodePtr &cnode) {
{kReshape, ShapeSet{1}},
{kSlice, ShapeSet{1, 2}},
{kSliceGrad, ShapeSet{2, 3}},
{kDynamicBroadcastTo, ShapeSet{1}}};
{kDynamicBroadcastTo, ShapeSet{1}},
{kReduceSum, ShapeSet{1}}};
MS_EXCEPTION_IF_NULL(cnode);
if (cnode->inputs().empty()) {
@ -98,7 +99,7 @@ std::set<int64_t> GetDependsFormMap(const CNodePtr &cnode) {
MS_EXCEPTION_IF_NULL(ms_context);
auto device = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
// Special dynamic shape depends for Ascend.
if (device == kAscendDevice && (prim_name == kReduceSum || prim_name == kTranspose)) {
if (device == kAscendDevice && prim_name == kTranspose) {
return {1};
}

@ -45,27 +45,34 @@ bool BatchNormGrad::get_is_training() const {
return GetValue<bool>(value_ptr);
}
constexpr auto kInputNum = 6;
abstract::TupleShapePtr BatchNormGradInferShape(const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto prim_name = primitive->name();
(void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, kInputNum, prim_name);
auto x_shape_ptr = input_args[kInputIndex1]->BuildShape();
auto scale_shape_ptr = input_args[kInputIndex2]->BuildShape();
return std::make_shared<abstract::TupleShape>(
std::vector<abstract::BaseShapePtr>{x_shape_ptr, scale_shape_ptr, scale_shape_ptr});
}
TuplePtr BatchNormGradInferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto prim_name = primitive->name();
(void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, kInputNum, prim_name);
auto x_type_ptr = input_args[kInputIndex1]->BuildType();
auto scale_type_ptr = input_args[kInputIndex2]->BuildType();
return std::make_shared<Tuple>(std::vector<TypePtr>{x_type_ptr, scale_type_ptr, scale_type_ptr});
}
AbstractBasePtr BatchNormGradInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
for (auto item : input_args) {
MS_EXCEPTION_IF_NULL(item);
}
const int64_t input_num = 5;
(void)CheckAndConvertUtils::CheckInteger("BatchNormGrad infer", SizeToLong(input_args.size()), kGreaterEqual,
input_num, primitive->name());
auto y_backprop_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape];
auto x_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[1]->BuildShape())[kShape];
CheckAndConvertUtils::Check("BatchNorm y_backprop_shape", y_backprop_shape, kEqual, x_shape);
auto dx = input_args[kInputIndex1]->Broaden();
auto dscale = input_args[kInputIndex2]->Broaden();
auto reserve_1 = input_args[kInputIndex3]->Broaden();
auto reserve_2 = input_args[kInputIndex4]->Broaden();
AbstractBasePtrList rets = {dx, dscale, dscale, reserve_1, reserve_2};
return std::make_shared<abstract::AbstractTuple>(rets);
return abstract::MakeAbstract(BatchNormGradInferShape(primitive, input_args),
BatchNormGradInferType(primitive, input_args));
}
REGISTER_PRIMITIVE_C(kNameBatchNormGrad, BatchNormGrad);
REGISTER_PRIMITIVE_EVAL_IMPL(BatchNormGrad, prim::kPrimBatchNormGrad, BatchNormGradInfer, nullptr, true);
} // namespace ops
} // namespace mindspore

@ -118,7 +118,7 @@ class SqrtGrad(PrimitiveWithInfer):
return x_dtype
class BatchNormGrad(PrimitiveWithInfer):
class BatchNormGrad(Primitive):
"""Performs grad of BatchNorm operation."""
@prim_attr_register
@ -127,13 +127,6 @@ class BatchNormGrad(PrimitiveWithInfer):
self.epsilon = validator.check_float_range(epsilon, 0, 1, Rel.INC_RIGHT, 'epsilon', self.name)
self.data_format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.name)
def infer_shape(self, y_backprop_shape, x_shape, scale_shape, save_mean_shape, save_variance_shape, reserve):
validator.check("BatchNorm y_backprop_shape", y_backprop_shape, "BatchNorm x_shape", x_shape)
return (x_shape, scale_shape, scale_shape)
def infer_dtype(self, y_backprop_type, x_type, scale_type, save_mean_shape, save_variance_shape, reserve):
return (x_type, scale_type, scale_type)
class SyncBatchNormGrad(PrimitiveWithInfer):
"""Performs grad of SyncBatchNorm operation."""

@ -0,0 +1,52 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.nn as nn
import mindspore.context as context
from mindspore.ops.operations import _inner_ops
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.args = _inner_ops.DynamicBroadcastGradientArgs()
def construct(self, s0, s1):
return self.args(s0, s1)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_net():
"""
Feature: DynamicBroadcastGradientArgs op.
Description: test cases for DynamicBroadcastGradientArgs op.
Expectation: the result match expected array.
"""
shape0 = (4, 2, 1)
shape1 = (2, 7)
net = Net()
r0, r1 = net(shape0, shape1)
r0_expected = [2]
r1_expected = [0]
assert np.array_equal(r0_expected, r0.asnumpy())
assert np.array_equal(r1_expected, r1.asnumpy())

@ -102,15 +102,7 @@ class ConcatNet(nn.Cell):
return self.op((x1, x2))
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_concat():
"""
Feature: Test Concat and its backward.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
def dynamic_concat_run(is_grad):
axis = 1
dtype = np.float32
data_list = []
@ -118,15 +110,151 @@ def test_dynamic_concat():
data = []
data.append(np.random.rand(i, 16).astype(dtype))
data.append(np.random.rand(i, 32).astype(dtype))
data.append(np.random.rand(i, 48).astype(dtype))
if is_grad:
data.append(np.random.rand(i, 48).astype(dtype))
data_list.append(tuple(data))
column_names = get_columns(len(data_list[0]))
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
dataset.set_dynamic_columns(
columns={column_names[0]: [None, 16], column_names[1]: [None, 32], column_names[2]: [None, 48]})
dynamic_columns = {column_names[0]: [
None, 16], column_names[1]: [None, 32]}
if is_grad:
dynamic_columns[column_names[-1]] = [None, 48]
dataset.set_dynamic_columns(columns=dynamic_columns)
net = ConcatNet(axis)
if is_grad:
net = GradNetWrtX(net)
output = dynamic_shape_sink_process(net, dataset)
output_cmp = fixed_shape_process(net, dataset)
assert compare(output, output_cmp)
net = GradNetWrtX(ConcatNet(axis))
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_concat_forward():
"""
Feature: Test Concat.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
dynamic_concat_run(False)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_concat_backward():
"""
Feature: Test backward of Concat.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
dynamic_concat_run(True)
class BatchNormNet(nn.Cell):
def __init__(self, c):
super(BatchNormNet, self).__init__()
self.bn = nn.BatchNorm1d(c)
def construct(self, input_data):
x = self.bn(input_data)
return x
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_batchnorm():
"""
Feature: Test BatchNorm and its backward.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
c = 256
dtype = np.float32
data_list = []
for i in [2, 64]:
data = []
data.append(np.random.rand(i, c).astype(dtype))
data.append(np.random.rand(i, c).astype(dtype))
data_list.append(tuple(data))
column_names = get_columns(len(data_list[0]))
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
dynamic_columns = {column_names[0]: [None, c], column_names[1]: [None, c]}
dataset.set_dynamic_columns(columns=dynamic_columns)
net = GradNetWrtX(BatchNormNet(c))
gradients = dynamic_shape_sink_process(net, dataset)
gradients_cmp = fixed_shape_process(net, dataset)
assert compare(gradients, gradients_cmp)
class ReshapeNet(nn.Cell):
def construct(self, x, y):
shape_of_y = ops.DynamicShape()(y)
return ops.Reshape()(x, shape_of_y)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_reshape():
"""
Feature: Test Reshape.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
dtype = np.float32
data_list = []
for i in [2, 96]:
data = []
data.append(np.random.rand(i, 64, 1).astype(dtype))
data.append(np.random.rand(i, 64).astype(dtype))
data_list.append(tuple(data))
column_names = get_columns(len(data_list[0]))
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
dynamic_columns = {column_names[0]: [
None, 64, 1], column_names[1]: [None, 64]}
dataset.set_dynamic_columns(columns=dynamic_columns)
net = ReshapeNet()
output = dynamic_shape_sink_process(net, dataset)
output_cmp = fixed_shape_process(net, dataset)
assert compare(output, output_cmp)
class ReduceSumNet(nn.Cell):
def __init__(self):
super(ReduceSumNet, self).__init__()
self.reduce = ops.ReduceSum()
def construct(self, x, y):
return self.reduce(x, y)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_reduce_sum():
"""
Feature: Test ReduceSum.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with the result of the numpy computation.
"""
dtype = np.float32
data_list = []
for i in [2, 96]:
data = []
data.append(np.random.rand(i, 256).astype(dtype))
data.append(np.array([1], dtype=np.int64))
data_list.append(tuple(data))
column_names = get_columns(len(data_list[0]))
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
dynamic_columns = {column_names[0]: [None, 256]}
dataset.set_dynamic_columns(columns=dynamic_columns)
net = ReduceSumNet()
output = dynamic_shape_sink_process(net, dataset)
# Currently, a dynamic (tensor) axis for the ReduceSum operator is not supported
# under fixed shape, so numpy is used for comparison
inputs = data_list[0]
output_cmp = np.sum(inputs[0], inputs[1][0])
assert np.allclose(output.asnumpy(), output_cmp, rtol=1.0e-4, atol=1.0e-4)