From 00e78bf6c410dafbbbd62eb9682c37e8db752028 Mon Sep 17 00:00:00 2001
From: wilfChen
Date: Sat, 9 May 2020 10:38:38 +0800
Subject: [PATCH] gpu: support MinimumGrad & MaximumGrad kernels

---
 .../gpu/cuda_impl/broadcast_grad_impl.cu      | 116 +++++++++
 .../gpu/cuda_impl/broadcast_grad_impl.cuh     |  38 +++
 .../gpu/math/broadcast_grad_gpu_kernel.cc     |  38 +++
 .../gpu/math/broadcast_grad_gpu_kernel.h      | 149 ++++++++++++
 tests/st/ops/gpu/test_maximum_op.py           | 169 +++++++++++++-
 tests/st/ops/gpu/test_minimum_op.py           | 220 ++++++++++++++++++
 6 files changed, 729 insertions(+), 1 deletion(-)
 create mode 100644 mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cu
 create mode 100644 mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cuh
 create mode 100644 mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.cc
 create mode 100644 mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h
 create mode 100644 tests/st/ops/gpu/test_minimum_op.py

diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cu
new file mode 100644
index 00000000000..ce8617283c6
--- /dev/null
+++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cu
@@ -0,0 +1,116 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/gpu/cuda_impl/broadcast_grad_impl.cuh"
+#include "device/gpu/cuda_common.h"
+
+template <typename T>
+struct MinimumGradFunc {
+  __device__ __forceinline__ void operator()(const T &x1, const T &x2, const T &dy, T *dx1, T *dx2) {
+    if (x1 < x2) {
+      atomicAdd(dx1, dy);
+    } else {
+      atomicAdd(dx2, dy);
+    }
+  }
+};
+
+template <typename T>
+struct MaximumGradFunc {
+  __device__ __forceinline__ void operator()(const T &x1, const T &x2, const T &dy, T *dx1, T *dx2) {
+    if (x1 > x2) {
+      atomicAdd(dx1, dy);
+    } else {
+      atomicAdd(dx2, dy);
+    }
+  }
+};
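+
+// Gradient routing: each dy element is credited to exactly one input; ties
+// (x1 == x2) fall through to dx2. atomicAdd is required because, under
+// broadcasting, many dy positions can map to the same input element.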
+
+__device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
+
+template <typename T, typename Func>
+__device__ __forceinline__ void BroadcastGradOperator(const int &l0, const int &l1, const int &l2, const int &l3,
+                                                      const int &r0, const int &r1, const int &r2, const int &r3,
+                                                      const int &d0, const int &d1, const int &d2, const int &d3,
+                                                      const T *x1, const T *x2, const T *dy, T *dx1, T *dx2) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3; pos += blockDim.x * gridDim.x) {
+    int i = pos / (d1 * d2 * d3) % d0;
+    int j = pos / (d2 * d3) % d1;
+    int k = pos / d3 % d2;
+    int l = pos % d3;
+
+    int l_index = Index(i, l0) * l1 * l2 * l3 + Index(j, l1) * l2 * l3 + Index(k, l2) * l3 + Index(l, l3);
+    int r_index = Index(i, r0) * r1 * r2 * r3 + Index(j, r1) * r2 * r3 + Index(k, r2) * r3 + Index(l, r3);
+    Func()(x1[l_index], x2[r_index], dy[pos], dx1 + l_index, dx2 + r_index);
+  }
+}
+
+template <typename T>
+__global__ void BroadcastGradKernel(const int l0, const int l1, const int l2, const int l3, const int r0, const int r1,
+                                    const int r2, const int r3, const int d0, const int d1, const int d2, const int d3,
+                                    enum BroadcastGradOpType op, const T *x1, const T *x2, const T *dy, T *dx1,
+                                    T *dx2) {
+  switch (op) {
+    case BROADCAST_GRAD_TYPE_MINIMUM:
+      return BroadcastGradOperator<T, MinimumGradFunc<T>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, x1, x2, dy,
+                                                          dx1, dx2);
+    case BROADCAST_GRAD_TYPE_MAXIMUM:
+      return BroadcastGradOperator<T, MaximumGradFunc<T>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, x1, x2, dy,
+                                                          dx1, dx2);
+  }
+}
+
+template <typename T>
+void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
+                   const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
+                   enum BroadcastGradOpType op, const T *x1, const T *x2, const T *dy, T *dx1, T *dx2,
+                   cudaStream_t stream) {
+  int size = d0 * d1 * d2 * d3;
+  BroadcastGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, op,
+                                                                    x1, x2, dy, dx1, dx2);
+}
+
+template <typename T, typename Func>
+__device__ __forceinline__ void NoBroadcastOperator(const int &nums, const T *x1, const T *x2, const T *dy, T *dx1,
+                                                    T *dx2) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < nums; pos += blockDim.x * gridDim.x) {
+    Func()(x1[pos], x2[pos], dy[pos], dx1 + pos, dx2 + pos);
+  }
+}
+
+template <typename T>
+__global__ void NoBroadcastGradKernel(const int nums, enum BroadcastGradOpType op, const T *x1, const T *x2,
+                                      const T *dy, T *dx1, T *dx2) {
+  switch (op) {
+    case BROADCAST_GRAD_TYPE_MINIMUM:
+      return NoBroadcastOperator<T, MinimumGradFunc<T>>(nums, x1, x2, dy, dx1, dx2);
+    case BROADCAST_GRAD_TYPE_MAXIMUM:
+      return NoBroadcastOperator<T, MaximumGradFunc<T>>(nums, x1, x2, dy, dx1, dx2);
+  }
+}
+
+template <typename T>
+void NoBroadcastGrad(const int &nums, enum BroadcastGradOpType op, const T *x1, const T *x2, const T *dy, T *dx1,
+                     T *dx2, cudaStream_t stream) {
+  NoBroadcastGradKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, op, x1, x2, dy, dx1, dx2);
+}
+
+template void NoBroadcastGrad(const int &nums, enum BroadcastGradOpType op, const float *x1, const float *x2,
+                              const float *dy, float *dx1, float *dx2, cudaStream_t stream);
+template void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
+                            const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
+                            enum BroadcastGradOpType op, const float *x1, const float *x2, const float *dy, float *dx1,
+                            float *dx2, cudaStream_t stream);
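For intuition, a minimal NumPy sketch (an editorial aside, not part of the patch) of what the broadcast path above computes: every dy element is routed to exactly one input, and contributions landing on the same broadcast input element are accumulated, which is what atomicAdd does on device. The explicit index collapsing mirrors Index(i, dim) in the kernel.

import numpy as np

def broadcast_grad_reference(x1, x2, dy, op="minimum"):
    """CPU reference for Minimum/MaximumGrad under NumPy broadcasting."""
    b1 = np.broadcast_to(x1, dy.shape)
    b2 = np.broadcast_to(x2, dy.shape)
    win1 = (b1 < b2) if op == "minimum" else (b1 > b2)  # ties go to dx2
    dx1, dx2 = np.zeros_like(x1), np.zeros_like(x2)
    for pos in np.ndindex(*dy.shape):
        # collapse the index along size-1 dims, like Index(i, dim) above
        i1 = tuple(0 if d == 1 else p for p, d in zip(pos[len(pos) - x1.ndim:], x1.shape))
        i2 = tuple(0 if d == 1 else p for p, d in zip(pos[len(pos) - x2.ndim:], x2.shape))
        if win1[pos]:
            dx1[i1] += dy[pos]
        else:
            dx2[i2] += dy[pos]
    return dx1, dx2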
diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cuh
new file mode 100644
index 00000000000..d154eddd4cc
--- /dev/null
+++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_grad_impl.cuh
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
+
+#include "device/gpu/cuda_common.h"
+
+enum BroadcastGradOpType {
+  BROADCAST_GRAD_TYPE_MAXIMUM = 0,
+  BROADCAST_GRAD_TYPE_MINIMUM = 1,
+  BROADCAST_GRAD_TYPE_INVALID = 0xffffffff,
+};
+
+template <typename T>
+void BroadcastGrad(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
+                   const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
+                   enum BroadcastGradOpType op, const T *x1, const T *x2, const T *dy, T *dx1, T *dx2,
+                   cudaStream_t stream);
+
+template <typename T>
+void NoBroadcastGrad(const int &nums, enum BroadcastGradOpType op, const T *x1, const T *x2, const T *dy, T *dx1,
+                     T *dx2, cudaStream_t stream);
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_GRAD_H_
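The registrations in the next file bind these kernels to the MinimumGrad and MaximumGrad primitives: three float32 inputs (x1, x2, dout) and two float32 outputs (dx1, dx2). A minimal sketch of driving one primitive directly, assuming the raw grad ops are exposed under mindspore.ops.operations._grad_ops (an assumption; the tests in this patch go through C.GradOperation instead):

import numpy as np
import mindspore.context as context
from mindspore.common.tensor import Tensor
from mindspore.ops.operations import _grad_ops as G  # assumed module path

context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')

x1 = Tensor(np.array([1.0, 5.0, 3.0]).astype(np.float32))
x2 = Tensor(np.array([4.0, 2.0, 3.0]).astype(np.float32))
dout = Tensor(np.array([0.1, 0.2, 0.3]).astype(np.float32))

# Three inputs in, two gradients out, matching the KernelAttr below.
dx1, dx2 = G.MinimumGrad()(x1, x2, dout)
print(dx1.asnumpy())  # [0.1 0.  0. ]  -- x1 strictly smaller only at index 0
print(dx2.asnumpy())  # [0.  0.2 0.3]  -- index 2 is a tie and routes to dx2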
diff --git a/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.cc
new file mode 100644
index 00000000000..edc51d4ffd7
--- /dev/null
+++ b/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.cc
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/gpu/math/broadcast_grad_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_ONE(MinimumGrad,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      BroadcastOpGradGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(MaximumGrad,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      BroadcastOpGradGpuKernel, float)
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h
new file mode 100644
index 00000000000..c3e974b4d6b
--- /dev/null
+++ b/mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h
@@ -0,0 +1,149 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_GPU_KERNEL_H_
+
+#include <cuda_runtime_api.h>
+#include <vector>
+#include <string>
+#include <map>
+#include "kernel/gpu/gpu_kernel.h"
+#include "kernel/gpu/gpu_kernel_factory.h"
+#include "kernel/gpu/cuda_impl/broadcast_grad_impl.cuh"
+#include "kernel/gpu/kernel_constants.h"
+namespace mindspore {
+namespace kernel {
+
+template <typename T>
+class BroadcastOpGradGpuKernel : public GpuKernel {
+ public:
+  BroadcastOpGradGpuKernel()
+      : op_type_(BROADCAST_GRAD_TYPE_INVALID), need_broadcast_(false), input1_num_(1), input2_num_(1), output_num_(1) {}
+  ~BroadcastOpGradGpuKernel() override = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &outputs, uintptr_t stream_ptr) override {
+    T *x1 = GetDeviceAddress<T>(inputs, 0);
+    T *x2 = GetDeviceAddress<T>(inputs, 1);
+    T *dy = GetDeviceAddress<T>(inputs, 2);
+    T *dx1 = GetDeviceAddress<T>(outputs, 0);
+    T *dx2 = GetDeviceAddress<T>(outputs, 1);
+
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(dx1, 0, outputs[0]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                               "cudaMemsetAsync failed");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemsetAsync(dx2, 0, outputs[1]->size, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                               "cudaMemsetAsync failed");
+    if (need_broadcast_) {
+      BroadcastGrad(x1_shape_[0], x1_shape_[1], x1_shape_[2], x1_shape_[3], x2_shape_[0], x2_shape_[1], x2_shape_[2],
+                    x2_shape_[3], dy_shape_[0], dy_shape_[1], dy_shape_[2], dy_shape_[3], op_type_, x1, x2, dy, dx1,
+                    dx2, reinterpret_cast<cudaStream_t>(stream_ptr));
+    } else {
+      NoBroadcastGrad(output_num_, op_type_, x1, x2, dy, dx1, dx2, reinterpret_cast<cudaStream_t>(stream_ptr));
+    }
+
+    return true;
+  }
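+
+  // Init() right-aligns each input shape against dy's shape: with dy of
+  // shape (2, 2, 3) and x2 of shape (2, 3), offset = 1, so x2 fills slots
+  // 1..2 of x2_shape_ while every unused slot keeps its default of 1.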
+  bool Init(const CNodePtr &kernel_node) override {
+    GetOpType(kernel_node);
+    auto shape1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto shape2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
+    auto shape3 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
+    need_broadcast_ = IsBroadcast(shape1, shape2);
+    if (need_broadcast_ && shape1.size() > 4) {
+      MS_LOG(EXCEPTION) << "Broadcast operation does not support dims greater than 4";
+    }
+
+    for (size_t i = 0; i < shape3.size(); i++) {
+      dy_shape_[i] = shape3[i];
+      output_num_ *= shape3[i];
+    }
+    int offset = shape3.size() - shape1.size();
+    for (size_t i = 0; i < shape1.size(); i++) {
+      x1_shape_[i + offset] = shape1[i];
+      input1_num_ *= shape1[i];
+    }
+    offset = shape3.size() - shape2.size();
+    for (size_t i = 0; i < shape2.size(); i++) {
+      x2_shape_[i + offset] = shape2[i];
+      input2_num_ *= shape2[i];
+    }
+
+    InitSizeLists();
+    return true;
+  }
+
+ protected:
+  void InitResource() override { return; }
+  void InitSizeLists() override {
+    input_size_list_.push_back(input1_num_ * sizeof(T));
+    input_size_list_.push_back(input2_num_ * sizeof(T));
+    input_size_list_.push_back(output_num_ * sizeof(T));
+    output_size_list_.push_back(input1_num_ * sizeof(T));
+    output_size_list_.push_back(input2_num_ * sizeof(T));
+  }
+
+ private:
+  void GetOpType(const CNodePtr &kernel_node) {
+    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
+
+    static std::map<std::string, BroadcastGradOpType> kBroadcastTypeMap = {
+      {"MaximumGrad", BROADCAST_GRAD_TYPE_MAXIMUM},
+      {"MinimumGrad", BROADCAST_GRAD_TYPE_MINIMUM},
+    };
+
+    auto iter = kBroadcastTypeMap.find(kernel_name);
+    if (iter == kBroadcastTypeMap.end()) {
+      MS_LOG(EXCEPTION) << "Operation " << kernel_name << " is not supported.";
+    } else {
+      op_type_ = iter->second;
+    }
+  }
+
+  bool IsBroadcast(const std::vector<size_t> &lhs, const std::vector<size_t> &rhs) {
+    if (lhs.size() != rhs.size()) {
+      return true;
+    }
+    for (size_t i = 0; i < lhs.size(); i++) {
+      if (lhs[i] != rhs[i]) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  BroadcastGradOpType op_type_;
+  bool need_broadcast_;
+  int input1_num_;
+  int input2_num_;
+  int output_num_;
+  int x1_shape_[4] = {1, 1, 1, 1};
+  int x2_shape_[4] = {1, 1, 1, 1};
+  int dy_shape_[4] = {1, 1, 1, 1};
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_GPU_KERNEL_H_
diff --git a/tests/st/ops/gpu/test_maximum_op.py b/tests/st/ops/gpu/test_maximum_op.py
index 3193dafa61f..edfb93e926f 100644
--- a/tests/st/ops/gpu/test_maximum_op.py
+++ b/tests/st/ops/gpu/test_maximum_op.py
@@ -15,6 +15,7 @@
 import pytest
 from mindspore.ops import operations as P
+from mindspore.ops import composite as C
 from mindspore.nn import Cell
 from mindspore.common.tensor import Tensor
 import mindspore.context as context
@@ -29,11 +30,20 @@ class Net(Cell):
     def construct(self, x, y):
         return self.max(x, y)
 
+class Grad(Cell):
+    def __init__(self, network):
+        super(Grad, self).__init__()
+        self.grad = C.GradOperation(name="get_all", get_all=True, sens_param=True)
+        self.network = network
+
+    def construct(self, x1, x2, sens):
+        gout = self.grad(self.network)(x1, x2, sens)
+        return gout
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
-def test_max():
+def test_maximum():
     x = Tensor(np.array([[1, 2, 3]]).astype(np.float32))
     y = Tensor(np.array([[2]]).astype(np.float32))
     expect = [[2, 2, 3]]
@@ -53,3 +63,160 @@ def test_max():
     assert np.all(diff < error)
     assert np.all(-diff 
< error) + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_broadcast(): + context.set_context(mode=context.GRAPH_MODE, save_graphs=True, device_target='GPU') + + x1_np = np.array([[[[0.659578 ], + [0.49113268], + [0.75909054], + [0.71681815], + [0.30421826]]], + [[[0.30322495], + [0.02858258], + [0.06398096], + [0.09519596], + [0.12498625]]], + [[[0.7347768 ], + [0.166469 ], + [0.328553 ], + [0.54908437], + [0.23673844]]]]).astype(np.float32) + x2_np = np.array([[[[0.9154968, 0.29014662, 0.6492294, 0.39918253, 0.1648203, 0.00861965]], + [[0.996885, 0.24152198, 0.3601213, 0.51664376, 0.7933056, 0.84706444]], + [[0.75606346, 0.974512, 0.3939527, 0.69697475, 0.83400667, 0.6348955 ]], + [[0.68492866, 0.24609096, 0.4924665, 0.22500521, 0.38474053, 0.5586104 ]]]]).astype(np.float32) + dy_np = np.array([[[[0.42891738, 0.03434946, 0.06192983, 0.21216309, 0.37450036, 0.6619524 ], + [0.8583447, 0.5765161, 0.1468952, 0.9975385, 0.6908136, 0.4903796 ], + [0.68952006, 0.39336833, 0.9049695, 0.66886294, 0.2338471, 0.913618 ], + [0.0428149, 0.6243054, 0.8519898, 0.12088962, 0.9735885, 0.45661286], + [0.41563734, 0.41607043, 0.4754915, 0.32207987, 0.33823156, 0.47422352]], + + [[0.64478457, 0.22430937, 0.7682554, 0.46082005, 0.8938723, 0.20490853], + [0.44393885, 0.08278944, 0.4734108, 0.5543551, 0.39428464, 0.44424313], + [0.12612297, 0.76566416, 0.71133816, 0.81280327, 0.20583127, 0.54058075], + [0.41341263, 0.48118508, 0.00401995, 0.37259838, 0.05435474, 0.5240658 ], + [0.4081956, 0.48718935, 0.9132831, 0.67969185, 0.0119757, 0.8328054 ]], + + [[0.91695577, 0.95370644, 0.263782, 0.7477626, 0.6448147, 0.8080634 ], + [0.15576603, 0.9104615, 0.3778708, 0.6912833, 0.2092224, 0.67462957], + [0.7087075, 0.7888326, 0.4672294, 0.98221505, 0.25210258, 0.98920417], + [0.7466197, 0.22702982, 0.01991269, 0.6846591, 0.7515228, 0.5890395 ], + [0.04531088, 0.21740614, 0.8406235, 0.36480767, 0.37733936, 0.02914464]], + + [[0.33069974, 0.5497569, 0.9896345, 0.4167176, 0.78057563, 0.04659131], + [0.7747768, 0.21427679, 0.29893255, 0.7706969, 0.9755185, 0.42388415], + [0.3910244, 0.39381978, 0.37065396, 0.15558061, 0.05012341, 0.15870963], + [0.17791101, 0.47219893, 0.13899496, 0.32323205, 0.3628809, 0.02580585], + [0.30274773, 0.62890774, 0.11024303, 0.6980051, 0.35346958, 0.062852 ]]], + + + [[[0.6925081, 0.74668753, 0.80145043, 0.06598313, 0.665123, 0.15073007], + [0.11784806, 0.6385372, 0.5228278, 0.5349848, 0.84671104, 0.8096436 ], + [0.09516156, 0.63298017, 0.52382874, 0.36734378, 0.66497755, 0.6019127 ], + [0.46438488, 0.0194377, 0.9388292, 0.7286089, 0.29178405, 0.11872514], + [0.22101837, 0.6164887, 0.6139798, 0.11711904, 0.6227745, 0.09701069]], + + [[0.80480653, 0.90034056, 0.8633447, 0.97415197, 0.08309154, 0.8446033 ], + [0.9473769, 0.791024, 0.26339203, 0.01155075, 0.2673186, 0.7116369 ], + [0.9687511, 0.24281934, 0.37777108, 0.09802654, 0.2421312, 0.87095344], + [0.6311381, 0.23368953, 0.0998995, 0.4364419, 0.9187446, 0.5043872 ], + [0.35226053, 0.09357589, 0.41317305, 0.85930043, 0.16249318, 0.5478765 ]], + + [[0.14338651, 0.24859418, 0.4246941, 0.73034066, 0.47172204, 0.8717199 ], + [0.05415315, 0.78556925, 0.99214983, 0.7415298, 0.673708, 0.87817156], + [0.616975, 0.42843062, 0.05179814, 0.1566958, 0.04536059, 0.70166487], + [0.15493333, 0.776598, 0.4361967, 0.40253627, 0.89210516, 0.8144414 ], + [0.04816005, 0.29696834, 0.4586605, 0.3419852, 0.5595613, 0.74093205]], + + [[0.1388035, 0.9168704, 0.64287645, 0.83864623, 0.48026922, 0.78323376], + 
[0.12724937, 0.83034366, 0.42557436, 0.50578654, 0.25630295, 0.15349793], + [0.27256685, 0.04547984, 0.5385756, 0.39270344, 0.7661698, 0.23722854], + [0.24620503, 0.25431684, 0.71564585, 0.01161419, 0.846467, 0.7043044 ], + [0.63272387, 0.11857849, 0.3772076, 0.16758402, 0.46743023, 0.05919575]]], + + + [[[0.18827082, 0.8912264, 0.6841404, 0.74436826, 0.9582085, 0.1083683 ], + [0.60695344, 0.09742349, 0.25074378, 0.87940735, 0.21116392, 0.39418384], + [0.744686, 0.35679692, 0.01308284, 0.45166633, 0.68166, 0.8634658 ], + [0.7331758, 0.21113694, 0.3935488, 0.87934476, 0.70728546, 0.09309767], + [0.12128611, 0.93696386, 0.81177396, 0.85402405, 0.5827289, 0.9776509 ]], + + [[0.54069614, 0.66651285, 0.10646132, 0.17342485, 0.88795924, 0.03551182], + [0.25531697, 0.87946486, 0.74267226, 0.89230734, 0.95171434, 0.94697934], + [0.3708397, 0.507355, 0.97099817, 0.4918163, 0.17212386, 0.5008048 ], + [0.62530744, 0.25210327, 0.73966664, 0.71555346, 0.82484317, 0.6094874 ], + [0.4589691, 0.1386695, 0.27448782, 0.20373994, 0.27805242, 0.23292768]], + + [[0.7414099, 0.2270226, 0.90431255, 0.47035843, 0.9581062, 0.5359226 ], + [0.79603523, 0.45549425, 0.80858237, 0.7705133, 0.017761, 0.98001194], + [0.06013146, 0.99240226, 0.33515573, 0.04110833, 0.41470334, 0.7130743 ], + [0.5687417, 0.5788611, 0.00722461, 0.6603336, 0.3420471, 0.75181854], + [0.4699261, 0.51390815, 0.343182, 0.81498754, 0.8942413, 0.46532857]], + + [[0.4589523, 0.5534698, 0.2825786, 0.8205943, 0.78258514, 0.43154418], + [0.27020997, 0.01667354, 0.60871965, 0.90670526, 0.3208025, 0.96995634], + [0.85337156, 0.9711295, 0.1381724, 0.53670496, 0.7347996, 0.73380876], + [0.6137464, 0.54751194, 0.9037335, 0.23134394, 0.61411524, 0.26583543], + [0.70770144, 0.01813207, 0.24718016, 0.70329237, 0.7062925, 0.14399007]]]]).astype(np.float32) + + expect_dx1 = np.array([[[[ 6.6534014 ], + [ 5.649811 ], + [10.071739 ], + [ 6.6798244 ], + [ 3.0426278 ]]], + [[[ 4.2183976 ], + [ 0.8096436 ], + [ 0.6019127 ], + [ 0.11872514], + [ 0.09701069]]], + [[[ 9.573029 ], + [ 0.60534775], + [ 3.917112 ], + [ 5.9021177 ], + [ 2.263672 ]]]]).astype(np.float32) + + expect_dx2 = np.array([[[[6.4205275, 2.941831 , 5.492452 , 4.3212175, 2.4262471, 0. ]], + [[7.991917 , 2.3792431, 4.9190216, 5.2013817, 6.348791 , 8.351772 ]], + [[5.518505 , 8.401285 , 4.691043 , 6.463884 , 7.504318 , 7.620938 ]], + [[5.2708025, 1.2835244, 4.1031275, 1.9843934, 4.9320035, 4.537787 ]]]]).astype(np.float32) + + net = Grad(Net()) + output_ms = net(Tensor(x1_np), Tensor(x2_np), Tensor(dy_np)) + assert np.allclose(output_ms[0].asnumpy(), expect_dx1) + assert np.allclose(output_ms[1].asnumpy(), expect_dx2) + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_broadcast_diff_dims(): + context.set_context(mode=context.GRAPH_MODE, save_graphs=True, device_target='GPU') + + x1_np = np.array([[[0.275478, 0.48933202, 0.71846116], + [0.9803821, 0.57205725, 0.28511533]], + [[0.61111903, 0.9671023, 0.70624334], + [0.53730786, 0.90413177, 0.94349676]]]).astype(np.float32) + + x2_np = np.array([[0.01045662, 0.82126397, 0.6365063 ], + [0.9900942, 0.6584232, 0.98537433]]).astype(np.float32) + + dy_np = np.array([[[0.3897645, 0.61152864, 0.33675498], + [0.5303635, 0.84893036, 0.4959739 ]], + [[0.5391046, 0.8443047, 0.4174708 ], + [0.57513475, 0.9225578, 0.46760973]]]).astype(np.float32) + + expect_dx1 = np.array([[[0.3897645 , 0. , 0.33675498], + [0. , 0. , 0. ]], + [[0.5391046 , 0.8443047 , 0.4174708 ], + [0. , 0.9225578 , 0. 
]]]).astype(np.float32) + + expect_dx2 = np.array([[0. , 0.61152864, 0. ], + [1.1054983 , 0.84893036, 0.96358365]]).astype(np.float32) + + net = Grad(Net()) + output_ms = net(Tensor(x1_np), Tensor(x2_np), Tensor(dy_np)) + assert np.allclose(output_ms[0].asnumpy(), expect_dx1) + assert np.allclose(output_ms[1].asnumpy(), expect_dx2) diff --git a/tests/st/ops/gpu/test_minimum_op.py b/tests/st/ops/gpu/test_minimum_op.py new file mode 100644 index 00000000000..4616f80c192 --- /dev/null +++ b/tests/st/ops/gpu/test_minimum_op.py @@ -0,0 +1,220 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import pytest +from mindspore.ops import operations as P +from mindspore.ops import composite as C +from mindspore.nn import Cell +from mindspore.common.tensor import Tensor +import mindspore.common.dtype as mstype +import mindspore.context as context +import numpy as np + +class MinimumNet(Cell): + def __init__(self): + super(MinimumNet, self).__init__() + self.min = P.Minimum() + + def construct(self, x1, x2): + x = self.min(x1, x2) + return x + + +class Grad(Cell): + def __init__(self, network): + super(Grad, self).__init__() + self.grad = C.GradOperation(name="get_all", get_all=True, sens_param=True) + self.network = network + + def construct(self, x1, x2, sens): + gout = self.grad(self.network)(x1, x2, sens) + return gout + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_nobroadcast(): + context.set_context(mode=context.GRAPH_MODE, save_graphs=True, device_target='GPU') + + x1_np = np.random.rand(3, 4).astype(np.float32) + x2_np = np.random.rand(3, 4).astype(np.float32) + dy_np = np.random.rand(3, 4).astype(np.float32) + + net = Grad(MinimumNet()) + output_ms = net(Tensor(x1_np), Tensor(x2_np), Tensor(dy_np)) + output0_np = np.where(x1_np < x2_np, dy_np, 0) + output1_np = np.where(x1_np < x2_np, 0, dy_np) + assert np.allclose(output_ms[0].asnumpy(), output0_np) + assert np.allclose(output_ms[1].asnumpy(), output1_np) + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_broadcast(): + context.set_context(mode=context.GRAPH_MODE, save_graphs=True, device_target='GPU') + + x1_np = np.array([[[[0.659578 ], + [0.49113268], + [0.75909054], + [0.71681815], + [0.30421826]]], + [[[0.30322495], + [0.02858258], + [0.06398096], + [0.09519596], + [0.12498625]]], + [[[0.7347768 ], + [0.166469 ], + [0.328553 ], + [0.54908437], + [0.23673844]]]]).astype(np.float32) + x2_np = np.array([[[[0.9154968, 0.29014662, 0.6492294, 0.39918253, 0.1648203, 0.00861965]], + [[0.996885, 0.24152198, 0.3601213, 0.51664376, 0.7933056, 0.84706444]], + [[0.75606346, 0.974512, 0.3939527, 0.69697475, 0.83400667, 0.6348955 ]], + [[0.68492866, 0.24609096, 0.4924665, 0.22500521, 0.38474053, 0.5586104 ]]]]).astype(np.float32) + dy_np = np.array([[[[0.42891738, 0.03434946, 0.06192983, 0.21216309, 0.37450036, 0.6619524 ], + 
[0.8583447, 0.5765161, 0.1468952, 0.9975385, 0.6908136, 0.4903796 ], + [0.68952006, 0.39336833, 0.9049695, 0.66886294, 0.2338471, 0.913618 ], + [0.0428149, 0.6243054, 0.8519898, 0.12088962, 0.9735885, 0.45661286], + [0.41563734, 0.41607043, 0.4754915, 0.32207987, 0.33823156, 0.47422352]], + + [[0.64478457, 0.22430937, 0.7682554, 0.46082005, 0.8938723, 0.20490853], + [0.44393885, 0.08278944, 0.4734108, 0.5543551, 0.39428464, 0.44424313], + [0.12612297, 0.76566416, 0.71133816, 0.81280327, 0.20583127, 0.54058075], + [0.41341263, 0.48118508, 0.00401995, 0.37259838, 0.05435474, 0.5240658 ], + [0.4081956, 0.48718935, 0.9132831, 0.67969185, 0.0119757, 0.8328054 ]], + + [[0.91695577, 0.95370644, 0.263782, 0.7477626, 0.6448147, 0.8080634 ], + [0.15576603, 0.9104615, 0.3778708, 0.6912833, 0.2092224, 0.67462957], + [0.7087075, 0.7888326, 0.4672294, 0.98221505, 0.25210258, 0.98920417], + [0.7466197, 0.22702982, 0.01991269, 0.6846591, 0.7515228, 0.5890395 ], + [0.04531088, 0.21740614, 0.8406235, 0.36480767, 0.37733936, 0.02914464]], + + [[0.33069974, 0.5497569, 0.9896345, 0.4167176, 0.78057563, 0.04659131], + [0.7747768, 0.21427679, 0.29893255, 0.7706969, 0.9755185, 0.42388415], + [0.3910244, 0.39381978, 0.37065396, 0.15558061, 0.05012341, 0.15870963], + [0.17791101, 0.47219893, 0.13899496, 0.32323205, 0.3628809, 0.02580585], + [0.30274773, 0.62890774, 0.11024303, 0.6980051, 0.35346958, 0.062852 ]]], + + + [[[0.6925081, 0.74668753, 0.80145043, 0.06598313, 0.665123, 0.15073007], + [0.11784806, 0.6385372, 0.5228278, 0.5349848, 0.84671104, 0.8096436 ], + [0.09516156, 0.63298017, 0.52382874, 0.36734378, 0.66497755, 0.6019127 ], + [0.46438488, 0.0194377, 0.9388292, 0.7286089, 0.29178405, 0.11872514], + [0.22101837, 0.6164887, 0.6139798, 0.11711904, 0.6227745, 0.09701069]], + + [[0.80480653, 0.90034056, 0.8633447, 0.97415197, 0.08309154, 0.8446033 ], + [0.9473769, 0.791024, 0.26339203, 0.01155075, 0.2673186, 0.7116369 ], + [0.9687511, 0.24281934, 0.37777108, 0.09802654, 0.2421312, 0.87095344], + [0.6311381, 0.23368953, 0.0998995, 0.4364419, 0.9187446, 0.5043872 ], + [0.35226053, 0.09357589, 0.41317305, 0.85930043, 0.16249318, 0.5478765 ]], + + [[0.14338651, 0.24859418, 0.4246941, 0.73034066, 0.47172204, 0.8717199 ], + [0.05415315, 0.78556925, 0.99214983, 0.7415298, 0.673708, 0.87817156], + [0.616975, 0.42843062, 0.05179814, 0.1566958, 0.04536059, 0.70166487], + [0.15493333, 0.776598, 0.4361967, 0.40253627, 0.89210516, 0.8144414 ], + [0.04816005, 0.29696834, 0.4586605, 0.3419852, 0.5595613, 0.74093205]], + + [[0.1388035, 0.9168704, 0.64287645, 0.83864623, 0.48026922, 0.78323376], + [0.12724937, 0.83034366, 0.42557436, 0.50578654, 0.25630295, 0.15349793], + [0.27256685, 0.04547984, 0.5385756, 0.39270344, 0.7661698, 0.23722854], + [0.24620503, 0.25431684, 0.71564585, 0.01161419, 0.846467, 0.7043044 ], + [0.63272387, 0.11857849, 0.3772076, 0.16758402, 0.46743023, 0.05919575]]], + + + [[[0.18827082, 0.8912264, 0.6841404, 0.74436826, 0.9582085, 0.1083683 ], + [0.60695344, 0.09742349, 0.25074378, 0.87940735, 0.21116392, 0.39418384], + [0.744686, 0.35679692, 0.01308284, 0.45166633, 0.68166, 0.8634658 ], + [0.7331758, 0.21113694, 0.3935488, 0.87934476, 0.70728546, 0.09309767], + [0.12128611, 0.93696386, 0.81177396, 0.85402405, 0.5827289, 0.9776509 ]], + + [[0.54069614, 0.66651285, 0.10646132, 0.17342485, 0.88795924, 0.03551182], + [0.25531697, 0.87946486, 0.74267226, 0.89230734, 0.95171434, 0.94697934], + [0.3708397, 0.507355, 0.97099817, 0.4918163, 0.17212386, 0.5008048 ], + [0.62530744, 0.25210327, 0.73966664, 
0.71555346, 0.82484317, 0.6094874 ], + [0.4589691, 0.1386695, 0.27448782, 0.20373994, 0.27805242, 0.23292768]], + + [[0.7414099, 0.2270226, 0.90431255, 0.47035843, 0.9581062, 0.5359226 ], + [0.79603523, 0.45549425, 0.80858237, 0.7705133, 0.017761, 0.98001194], + [0.06013146, 0.99240226, 0.33515573, 0.04110833, 0.41470334, 0.7130743 ], + [0.5687417, 0.5788611, 0.00722461, 0.6603336, 0.3420471, 0.75181854], + [0.4699261, 0.51390815, 0.343182, 0.81498754, 0.8942413, 0.46532857]], + + [[0.4589523, 0.5534698, 0.2825786, 0.8205943, 0.78258514, 0.43154418], + [0.27020997, 0.01667354, 0.60871965, 0.90670526, 0.3208025, 0.96995634], + [0.85337156, 0.9711295, 0.1381724, 0.53670496, 0.7347996, 0.73380876], + [0.6137464, 0.54751194, 0.9037335, 0.23134394, 0.61411524, 0.26583543], + [0.70770144, 0.01813207, 0.24718016, 0.70329237, 0.7062925, 0.14399007]]]]).astype(np.float32) + + expect_dx1 = np.array([[[[ 5.7664223], + [ 6.981018 ], + [ 2.6029902], + [ 2.7598202], + [ 6.763105 ]]], + [[[10.06558 ], + [12.077246 ], + [ 9.338394 ], + [11.52271 ], + [ 8.889048 ]]], + [[[ 3.5789769], + [13.424448 ], + [ 8.732746 ], + [ 6.9677467], + [ 9.635765 ]]]]).astype(np.float32) + + expect_dx2 = np.array([[[[0. , 4.250458 , 2.5030296 , 3.623167 , 6.4171505 , 7.2115746 ]], + [[0. , 4.367449 , 2.803152 , 2.5352 , 0. , 0. ]], + [[0.7087075 , 0. , 2.040332 , 2.1372325 , 0. , 2.9222295 ]], + [[1.0278877 , 5.247942 , 2.6855955 , 5.494814 , 3.5657988 , 0.66265094]]]]).astype(np.float32) + + net = Grad(MinimumNet()) + output_ms = net(Tensor(x1_np), Tensor(x2_np), Tensor(dy_np)) + assert np.allclose(output_ms[0].asnumpy(), expect_dx1) + assert np.allclose(output_ms[1].asnumpy(), expect_dx2) + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_broadcast_diff_dims(): + context.set_context(mode=context.GRAPH_MODE, save_graphs=True, device_target='GPU') + + x1_np = np.array([[[0.275478, 0.48933202, 0.71846116], + [0.9803821, 0.57205725, 0.28511533]], + [[0.61111903, 0.9671023, 0.70624334], + [0.53730786, 0.90413177, 0.94349676]]]).astype(np.float32) + + x2_np = np.array([[0.01045662, 0.82126397, 0.6365063 ], + [0.9900942, 0.6584232, 0.98537433]]).astype(np.float32) + + dy_np = np.array([[[0.3897645, 0.61152864, 0.33675498], + [0.5303635, 0.84893036, 0.4959739 ]], + [[0.5391046, 0.8443047, 0.4174708 ], + [0.57513475, 0.9225578, 0.46760973]]]).astype(np.float32) + + expect_dx1 = np.array([[[0. , 0.61152864, 0. ], + [0.5303635 , 0.84893036, 0.4959739 ]], + [[0. , 0. , 0. ], + [0.57513475, 0. , 0.46760973]]]).astype(np.float32) + + expect_dx2 = np.array([[0.92886907, 0.8443047 , 0.7542258 ], + [0. , 0.9225578 , 0. ]]).astype(np.float32) + + net = Grad(MinimumNet()) + output_ms = net(Tensor(x1_np), Tensor(x2_np), Tensor(dy_np)) + assert np.allclose(output_ms[0].asnumpy(), expect_dx1) + assert np.allclose(output_ms[1].asnumpy(), expect_dx2)
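
A quick hand check of test_broadcast_diff_dims in test_maximum_op.py, showing where the kernels' accumulation comes from: x2[1][0] is broadcast over axis 0 and wins both Maximum comparisons, so the two matching dy entries must sum into dx2[1][0].

import numpy as np

# Values copied from test_broadcast_diff_dims in test_maximum_op.py.
x1_col = np.array([0.9803821, 0.53730786])   # x1[0][1][0], x1[1][1][0]
x2_val = 0.9900942                           # x2[1][0], broadcast over axis 0
dy_col = np.array([0.5303635, 0.57513475])   # dy[0][1][0], dy[1][1][0]

assert np.all(x2_val > x1_col)  # x2 wins both comparisons
print(dy_col.sum())             # 1.10549825, matching expect_dx2[1][0] (1.1054983 in float32)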