!4226 Add GPU div kernel for auto parallel

Merge pull request !4226 from ZPaC/master-add-gpu-div-kernel
mindspore-ci-bot 2020-08-11 15:06:06 +08:00 committed by Gitee
commit 2d1674475c
5 changed files with 150 additions and 21 deletions
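
In brief, the change wires a Div broadcast op through the whole GPU stack: a DivFunc functor in the CUDA broadcast implementation, a new BROADCAST_TYPE_DIV value in the op-type enum, Div kernel registrations for float32, float16, and int32, a "Div" entry in the kernel-name lookup map, and a new st test covering broadcast shapes, scalar inputs, and all three dtypes.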

View File

@@ -55,6 +55,11 @@ struct RealDivFunc {
   __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
 };
+template <typename T, typename S>
+struct DivFunc {
+  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
+};
 template <typename T, typename S>
 struct MulFunc {
   __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs * rhs); }
@@ -78,7 +83,7 @@ struct FloorDivFunc {
 template <>
 struct FloorDivFunc<half, half> {
   __device__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
-    return __float2half(floor(__half2float(lhs)/ __half2float(rhs)));
+    return __float2half(floor(__half2float(lhs) / __half2float(rhs)));
   }
 };
@@ -96,7 +101,6 @@ struct AbsGradFunc {
   }
 };
 template <>
 struct PowerFunc<half, bool> {
   // invalid branch
@@ -105,7 +109,6 @@ struct PowerFunc<half, bool> {
 __device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
 template <typename T, typename S, typename Func>
 __device__ __forceinline__ void BroadcastOperator(const int &l0, const int &l1, const int &l2, const int &l3,
                                                   const int &l4, const int &l5, const int &l6, const int &r0,
@@ -181,6 +184,9 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const
     case BROADCAST_TYPE_ABSGRAD:
       return BroadcastOperator<T, S, AbsGradFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
                                                         d2, d3, d4, d5, d6, input0, input1, output);
+    case BROADCAST_TYPE_DIV:
+      return BroadcastOperator<T, S, DivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2,
+                                                    d3, d4, d5, d6, input0, input1, output);
   }
 }
@@ -192,13 +198,11 @@ void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_sh
   for (auto d : output_shape) {
     size *= d;
   }
-  BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(lhs_shape[0], lhs_shape[1], lhs_shape[2], lhs_shape[3],
-                                                                lhs_shape[4], lhs_shape[5], lhs_shape[6], rhs_shape[0],
-                                                                rhs_shape[1], rhs_shape[2], rhs_shape[3], rhs_shape[4],
-                                                                rhs_shape[5], rhs_shape[6], output_shape[0],
-                                                                output_shape[1], output_shape[2], output_shape[3],
-                                                                output_shape[4], output_shape[5], output_shape[6],
-                                                                op, input0, input1, output);
+  BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(
+    lhs_shape[0], lhs_shape[1], lhs_shape[2], lhs_shape[3], lhs_shape[4], lhs_shape[5], lhs_shape[6], rhs_shape[0],
+    rhs_shape[1], rhs_shape[2], rhs_shape[3], rhs_shape[4], rhs_shape[5], rhs_shape[6], output_shape[0],
+    output_shape[1], output_shape[2], output_shape[3], output_shape[4], output_shape[5], output_shape[6], op, input0,
+    input1, output);
 }
 
 template <typename T, typename S, typename Func>
@@ -234,6 +238,8 @@ __global__ void NoBroadcastKernel(const int nums, enum BroadcastOpType op, const
       return NoBroadcastOperator<T, S, FloorDivFunc<T, S>>(nums, input0, input1, output);
     case BROADCAST_TYPE_ABSGRAD:
       return NoBroadcastOperator<T, S, AbsGradFunc<T, S>>(nums, input0, input1, output);
+    case BROADCAST_TYPE_DIV:
+      return NoBroadcastOperator<T, S, DivFunc<T, S>>(nums, input0, input1, output);
   }
 }
@@ -244,8 +250,8 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons
 }
 
 template <typename T>
-__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0,
-                                  const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) {
+__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, const int o1,
+                                  const int o2, const int o3, const T *input_addr, T *output_addr) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) {
     int i = pos / (o1 * o2 * o3) % o0;
     int j = pos / (o2 * o3) % o1;
@@ -262,7 +268,7 @@ void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, con
                  const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
   int nums = o0 * o1 * o2 * o3;
   BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
-                                                                 output_addr);
+                                                                  output_addr);
 }
 
 template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
@@ -291,10 +297,10 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
                           bool *output, cudaStream_t stream);
 template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
                           half *output, cudaStream_t stream);
-template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
-                          int *output, cudaStream_t stream);
-template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
-                          bool *output, cudaStream_t stream);
+template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
+                          cudaStream_t stream);
+template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, bool *output,
+                          cudaStream_t stream);
 template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
                           const int &o2, const int &o3, const float *input_addr, float *output_addr,
                           cudaStream_t stream);
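
For context, the diff above follows this file's functor-plus-kernel pattern: DivFunc supplies the per-element arithmetic, while BroadcastOperator and NoBroadcastOperator supply the indexing and the loop. Below is a minimal, self-contained sketch of the non-broadcast path; only DivFunc is taken verbatim from the diff, and ElementwiseKernel is a hypothetical, simplified stand-in for NoBroadcastKernel with the op-type switch removed.

// Minimal sketch, compiles standalone with nvcc. ElementwiseKernel is an
// invented stand-in for NoBroadcastKernel; DivFunc is copied from the diff.
#include <cstdio>
#include <cuda_runtime.h>

template <typename T, typename S>
struct DivFunc {
  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};

template <typename T, typename S, typename Func>
__global__ void ElementwiseKernel(const int nums, const T *input0, const T *input1, S *output) {
  // Grid-stride loop, the same pattern BroadcastToKernel uses above.
  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < nums; pos += blockDim.x * gridDim.x) {
    output[pos] = Func()(input0[pos], input1[pos]);
  }
}

int main() {
  const int n = 4;
  const float h0[n] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float h1[n] = {2.0f, 2.0f, 2.0f, 2.0f};
  float h2[n] = {0.0f};
  float *d0 = nullptr, *d1 = nullptr, *d2 = nullptr;
  cudaMalloc(&d0, n * sizeof(float));
  cudaMalloc(&d1, n * sizeof(float));
  cudaMalloc(&d2, n * sizeof(float));
  cudaMemcpy(d0, h0, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d1, h1, n * sizeof(float), cudaMemcpyHostToDevice);
  ElementwiseKernel<float, float, DivFunc<float, float>><<<1, 128>>>(n, d0, d1, d2);
  cudaMemcpy(h2, d2, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%g ", h2[i]);  // expected: 0.5 1 1.5 2
  printf("\n");
  cudaFree(d0);
  cudaFree(d1);
  cudaFree(d2);
  return 0;
}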

View File

@@ -32,6 +32,7 @@ enum BroadcastOpType {
   BROADCAST_TYPE_ADD = 8,
   BROADCAST_TYPE_FLOORDIV = 9,
   BROADCAST_TYPE_ABSGRAD = 10,
+  BROADCAST_TYPE_DIV = 11,
   BROADCAST_TYPE_INVALID = 0xffffffff,
 };
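
A note on the enum: its values are the identifiers shared between the host-side kernel and the CUDA switch statements, so a new op takes the next free slot (11 here) and must get a case in both BroadcastKernel and NoBroadcastKernel; an unhandled value falls through the switch and writes nothing. A host-side sketch of the idiom, simplified to a single op; the function Dispatch is invented for illustration, and only the two enum values shown in the diff are reproduced:

#include <cstdio>

// Values taken from the enum in the diff; the full enum has more ops.
enum BroadcastOpType {
  BROADCAST_TYPE_DIV = 11,
  BROADCAST_TYPE_INVALID = 0xffffffff,
};

template <typename T>
struct DivFunc {
  T operator()(const T &lhs, const T &rhs) const { return lhs / rhs; }
};

template <typename T>
bool Dispatch(BroadcastOpType op, const T &lhs, const T &rhs, T *out) {
  switch (op) {
    case BROADCAST_TYPE_DIV:
      *out = DivFunc<T>()(lhs, rhs);
      return true;
    default:
      // Like the kernels above (which have no default case), an unhandled
      // op computes nothing, hence every new enum value needs a case.
      return false;
  }
}

int main() {
  float out = 0.0f;
  if (Dispatch(BROADCAST_TYPE_DIV, 3.0f, 2.0f, &out)) {
    printf("%g\n", out);  // 1.5
  }
  return 0;
}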

View File

@@ -59,6 +59,9 @@ MS_REG_GPU_KERNEL_TWO(
   AbsGrad,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
   BroadcastOpGpuKernel, float, float)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  BroadcastOpGpuKernel, float, float)
 
 // fp16
 MS_REG_GPU_KERNEL_TWO(
@@ -101,6 +104,9 @@ MS_REG_GPU_KERNEL_TWO(
   AbsGrad,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
   BroadcastOpGpuKernel, half, half)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  BroadcastOpGpuKernel, half, half)
 
 // int32
 MS_REG_GPU_KERNEL_TWO(
@@ -118,14 +124,14 @@ MS_REG_GPU_KERNEL_TWO(
 MS_REG_GPU_KERNEL_TWO(
   Mul, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   RealDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   AbsGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+  BroadcastOpGpuKernel, int, int)
 }  // namespace kernel
 }  // namespace mindspore
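
The registrations above follow a one-macro-per-dtype-signature pattern: Div is registered three times, for float32, float16, and int32, each time binding BroadcastOpGpuKernel to a concrete input/output type pair. As a rough illustration of what such a macro conceptually amounts to; this is not MindSpore's actual machinery, and KernelRegistry, Registrar, and BroadcastKernelStub are invented for the sketch:

// Illustrative static-registrar pattern: a file-scope object whose
// constructor runs at load time and records a kernel factory.
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct KernelBase {
  virtual ~KernelBase() = default;
};

template <typename T, typename S>
struct BroadcastKernelStub : KernelBase {};

using KernelFactory = std::function<std::unique_ptr<KernelBase>()>;

std::map<std::string, KernelFactory> &KernelRegistry() {
  static std::map<std::string, KernelFactory> registry;  // op name + dtypes -> factory
  return registry;
}

struct Registrar {
  Registrar(const std::string &key, KernelFactory factory) {
    KernelRegistry().emplace(key, std::move(factory));
  }
};

// Roughly what one MS_REG_GPU_KERNEL_TWO(Div, ..., float, float) line achieves:
static Registrar g_div_fp32("Div:float32,float32->float32", [] {
  return std::unique_ptr<KernelBase>(new BroadcastKernelStub<float, float>());
});

int main() {
  std::cout << "registered kernels: " << KernelRegistry().size() << "\n";  // 1
  return 0;
}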

View File

@@ -102,7 +102,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
       {"Greater", BROADCAST_TYPE_GREATER}, {"Less", BROADCAST_TYPE_LESS}, {"Maximum", BROADCAST_TYPE_MAXIMUM},
       {"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER}, {"RealDiv", BROADCAST_TYPE_REALDIV},
       {"Mul", BROADCAST_TYPE_MUL}, {"Sub", BROADCAST_TYPE_SUB}, {"TensorAdd", BROADCAST_TYPE_ADD},
-      {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD},
+      {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD}, {"Div", BROADCAST_TYPE_DIV},
     };
     auto iter = kBroadcastTypeMap.find(kernel_name);
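
This is the host-side half of the dispatch: at kernel init, the primitive's name string is resolved through kBroadcastTypeMap to a BroadcastOpType, which the CUDA switches then act on, so the "Div" entry is what makes the new registrations reachable. A simplified, self-contained sketch of that lookup with explicit miss handling; ResolveBroadcastOp is invented for illustration, and only the Div entry is reproduced:

#include <stdexcept>
#include <string>
#include <unordered_map>

enum BroadcastOpType {
  BROADCAST_TYPE_DIV = 11,
  BROADCAST_TYPE_INVALID = 0xffffffff,
};

// Invented wrapper around the kind of name-to-enum map shown above.
BroadcastOpType ResolveBroadcastOp(const std::string &kernel_name) {
  static const std::unordered_map<std::string, BroadcastOpType> kBroadcastTypeMap = {
      {"Div", BROADCAST_TYPE_DIV},
  };
  auto iter = kBroadcastTypeMap.find(kernel_name);
  if (iter == kBroadcastTypeMap.end()) {
    throw std::invalid_argument("unsupported broadcast op: " + kernel_name);
  }
  return iter->second;
}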

View File

@@ -0,0 +1,116 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P


class NetDiv(nn.Cell):
    def __init__(self):
        super(NetDiv, self).__init__()
        self.div = P.Div()

    def construct(self, x, y):
        return self.div(x, y)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_div():
    x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    y1_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.float32)
    x2_np = np.random.randint(1, 5, (2, 1, 1, 4)).astype(np.float32)
    y2_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    x3_np = np.random.randint(1, 5, 1).astype(np.float32)
    y3_np = np.random.randint(1, 5, 1).astype(np.float32)
    x4_np = np.array(768).astype(np.float32)
    y4_np = np.array(3072.5).astype(np.float32)
    x5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
    y5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
    x6_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.int32)
    y6_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.int32)

    x0 = Tensor(x0_np)
    y0 = Tensor(y0_np)
    x1 = Tensor(x1_np)
    y1 = Tensor(y1_np)
    x2 = Tensor(x2_np)
    y2 = Tensor(y2_np)
    x3 = Tensor(x3_np)
    y3 = Tensor(y3_np)
    x4 = Tensor(x4_np)
    y4 = Tensor(y4_np)
    x5 = Tensor(x5_np)
    y5 = Tensor(y5_np)
    x6 = Tensor(x6_np)
    y6 = Tensor(y6_np)

    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    div = NetDiv()
    output0 = div(x0, y0)
    expect0 = np.divide(x0_np, y0_np)
    diff0 = output0.asnumpy() - expect0
    error0 = np.ones(shape=expect0.shape) * 1.0e-5
    assert np.all(diff0 < error0)
    assert output0.shape == expect0.shape

    output1 = div(x1, y1)
    expect1 = np.divide(x1_np, y1_np)
    diff1 = output1.asnumpy() - expect1
    error1 = np.ones(shape=expect1.shape) * 1.0e-5
    assert np.all(diff1 < error1)
    assert output1.shape == expect1.shape

    output2 = div(x2, y2)
    expect2 = np.divide(x2_np, y2_np)
    diff2 = output2.asnumpy() - expect2
    error2 = np.ones(shape=expect2.shape) * 1.0e-5
    assert np.all(diff2 < error2)
    assert output2.shape == expect2.shape

    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
    output3 = div(x3, y3)
    expect3 = np.divide(x3_np, y3_np)
    diff3 = output3.asnumpy() - expect3
    error3 = np.ones(shape=expect3.shape) * 1.0e-5
    assert np.all(diff3 < error3)
    assert output3.shape == expect3.shape

    output4 = div(x4, y4)
    expect4 = np.divide(x4_np, y4_np)
    diff4 = output4.asnumpy() - expect4
    error4 = np.ones(shape=expect4.shape) * 1.0e-5
    assert np.all(diff4 < error4)
    assert output4.shape == expect4.shape

    output5 = div(x5, y5)
    expect5 = np.divide(x5_np, y5_np)
    diff5 = output5.asnumpy() - expect5
    error5 = np.ones(shape=expect5.shape) * 1.0e-5
    assert np.all(diff5 < error5)
    assert output5.shape == expect5.shape

    output6 = div(x6, y6)
    expect6 = np.divide(x6_np, y6_np)
    diff6 = output6.asnumpy() - expect6
    error6 = np.ones(shape=expect6.shape) * 1.0e-5
    assert np.all(diff6 < error6)
    assert output6.shape == expect6.shape
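
A note on the test: each case compares the GPU result against np.divide elementwise within 1e-5, running cases 0 to 2 in graph mode and cases 3 to 6 in PyNative mode, and together the cases cover broadcast shapes, 1-element and 0-d inputs, and the float32, float16, and int32 registrations added above. For the int32 case, DivFunc instantiates to native C++ integer division, so the GPU output is truncated while np.divide produces a floating-point quotient; the one-sided diff < error check tolerates that difference because, for these positive inputs, the truncated quotient never exceeds the NumPy one.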