!4226 Add GPU div kernel for auto parallel

Merge pull request !4226 from ZPaC/master-add-gpu-div-kernel
mindspore-ci-bot 2020-08-11 15:06:06 +08:00 committed by Gitee
commit 2d1674475c
5 changed files with 150 additions and 21 deletions
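
In brief, the change wires a Div broadcast op through the whole GPU stack: a DivFunc functor in the CUDA broadcast implementation, a new BROADCAST_TYPE_DIV value in the op-type enum, Div kernel registrations for float32, float16, and int32, a "Div" entry in the kernel-name lookup map, and a new st test covering broadcast shapes, scalar inputs, and all three dtypes.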

View File

@@ -55,6 +55,11 @@ struct RealDivFunc {
   __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
 };
+template <typename T, typename S>
+struct DivFunc {
+  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
+};
 template <typename T, typename S>
 struct MulFunc {
   __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs * rhs); }
@@ -78,7 +83,7 @@ struct FloorDivFunc {
 template <>
 struct FloorDivFunc<half, half> {
   __device__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
-    return __float2half(floor(__half2float(lhs)/ __half2float(rhs)));
+    return __float2half(floor(__half2float(lhs) / __half2float(rhs)));
   }
 };
@@ -96,7 +101,6 @@ struct AbsGradFunc {
   }
 };
 template <>
 struct PowerFunc<half, bool> {
   // invalid branch
@@ -105,7 +109,6 @@ struct PowerFunc<half, bool> {
 __device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
 template <typename T, typename S, typename Func>
 __device__ __forceinline__ void BroadcastOperator(const int &l0, const int &l1, const int &l2, const int &l3,
                                                   const int &l4, const int &l5, const int &l6, const int &r0,
@@ -181,6 +184,9 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const
     case BROADCAST_TYPE_ABSGRAD:
       return BroadcastOperator<T, S, AbsGradFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
                                                         d2, d3, d4, d5, d6, input0, input1, output);
+    case BROADCAST_TYPE_DIV:
+      return BroadcastOperator<T, S, DivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2,
+                                                    d3, d4, d5, d6, input0, input1, output);
   }
 }
@@ -192,13 +198,11 @@ void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_sh
   for (auto d : output_shape) {
     size *= d;
   }
-  BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(lhs_shape[0], lhs_shape[1], lhs_shape[2], lhs_shape[3],
-                                                                lhs_shape[4], lhs_shape[5], lhs_shape[6], rhs_shape[0],
-                                                                rhs_shape[1], rhs_shape[2], rhs_shape[3], rhs_shape[4],
-                                                                rhs_shape[5], rhs_shape[6], output_shape[0],
-                                                                output_shape[1], output_shape[2], output_shape[3],
-                                                                output_shape[4], output_shape[5], output_shape[6],
-                                                                op, input0, input1, output);
+  BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(
+    lhs_shape[0], lhs_shape[1], lhs_shape[2], lhs_shape[3], lhs_shape[4], lhs_shape[5], lhs_shape[6], rhs_shape[0],
+    rhs_shape[1], rhs_shape[2], rhs_shape[3], rhs_shape[4], rhs_shape[5], rhs_shape[6], output_shape[0],
+    output_shape[1], output_shape[2], output_shape[3], output_shape[4], output_shape[5], output_shape[6], op, input0,
+    input1, output);
 }
 
 template <typename T, typename S, typename Func>
@@ -234,6 +238,8 @@ __global__ void NoBroadcastKernel(const int nums, enum BroadcastOpType op, const
       return NoBroadcastOperator<T, S, FloorDivFunc<T, S>>(nums, input0, input1, output);
     case BROADCAST_TYPE_ABSGRAD:
       return NoBroadcastOperator<T, S, AbsGradFunc<T, S>>(nums, input0, input1, output);
+    case BROADCAST_TYPE_DIV:
+      return NoBroadcastOperator<T, S, DivFunc<T, S>>(nums, input0, input1, output);
   }
 }
@@ -244,8 +250,8 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons
 }
 
 template <typename T>
-__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0,
-                                  const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) {
+__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, const int o1,
+                                  const int o2, const int o3, const T *input_addr, T *output_addr) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) {
     int i = pos / (o1 * o2 * o3) % o0;
     int j = pos / (o2 * o3) % o1;
@@ -262,7 +268,7 @@ void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, con
                  const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
   int nums = o0 * o1 * o2 * o3;
   BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
-                                                                 output_addr);
+                                                                  output_addr);
 }
 
 template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
@@ -291,10 +297,10 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
                           bool *output, cudaStream_t stream);
 template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
                           half *output, cudaStream_t stream);
-template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
-                          int *output, cudaStream_t stream);
-template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
-                          bool *output, cudaStream_t stream);
+template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
+                          cudaStream_t stream);
+template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, bool *output,
+                          cudaStream_t stream);
 template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
                           const int &o2, const int &o3, const float *input_addr, float *output_addr,
                           cudaStream_t stream);
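
For context, the diff above follows this file's functor-plus-kernel pattern: DivFunc supplies the per-element arithmetic, while BroadcastOperator and NoBroadcastOperator supply the indexing and the loop. Below is a minimal, self-contained sketch of the non-broadcast path; only DivFunc is taken verbatim from the diff, and ElementwiseKernel is a hypothetical, simplified stand-in for NoBroadcastKernel with the op-type switch removed.

// Minimal sketch, compiles standalone with nvcc. ElementwiseKernel is an
// invented stand-in for NoBroadcastKernel; DivFunc is copied from the diff.
#include <cstdio>
#include <cuda_runtime.h>

template <typename T, typename S>
struct DivFunc {
  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};

template <typename T, typename S, typename Func>
__global__ void ElementwiseKernel(const int nums, const T *input0, const T *input1, S *output) {
  // Grid-stride loop, the same pattern BroadcastToKernel uses above.
  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < nums; pos += blockDim.x * gridDim.x) {
    output[pos] = Func()(input0[pos], input1[pos]);
  }
}

int main() {
  const int n = 4;
  const float h0[n] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float h1[n] = {2.0f, 2.0f, 2.0f, 2.0f};
  float h2[n] = {0.0f};
  float *d0 = nullptr, *d1 = nullptr, *d2 = nullptr;
  cudaMalloc(&d0, n * sizeof(float));
  cudaMalloc(&d1, n * sizeof(float));
  cudaMalloc(&d2, n * sizeof(float));
  cudaMemcpy(d0, h0, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d1, h1, n * sizeof(float), cudaMemcpyHostToDevice);
  ElementwiseKernel<float, float, DivFunc<float, float>><<<1, 128>>>(n, d0, d1, d2);
  cudaMemcpy(h2, d2, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%g ", h2[i]);  // expected: 0.5 1 1.5 2
  printf("\n");
  cudaFree(d0);
  cudaFree(d1);
  cudaFree(d2);
  return 0;
}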

View File

@@ -32,6 +32,7 @@ enum BroadcastOpType {
   BROADCAST_TYPE_ADD = 8,
   BROADCAST_TYPE_FLOORDIV = 9,
   BROADCAST_TYPE_ABSGRAD = 10,
+  BROADCAST_TYPE_DIV = 11,
   BROADCAST_TYPE_INVALID = 0xffffffff,
 };
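
A note on the enum: its values are the identifiers shared between the host-side kernel and the CUDA switch statements, so a new op takes the next free slot (11 here) and must get a case in both BroadcastKernel and NoBroadcastKernel; an unhandled value falls through the switch and writes nothing. A host-side sketch of the idiom, simplified to a single op; the function Dispatch is invented for illustration, and only the two enum values shown in the diff are reproduced:

#include <cstdio>

// Values taken from the enum in the diff; the full enum has more ops.
enum BroadcastOpType {
  BROADCAST_TYPE_DIV = 11,
  BROADCAST_TYPE_INVALID = 0xffffffff,
};

template <typename T>
struct DivFunc {
  T operator()(const T &lhs, const T &rhs) const { return lhs / rhs; }
};

template <typename T>
bool Dispatch(BroadcastOpType op, const T &lhs, const T &rhs, T *out) {
  switch (op) {
    case BROADCAST_TYPE_DIV:
      *out = DivFunc<T>()(lhs, rhs);
      return true;
    default:
      // Like the kernels above (which have no default case), an unhandled
      // op computes nothing, hence every new enum value needs a case.
      return false;
  }
}

int main() {
  float out = 0.0f;
  if (Dispatch(BROADCAST_TYPE_DIV, 3.0f, 2.0f, &out)) {
    printf("%g\n", out);  // 1.5
  }
  return 0;
}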

View File

@@ -59,6 +59,9 @@ MS_REG_GPU_KERNEL_TWO(
   AbsGrad,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
   BroadcastOpGpuKernel, float, float)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  BroadcastOpGpuKernel, float, float)
 
 // fp16
 MS_REG_GPU_KERNEL_TWO(
@@ -101,6 +104,9 @@ MS_REG_GPU_KERNEL_TWO(
   AbsGrad,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
   BroadcastOpGpuKernel, half, half)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  BroadcastOpGpuKernel, half, half)
 
 // int32
 MS_REG_GPU_KERNEL_TWO(
@@ -118,14 +124,14 @@ MS_REG_GPU_KERNEL_TWO(
 MS_REG_GPU_KERNEL_TWO(
   Mul, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   RealDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   AbsGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+  BroadcastOpGpuKernel, int, int)
 }  // namespace kernel
 }  // namespace mindspore
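
The registrations above follow a one-macro-per-dtype-signature pattern: Div is registered three times, for float32, float16, and int32, each time binding BroadcastOpGpuKernel to a concrete input/output type pair. As a rough illustration of what such a macro conceptually amounts to; this is not MindSpore's actual machinery, and KernelRegistry, Registrar, and BroadcastKernelStub are invented for the sketch:

// Illustrative static-registrar pattern: a file-scope object whose
// constructor runs at load time and records a kernel factory.
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct KernelBase {
  virtual ~KernelBase() = default;
};

template <typename T, typename S>
struct BroadcastKernelStub : KernelBase {};

using KernelFactory = std::function<std::unique_ptr<KernelBase>()>;

std::map<std::string, KernelFactory> &KernelRegistry() {
  static std::map<std::string, KernelFactory> registry;  // op name + dtypes -> factory
  return registry;
}

struct Registrar {
  Registrar(const std::string &key, KernelFactory factory) {
    KernelRegistry().emplace(key, std::move(factory));
  }
};

// Roughly what one MS_REG_GPU_KERNEL_TWO(Div, ..., float, float) line achieves:
static Registrar g_div_fp32("Div:float32,float32->float32", [] {
  return std::unique_ptr<KernelBase>(new BroadcastKernelStub<float, float>());
});

int main() {
  std::cout << "registered kernels: " << KernelRegistry().size() << "\n";  // 1
  return 0;
}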

View File

@@ -102,7 +102,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
       {"Greater", BROADCAST_TYPE_GREATER}, {"Less", BROADCAST_TYPE_LESS}, {"Maximum", BROADCAST_TYPE_MAXIMUM},
       {"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER}, {"RealDiv", BROADCAST_TYPE_REALDIV},
       {"Mul", BROADCAST_TYPE_MUL}, {"Sub", BROADCAST_TYPE_SUB}, {"TensorAdd", BROADCAST_TYPE_ADD},
-      {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD},
+      {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD}, {"Div", BROADCAST_TYPE_DIV},
     };
     auto iter = kBroadcastTypeMap.find(kernel_name);
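
This is the host-side half of the dispatch: at kernel init, the primitive's name string is resolved through kBroadcastTypeMap to a BroadcastOpType, which the CUDA switches then act on, so the "Div" entry is what makes the new registrations reachable. A simplified, self-contained sketch of that lookup with explicit miss handling; ResolveBroadcastOp is invented for illustration, and only the Div entry is reproduced:

#include <stdexcept>
#include <string>
#include <unordered_map>

enum BroadcastOpType {
  BROADCAST_TYPE_DIV = 11,
  BROADCAST_TYPE_INVALID = 0xffffffff,
};

// Invented wrapper around the kind of name-to-enum map shown above.
BroadcastOpType ResolveBroadcastOp(const std::string &kernel_name) {
  static const std::unordered_map<std::string, BroadcastOpType> kBroadcastTypeMap = {
      {"Div", BROADCAST_TYPE_DIV},
  };
  auto iter = kBroadcastTypeMap.find(kernel_name);
  if (iter == kBroadcastTypeMap.end()) {
    throw std::invalid_argument("unsupported broadcast op: " + kernel_name);
  }
  return iter->second;
}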

View File

@@ -0,0 +1,116 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P


class NetDiv(nn.Cell):
    def __init__(self):
        super(NetDiv, self).__init__()
        self.div = P.Div()

    def construct(self, x, y):
        return self.div(x, y)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_div():
    x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    y1_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.float32)
    x2_np = np.random.randint(1, 5, (2, 1, 1, 4)).astype(np.float32)
    y2_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    x3_np = np.random.randint(1, 5, 1).astype(np.float32)
    y3_np = np.random.randint(1, 5, 1).astype(np.float32)
    x4_np = np.array(768).astype(np.float32)
    y4_np = np.array(3072.5).astype(np.float32)
    x5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
    y5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
    x6_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.int32)
    y6_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.int32)

    x0 = Tensor(x0_np)
    y0 = Tensor(y0_np)
    x1 = Tensor(x1_np)
    y1 = Tensor(y1_np)
    x2 = Tensor(x2_np)
    y2 = Tensor(y2_np)
    x3 = Tensor(x3_np)
    y3 = Tensor(y3_np)
    x4 = Tensor(x4_np)
    y4 = Tensor(y4_np)
    x5 = Tensor(x5_np)
    y5 = Tensor(y5_np)
    x6 = Tensor(x6_np)
    y6 = Tensor(y6_np)

    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    div = NetDiv()
    output0 = div(x0, y0)
    expect0 = np.divide(x0_np, y0_np)
    diff0 = output0.asnumpy() - expect0
    error0 = np.ones(shape=expect0.shape) * 1.0e-5
    assert np.all(diff0 < error0)
    assert output0.shape == expect0.shape

    output1 = div(x1, y1)
    expect1 = np.divide(x1_np, y1_np)
    diff1 = output1.asnumpy() - expect1
    error1 = np.ones(shape=expect1.shape) * 1.0e-5
    assert np.all(diff1 < error1)
    assert output1.shape == expect1.shape

    output2 = div(x2, y2)
    expect2 = np.divide(x2_np, y2_np)
    diff2 = output2.asnumpy() - expect2
    error2 = np.ones(shape=expect2.shape) * 1.0e-5
    assert np.all(diff2 < error2)
    assert output2.shape == expect2.shape

    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
    output3 = div(x3, y3)
    expect3 = np.divide(x3_np, y3_np)
    diff3 = output3.asnumpy() - expect3
    error3 = np.ones(shape=expect3.shape) * 1.0e-5
    assert np.all(diff3 < error3)
    assert output3.shape == expect3.shape

    output4 = div(x4, y4)
    expect4 = np.divide(x4_np, y4_np)
    diff4 = output4.asnumpy() - expect4
    error4 = np.ones(shape=expect4.shape) * 1.0e-5
    assert np.all(diff4 < error4)
    assert output4.shape == expect4.shape

    output5 = div(x5, y5)
    expect5 = np.divide(x5_np, y5_np)
    diff5 = output5.asnumpy() - expect5
    error5 = np.ones(shape=expect5.shape) * 1.0e-5
    assert np.all(diff5 < error5)
    assert output5.shape == expect5.shape

    output6 = div(x6, y6)
    expect6 = np.divide(x6_np, y6_np)
    diff6 = output6.asnumpy() - expect6
    error6 = np.ones(shape=expect6.shape) * 1.0e-5
    assert np.all(diff6 < error6)
    assert output6.shape == expect6.shape
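
A note on the test: each case compares the GPU result against np.divide elementwise within 1e-5, running cases 0 to 2 in graph mode and cases 3 to 6 in PyNative mode, and together the cases cover broadcast shapes, 1-element and 0-d inputs, and the float32, float16, and int32 registrations added above. For the int32 case, DivFunc instantiates to native C++ integer division, so the GPU output is truncated while np.divide produces a floating-point quotient; the one-sided diff < error check tolerates that difference because, for these positive inputs, the truncated quotient never exceeds the NumPy one.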