forked from mindspore-Ecosystem/mindspore
!4226 Add GPU div kernel for auto parallel
Merge pull request !4226 from ZPaC/master-add-gpu-div-kernel
Commit 2d1674475c
@@ -55,6 +55,11 @@ struct RealDivFunc {
   __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
 };
 
+template <typename T, typename S>
+struct DivFunc {
+  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
+};
+
 template <typename T, typename S>
 struct MulFunc {
   __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs * rhs); }
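Note: the new DivFunc deliberately mirrors RealDivFunc: every binary op in this file is a small functor with a __device__ operator(), and the operator kernels are templated on the functor so the call inlines into the kernel body. A minimal, self-contained sketch of that pattern (hypothetical names, not part of this commit; compiles with nvcc):

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for the functor pattern above: one struct per binary op,
// with the division done inside a __device__ operator().
template <typename T, typename S>
struct DivFuncSketch {
  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};

// Grid-stride elementwise kernel templated on the functor, the same shape as
// the NoBroadcastOperator helper in this file.
template <typename T, typename S, typename Func>
__global__ void ElementwiseKernel(const int nums, const T *in0, const T *in1, S *out) {
  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < nums; pos += blockDim.x * gridDim.x) {
    out[pos] = Func()(in0[pos], in1[pos]);
  }
}

int main() {
  const int n = 8;
  float h0[n], h1[n], hr[n];
  for (int i = 0; i < n; ++i) { h0[i] = i + 1.0f; h1[i] = 2.0f; }
  float *d0, *d1, *dr;
  cudaMalloc(&d0, n * sizeof(float));
  cudaMalloc(&d1, n * sizeof(float));
  cudaMalloc(&dr, n * sizeof(float));
  cudaMemcpy(d0, h0, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d1, h1, n * sizeof(float), cudaMemcpyHostToDevice);
  ElementwiseKernel<float, float, DivFuncSketch<float, float>><<<1, 256>>>(n, d0, d1, dr);
  cudaMemcpy(hr, dr, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%g ", hr[i]);  // prints 0.5 1 1.5 ... 4
  cudaFree(d0); cudaFree(d1); cudaFree(dr);
  return 0;
}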
|
@@ -78,7 +83,7 @@ struct FloorDivFunc {
 template <>
 struct FloorDivFunc<half, half> {
   __device__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
-    return __float2half(floor(__half2float(lhs)/ __half2float(rhs)));
+    return __float2half(floor(__half2float(lhs) / __half2float(rhs)));
   }
 };
 
@@ -96,7 +101,6 @@ struct AbsGradFunc {
   }
 };
 
-
 template <>
 struct PowerFunc<half, bool> {
   // invalid branch
@@ -105,7 +109,6 @@ struct PowerFunc<half, bool> {
 
 __device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
 
-
 template <typename T, typename S, typename Func>
 __device__ __forceinline__ void BroadcastOperator(const int &l0, const int &l1, const int &l2, const int &l3,
                                                   const int &l4, const int &l5, const int &l6, const int &r0,
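Note: the Index helper in the context above is the whole broadcasting mechanism: an operand axis of size 1 always maps to index 0, so its single element is reused across that axis while full-size axes index normally. A host-side sketch of the same arithmetic (hypothetical example, not from this diff):

#include <cstdio>

// Mirrors the __device__ Index helper: a size-1 dimension always maps to index 0.
inline int Index(const int index, const int dim) { return dim == 1 ? 0 : index; }

int main() {
  // lhs shape (2, 1), rhs shape (2, 3): the lhs column index is clamped to 0.
  const int lhs[2][1] = {{10}, {20}};
  const int rhs[2][3] = {{1, 2, 3}, {4, 5, 6}};
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 3; ++j) {
      // Each operand reads with its own clamped indices, per axis,
      // exactly as BroadcastOperator does over its seven dimensions.
      int l = lhs[Index(i, 2)][Index(j, 1)];
      int r = rhs[Index(i, 2)][Index(j, 3)];
      printf("%d ", l / r);  // broadcasted division
    }
    printf("\n");
  }
  return 0;
}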
|
@@ -181,6 +184,9 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const
     case BROADCAST_TYPE_ABSGRAD:
       return BroadcastOperator<T, S, AbsGradFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1,
                                                         d2, d3, d4, d5, d6, input0, input1, output);
+    case BROADCAST_TYPE_DIV:
+      return BroadcastOperator<T, S, DivFunc<T, S>>(l0, l1, l2, l3, l4, l5, l6, r0, r1, r2, r3, r4, r5, r6, d0, d1, d2,
+                                                    d3, d4, d5, d6, input0, input1, output);
   }
 }
 
@@ -192,13 +198,11 @@ void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
   for (auto d : output_shape) {
     size *= d;
   }
-  BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(lhs_shape[0], lhs_shape[1], lhs_shape[2], lhs_shape[3],
-                                                                lhs_shape[4], lhs_shape[5], lhs_shape[6], rhs_shape[0],
-                                                                rhs_shape[1], rhs_shape[2], rhs_shape[3], rhs_shape[4],
-                                                                rhs_shape[5], rhs_shape[6], output_shape[0],
-                                                                output_shape[1], output_shape[2], output_shape[3],
-                                                                output_shape[4], output_shape[5], output_shape[6],
-                                                                op, input0, input1, output);
+  BroadcastKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(
+    lhs_shape[0], lhs_shape[1], lhs_shape[2], lhs_shape[3], lhs_shape[4], lhs_shape[5], lhs_shape[6], rhs_shape[0],
+    rhs_shape[1], rhs_shape[2], rhs_shape[3], rhs_shape[4], rhs_shape[5], rhs_shape[6], output_shape[0],
+    output_shape[1], output_shape[2], output_shape[3], output_shape[4], output_shape[5], output_shape[6], op, input0,
+    input1, output);
 }
 
 template <typename T, typename S, typename Func>
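Note: GET_BLOCKS and GET_THREADS come from MindSpore's shared CUDA header and are not part of this diff. The sketch below shows a typical launch-size computation of this kind, with assumed constants rather than MindSpore's actual ones:

// Sketch of a launch-size helper; MindSpore's real GET_BLOCKS/GET_THREADS live
// elsewhere and may differ (assumed values, for illustration only).
constexpr int kThreadsPerBlock = 256;

inline int BlocksFor(const int total) {
  // Ceiling division so every element gets a thread; the grid-stride loops
  // inside the kernels tolerate any smaller grid as well.
  return (total + kThreadsPerBlock - 1) / kThreadsPerBlock;
}

// Usage mirroring the launch above:
//   BroadcastKernel<<<BlocksFor(size), kThreadsPerBlock, 0, stream>>>(...);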
|
@@ -234,6 +238,8 @@ __global__ void NoBroadcastKernel(const int nums, enum BroadcastOpType op, const
       return NoBroadcastOperator<T, S, FloorDivFunc<T, S>>(nums, input0, input1, output);
     case BROADCAST_TYPE_ABSGRAD:
       return NoBroadcastOperator<T, S, AbsGradFunc<T, S>>(nums, input0, input1, output);
+    case BROADCAST_TYPE_DIV:
+      return NoBroadcastOperator<T, S, DivFunc<T, S>>(nums, input0, input1, output);
   }
 }
 
@@ -244,8 +250,8 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, const
 }
 
 template <typename T>
-__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0,
-                                  const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) {
+__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, const int o1,
+                                  const int o2, const int o3, const T *input_addr, T *output_addr) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) {
     int i = pos / (o1 * o2 * o3) % o0;
     int j = pos / (o2 * o3) % o1;
@@ -262,7 +268,7 @@ void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const
                  const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
   int nums = o0 * o1 * o2 * o3;
   BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
-                                                                 output_addr);
+                                                                  output_addr);
 }
 
 template void Broadcast(const std::vector<int> &lhs_shape, const std::vector<int> &rhs_shape,
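Note: BroadcastToKernel walks the flattened output with a grid-stride loop and recovers the 4-D coordinates by divide-and-modulo, as in the i/j lines above. A quick host-side check of that decomposition (hypothetical values, same arithmetic):

#include <cstdio>

int main() {
  // Decompose a flat position into 4-D coordinates the way BroadcastToKernel does.
  const int o0 = 2, o1 = 3, o2 = 4, o3 = 5;
  const int pos = 47;  // arbitrary flat index < o0 * o1 * o2 * o3
  int i = pos / (o1 * o2 * o3) % o0;
  int j = pos / (o2 * o3) % o1;
  int k = pos / o3 % o2;
  int l = pos % o3;
  // Recomposing the coordinates must give back pos: prints (0,2,1,2) -> 47.
  printf("(%d,%d,%d,%d) -> %d\n", i, j, k, l, ((i * o1 + j) * o2 + k) * o3 + l);
  return 0;
}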
@@ -291,10 +297,10 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
                           bool *output, cudaStream_t stream);
 template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
                           half *output, cudaStream_t stream);
-template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
-                          int *output, cudaStream_t stream);
-template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
-                          bool *output, cudaStream_t stream);
+template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
+                          cudaStream_t stream);
+template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, bool *output,
+                          cudaStream_t stream);
 template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
                           const int &o2, const int &o3, const float *input_addr, float *output_addr,
                           cudaStream_t stream);
@@ -32,6 +32,7 @@ enum BroadcastOpType {
   BROADCAST_TYPE_ADD = 8,
   BROADCAST_TYPE_FLOORDIV = 9,
   BROADCAST_TYPE_ABSGRAD = 10,
+  BROADCAST_TYPE_DIV = 11,
   BROADCAST_TYPE_INVALID = 0xffffffff,
 };
 
@@ -59,6 +59,9 @@ MS_REG_GPU_KERNEL_TWO(
   AbsGrad,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
   BroadcastOpGpuKernel, float, float)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  BroadcastOpGpuKernel, float, float)
 
 // fp16
 MS_REG_GPU_KERNEL_TWO(
@@ -101,6 +104,9 @@ MS_REG_GPU_KERNEL_TWO(
   AbsGrad,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
   BroadcastOpGpuKernel, half, half)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  BroadcastOpGpuKernel, half, half)
 
 // int32
 MS_REG_GPU_KERNEL_TWO(
@@ -118,14 +124,14 @@ MS_REG_GPU_KERNEL_TWO(
 MS_REG_GPU_KERNEL_TWO(
   Mul, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   RealDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
 MS_REG_GPU_KERNEL_TWO(
   AbsGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   BroadcastOpGpuKernel, int, int)
+MS_REG_GPU_KERNEL_TWO(
+  Div, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+  BroadcastOpGpuKernel, int, int)
 }  // namespace kernel
 }  // namespace mindspore
 
@@ -102,7 +102,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
       {"Greater", BROADCAST_TYPE_GREATER}, {"Less", BROADCAST_TYPE_LESS}, {"Maximum", BROADCAST_TYPE_MAXIMUM},
       {"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER}, {"RealDiv", BROADCAST_TYPE_REALDIV},
       {"Mul", BROADCAST_TYPE_MUL}, {"Sub", BROADCAST_TYPE_SUB}, {"TensorAdd", BROADCAST_TYPE_ADD},
-      {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD},
+      {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD}, {"Div", BROADCAST_TYPE_DIV},
     };
 
     auto iter = kBroadcastTypeMap.find(kernel_name);
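Note: the kernel name is resolved to a BroadcastOpType through kBroadcastTypeMap, so registering Div also requires the {"Div", BROADCAST_TYPE_DIV} entry added here. A trimmed-down sketch of the lookup-with-fallback pattern (hypothetical, only the Div entry shown):

#include <cstdio>
#include <string>
#include <unordered_map>

// Hypothetical mirror of the name -> op lookup in BroadcastOpGpuKernel.
enum BroadcastOpType { BROADCAST_TYPE_DIV = 11, BROADCAST_TYPE_INVALID = 0xffffffff };

int main() {
  static const std::unordered_map<std::string, BroadcastOpType> kMap = {
      {"Div", BROADCAST_TYPE_DIV},
  };
  auto iter = kMap.find("Div");
  // An unknown kernel name falls back to BROADCAST_TYPE_INVALID.
  BroadcastOpType op = (iter == kMap.end()) ? BROADCAST_TYPE_INVALID : iter->second;
  printf("op = %d\n", static_cast<int>(op));  // prints op = 11
  return 0;
}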
@@ -0,0 +1,116 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops import operations as P
+
+
+class NetDiv(nn.Cell):
+    def __init__(self):
+        super(NetDiv, self).__init__()
+        self.div = P.Div()
+
+    def construct(self, x, y):
+        return self.div(x, y)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_div():
+    x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
+    y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
+    x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
+    y1_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.float32)
+    x2_np = np.random.randint(1, 5, (2, 1, 1, 4)).astype(np.float32)
+    y2_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
+    x3_np = np.random.randint(1, 5, 1).astype(np.float32)
+    y3_np = np.random.randint(1, 5, 1).astype(np.float32)
+    x4_np = np.array(768).astype(np.float32)
+    y4_np = np.array(3072.5).astype(np.float32)
+    x5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
+    y5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
+    x6_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.int32)
+    y6_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.int32)
+
+    x0 = Tensor(x0_np)
+    y0 = Tensor(y0_np)
+    x1 = Tensor(x1_np)
+    y1 = Tensor(y1_np)
+    x2 = Tensor(x2_np)
+    y2 = Tensor(y2_np)
+    x3 = Tensor(x3_np)
+    y3 = Tensor(y3_np)
+    x4 = Tensor(x4_np)
+    y4 = Tensor(y4_np)
+    x5 = Tensor(x5_np)
+    y5 = Tensor(y5_np)
+    x6 = Tensor(x6_np)
+    y6 = Tensor(y6_np)
+
+    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
+    div = NetDiv()
+    output0 = div(x0, y0)
+    expect0 = np.divide(x0_np, y0_np)
+    diff0 = output0.asnumpy() - expect0
+    error0 = np.ones(shape=expect0.shape) * 1.0e-5
+    assert np.all(diff0 < error0)
+    assert output0.shape == expect0.shape
+
+    output1 = div(x1, y1)
+    expect1 = np.divide(x1_np, y1_np)
+    diff1 = output1.asnumpy() - expect1
+    error1 = np.ones(shape=expect1.shape) * 1.0e-5
+    assert np.all(diff1 < error1)
+    assert output1.shape == expect1.shape
+
+    output2 = div(x2, y2)
+    expect2 = np.divide(x2_np, y2_np)
+    diff2 = output2.asnumpy() - expect2
+    error2 = np.ones(shape=expect2.shape) * 1.0e-5
+    assert np.all(diff2 < error2)
+    assert output2.shape == expect2.shape
+
+    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
+    output3 = div(x3, y3)
+    expect3 = np.divide(x3_np, y3_np)
+    diff3 = output3.asnumpy() - expect3
+    error3 = np.ones(shape=expect3.shape) * 1.0e-5
+    assert np.all(diff3 < error3)
+    assert output3.shape == expect3.shape
+
+    output4 = div(x4, y4)
+    expect4 = np.divide(x4_np, y4_np)
+    diff4 = output4.asnumpy() - expect4
+    error4 = np.ones(shape=expect4.shape) * 1.0e-5
+    assert np.all(diff4 < error4)
+    assert output4.shape == expect4.shape
+
+    output5 = div(x5, y5)
+    expect5 = np.divide(x5_np, y5_np)
+    diff5 = output5.asnumpy() - expect5
+    error5 = np.ones(shape=expect5.shape) * 1.0e-5
+    assert np.all(diff5 < error5)
+    assert output5.shape == expect5.shape
+
+    output6 = div(x6, y6)
+    expect6 = np.divide(x6_np, y6_np)
+    diff6 = output6.asnumpy() - expect6
+    error6 = np.ones(shape=expect6.shape) * 1.0e-5
+    assert np.all(diff6 < error6)
+    assert output6.shape == expect6.shape