forked from mindspore-Ecosystem/mindspore
broadcast kernel support unequal dims & half
This commit is contained in:
parent
bab6e0f549
commit
0a1195ddf5
|
@ -42,6 +42,19 @@ struct PowerFunc {
|
||||||
__device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return pow(lhs, rhs); }
|
__device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return pow(lhs, rhs); }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct PowerFunc<half, half> {
|
||||||
|
__device__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
|
||||||
|
return __float2half(pow(__half2float(lhs), __half2float(rhs)));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct PowerFunc<half, bool> {
|
||||||
|
// invalid branch
|
||||||
|
__device__ __forceinline__ half operator()(const half &lhs, const half &rhs) { return false; }
|
||||||
|
};
|
||||||
|
|
||||||
__device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
|
__device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
|
||||||
|
|
||||||
template <typename T, typename S, typename Func>
|
template <typename T, typename S, typename Func>
|
||||||
|
@ -131,8 +144,20 @@ template void Broadcast(const int &l0, const int &l1, const int &l2, const int &
|
||||||
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
|
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
|
||||||
enum BroadcastOpType op, const float *input0, const float *input1, float *output,
|
enum BroadcastOpType op, const float *input0, const float *input1, float *output,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
|
||||||
|
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
|
||||||
|
enum BroadcastOpType op, const half *input0, const half *input1, bool *output,
|
||||||
|
cudaStream_t stream);
|
||||||
|
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
|
||||||
|
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
|
||||||
|
enum BroadcastOpType op, const half *input0, const half *input1, half *output,
|
||||||
|
cudaStream_t stream);
|
||||||
|
|
||||||
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1,
|
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1,
|
||||||
bool *output, cudaStream_t stream);
|
bool *output, cudaStream_t stream);
|
||||||
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1,
|
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const float *input0, const float *input1,
|
||||||
float *output, cudaStream_t stream);
|
float *output, cudaStream_t stream);
|
||||||
|
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
|
||||||
|
bool *output, cudaStream_t stream);
|
||||||
|
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
|
||||||
|
half *output, cudaStream_t stream);
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
|
// fp32
|
||||||
MS_REG_GPU_KERNEL_TWO(
|
MS_REG_GPU_KERNEL_TWO(
|
||||||
Greater,
|
Greater,
|
||||||
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
|
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
|
||||||
|
@ -36,5 +37,25 @@ MS_REG_GPU_KERNEL_TWO(
|
||||||
MS_REG_GPU_KERNEL_TWO(
|
MS_REG_GPU_KERNEL_TWO(
|
||||||
Pow, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
Pow, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||||
BroadcastOpGpuKernel, float, float)
|
BroadcastOpGpuKernel, float, float)
|
||||||
|
|
||||||
|
// fp16
|
||||||
|
MS_REG_GPU_KERNEL_TWO(
|
||||||
|
Greater,
|
||||||
|
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
|
||||||
|
BroadcastOpGpuKernel, half, bool)
|
||||||
|
MS_REG_GPU_KERNEL_TWO(
|
||||||
|
Less, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
|
||||||
|
BroadcastOpGpuKernel, half, bool)
|
||||||
|
MS_REG_GPU_KERNEL_TWO(
|
||||||
|
Maximum,
|
||||||
|
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||||
|
BroadcastOpGpuKernel, half, half)
|
||||||
|
MS_REG_GPU_KERNEL_TWO(
|
||||||
|
Minimum,
|
||||||
|
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||||
|
BroadcastOpGpuKernel, half, half)
|
||||||
|
MS_REG_GPU_KERNEL_TWO(
|
||||||
|
Pow, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||||
|
BroadcastOpGpuKernel, half, half)
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -65,15 +65,20 @@ class BroadcastOpGpuKernel : public GpuKernel {
|
||||||
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 4";
|
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 4";
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < shape1.size(); i++) {
|
for (size_t i = 0; i < shape3.size(); i++) {
|
||||||
lhs_shape_[i] = shape1[i];
|
|
||||||
rhs_shape_[i] = shape2[i];
|
|
||||||
output_shape_[i] = shape3[i];
|
output_shape_[i] = shape3[i];
|
||||||
|
|
||||||
input1_num_ *= shape1[i];
|
|
||||||
input2_num_ *= shape2[i];
|
|
||||||
output_num_ *= shape3[i];
|
output_num_ *= shape3[i];
|
||||||
}
|
}
|
||||||
|
int offset = shape3.size() - shape1.size();
|
||||||
|
for (size_t i = 0; i < shape1.size(); i++) {
|
||||||
|
lhs_shape_[i + offset] = shape1[i];
|
||||||
|
input1_num_ *= shape1[i];
|
||||||
|
}
|
||||||
|
offset = shape3.size() - shape2.size();
|
||||||
|
for (size_t i = 0; i < shape2.size(); i++) {
|
||||||
|
rhs_shape_[i + offset] = shape2[i];
|
||||||
|
input2_num_ *= shape2[i];
|
||||||
|
}
|
||||||
|
|
||||||
InitSizeLists();
|
InitSizeLists();
|
||||||
return true;
|
return true;
|
||||||
|
@ -105,6 +110,9 @@ class BroadcastOpGpuKernel : public GpuKernel {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool IsBroadcast(const std::vector<size_t> &lhs, const std::vector<size_t> &rhs) {
|
bool IsBroadcast(const std::vector<size_t> &lhs, const std::vector<size_t> &rhs) {
|
||||||
|
if (lhs.size() != rhs.size()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
for (size_t i = 0; i < lhs.size(); i++) {
|
for (size_t i = 0; i < lhs.size(); i++) {
|
||||||
if (lhs[i] != rhs[i]) {
|
if (lhs[i] != rhs[i]) {
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -79,3 +79,33 @@ def test_broadcast():
|
||||||
output_ms = P.Pow()(Tensor(x1_np), Tensor(x2_np))
|
output_ms = P.Pow()(Tensor(x1_np), Tensor(x2_np))
|
||||||
output_np = np.power(x1_np, x2_np)
|
output_np = np.power(x1_np, x2_np)
|
||||||
assert np.allclose(output_ms.asnumpy(), output_np)
|
assert np.allclose(output_ms.asnumpy(), output_np)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.level0
|
||||||
|
@pytest.mark.platform_x86_gpu_training
|
||||||
|
@pytest.mark.env_onecard
|
||||||
|
def test_broadcast_diff_dims():
|
||||||
|
context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
|
||||||
|
|
||||||
|
x1_np = np.random.rand(2).astype(np.float32)
|
||||||
|
x2_np = np.random.rand(2, 1).astype(np.float32)
|
||||||
|
|
||||||
|
output_ms = P.Minimum()(Tensor(x1_np), Tensor(x2_np))
|
||||||
|
output_np = np.minimum(x1_np, x2_np)
|
||||||
|
assert np.allclose(output_ms.asnumpy(), output_np)
|
||||||
|
|
||||||
|
output_ms = P.Maximum()(Tensor(x1_np), Tensor(x2_np))
|
||||||
|
output_np = np.maximum(x1_np, x2_np)
|
||||||
|
assert np.allclose(output_ms.asnumpy(), output_np)
|
||||||
|
|
||||||
|
output_ms = P.Greater()(Tensor(x1_np), Tensor(x2_np))
|
||||||
|
output_np = x1_np > x2_np
|
||||||
|
assert np.allclose(output_ms.asnumpy(), output_np)
|
||||||
|
|
||||||
|
output_ms = P.Less()(Tensor(x1_np), Tensor(x2_np))
|
||||||
|
output_np = x1_np < x2_np
|
||||||
|
assert np.allclose(output_ms.asnumpy(), output_np)
|
||||||
|
|
||||||
|
output_ms = P.Pow()(Tensor(x1_np), Tensor(x2_np))
|
||||||
|
output_np = np.power(x1_np, x2_np)
|
||||||
|
assert np.allclose(output_ms.asnumpy(), output_np)
|
||||||
|
|
Loading…
Reference in New Issue