forked from mindspore-Ecosystem/mindspore

update input for ScatterUpdate/Add, fix doc text and fix dynamic shape

parent 0856639fc5
commit 27a602f067
@ -40,10 +40,10 @@ class ScatterAddKernel : public GpuKernel {
     int *indices = GetDeviceAddress<int>(inputs, 1);
     T *updates = GetDeviceAddress<T>(inputs, 2);
     T *output = GetDeviceAddress<T>(outputs, 0);
+    CalScatterAdd(inner_size_, indices_size_, indices, updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
     CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemcpyAsync output failed");
-    CalScatterAdd(inner_size_, indices_size_, indices, updates, output, reinterpret_cast<cudaStream_t>(stream_ptr));
     return true;
   }
@ -40,10 +40,10 @@ class ScatterUpdateKernel : public GpuKernel {
     int *indices = GetDeviceAddress<int>(inputs, 1);
     T *updates = GetDeviceAddress<T>(inputs, 2);
     T *output = GetDeviceAddress<T>(outputs, 0);
+    CalScatterUpdate(inner_size_, indices_size_, indices, updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
     CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cudaMemcpyAsync output failed");
-    CalScatterUpdate(inner_size_, indices_size_, indices, updates, output, reinterpret_cast<cudaStream_t>(stream_ptr));
     return true;
   }
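Note: in both kernels above, the new launch order runs the scatter on the `input` buffer first and then copies `input` to `output`, instead of copying first and scattering into `output`. Since `input` is the device memory backing the first operand (the Parameter), the side effect of ScatterAdd/ScatterUpdate now lands in the parameter itself, which is what the in-place semantics of these ops and the new `*_input_updated` tests below require. A rough host-side numpy model of the reordered launch (a sketch; `launch_scatter_add` and its buffers are illustrative, not kernel code):

import numpy as np

def launch_scatter_add(input_buf, indices, updates):
    # Scatter into the input (parameter) buffer first ...
    for i, idx in enumerate(indices.reshape(-1)):
        input_buf[idx] += updates.reshape(indices.size, -1)[i]  # CalScatterAdd on `input`
    # ... then copy input -> output (the cudaMemcpyAsync step).
    return input_buf.copy()

x = np.zeros((2, 3), dtype=np.float32)
out = launch_scatter_add(x, np.array([0, 1]), np.arange(6, dtype=np.float32).reshape(2, 3))
assert np.array_equal(x, out)  # the parameter buffer now holds the result too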
@ -19,26 +19,26 @@
 
 template <typename T>
 __global__ void ScatterAdd(const int inner_size, const int updates_size, const int *indices, const T *updates,
-                           T *output) {
+                           T *input) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
     const size_t index = pos / inner_size;
     const size_t offset = pos % inner_size;
     const size_t current_pos = indices[index] * inner_size + offset;
-    MsAtomicAdd(&output[current_pos], updates[pos]);
+    MsAtomicAdd(&input[current_pos], updates[pos]);
   }
 }
 
 template <typename T>
-void CalScatterAdd(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *output,
+void CalScatterAdd(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *input,
                    cudaStream_t cuda_stream) {
   const int updates_size = inner_size * indices_size;
   ScatterAdd<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size, indices, updates,
-                                                                        output);
+                                                                        input);
 }
 
 template void CalScatterAdd<float>(const int &inner_size, const int &indices_size, const int *indices,
-                                   const float *updates, float *output, cudaStream_t cuda_stream);
+                                   const float *updates, float *input, cudaStream_t cuda_stream);
 template void CalScatterAdd<half>(const int &inner_size, const int &indices_size, const int *indices,
-                                  const half *updates, half *output, cudaStream_t cuda_stream);
+                                  const half *updates, half *input, cudaStream_t cuda_stream);
 template void CalScatterAdd<int>(const int &inner_size, const int &indices_size, const int *indices, const int *updates,
-                                 int *output, cudaStream_t cuda_stream);
+                                 int *input, cudaStream_t cuda_stream);
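In the kernel above, each thread owns one element of `updates`: `pos / inner_size` selects the entry of `indices`, `pos % inner_size` is the offset inside one slice, and the destination is `indices[index] * inner_size + offset`; `MsAtomicAdd` makes duplicate indices accumulate instead of racing. A numpy transcription of that flattened index math (a sketch under those definitions, not the kernel itself):

import numpy as np

def scatter_add_flat(input_flat, inner_size, indices, updates_flat):
    for pos in range(updates_flat.size):               # one GPU thread per pos
        index = pos // inner_size                      # which entry of `indices`
        offset = pos % inner_size                      # position within the slice
        current_pos = indices[index] * inner_size + offset
        input_flat[current_pos] += updates_flat[pos]   # the MsAtomicAdd
    return input_flat

x = np.zeros(6, dtype=np.float32)                      # a (2, 3) buffer, inner_size = 3
scatter_add_flat(x, 3, np.array([0, 1, 0]), np.arange(9, dtype=np.float32))
print(x.reshape(2, 3))   # [[6. 8. 10.] [3. 4. 5.]]: slices 0 and 2 of updates both hit row 0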
@ -20,7 +20,7 @@
 #include "runtime/device/gpu/cuda_common.h"
 
 template <typename T>
-void CalScatterAdd(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *output,
+void CalScatterAdd(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *input,
                    cudaStream_t cuda_stream);
 
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ADD_IMPL_CUH_
@ -18,31 +18,31 @@
 
 template <typename T>
 __global__ void ScatterUpdate(const int inner_size, const int updates_size, const int *indices, const T *updates,
-                              T *output) {
+                              T *input) {
   for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
     const int index = pos / inner_size;
     const int offset = pos % inner_size;
     const int current_pos = indices[index] * inner_size + offset;
-    output[current_pos] = updates[pos];
+    input[current_pos] = updates[pos];
   }
 }
 
 template <typename T>
-void CalScatterUpdate(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *output,
+void CalScatterUpdate(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *input,
                       cudaStream_t cuda_stream) {
   const int updates_size = inner_size * indices_size;
   ScatterUpdate<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size, indices, updates,
-                                                                           output);
+                                                                           input);
 }
 
 template void CalScatterUpdate<float>(const int &inner_size, const int &indices_size, const int *indices,
-                                      const float *updates, float *output, cudaStream_t cuda_stream);
+                                      const float *updates, float *input, cudaStream_t cuda_stream);
 template void CalScatterUpdate<half>(const int &inner_size, const int &indices_size, const int *indices,
-                                     const half *updates, half *output, cudaStream_t cuda_stream);
+                                     const half *updates, half *input, cudaStream_t cuda_stream);
 template void CalScatterUpdate<int>(const int &inner_size, const int &indices_size, const int *indices,
-                                    const int *updates, int *output, cudaStream_t cuda_stream);
+                                    const int *updates, int *input, cudaStream_t cuda_stream);
 template void CalScatterUpdate<unsigned char>(const int &inner_size, const int &indices_size, const int *indices,
-                                              const unsigned char *updates, unsigned char *output,
+                                              const unsigned char *updates, unsigned char *input,
                                               cudaStream_t cuda_stream);
 template void CalScatterUpdate<int8_t>(const int &inner_size, const int &indices_size, const int *indices,
-                                       const int8_t *updates, int8_t *output, cudaStream_t cuda_stream);
+                                       const int8_t *updates, int8_t *input, cudaStream_t cuda_stream);
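ScatterUpdate differs from ScatterAdd in one important way: it issues plain stores rather than atomic adds, so if `indices` contains duplicates, whichever thread writes last wins, and GPU thread order is not defined. A sequential Python model of the semantics (deterministic only because the loop is serial):

import numpy as np

def scatter_update(x, indices, updates):
    # Serial model of the ScatterUpdate kernel: plain stores, no atomics.
    for i, idx in enumerate(indices.reshape(-1)):
        x[idx] = updates.reshape(indices.size, -1)[i]
    return x

x = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], dtype=np.float32)
scatter_update(x, np.array([1]), np.array([[34.0, 35.0, 36.0]], dtype=np.float32))
print(x)   # [[0. 1. 2.] [34. 35. 36.]]: only row 1 is replaced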
@ -20,7 +20,7 @@
 #include "runtime/device/gpu/cuda_common.h"
 
 template <typename T>
-void CalScatterUpdate(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *output,
+void CalScatterUpdate(const int &inner_size, const int &indices_size, const int *indices, const T *updates, T *input,
                       cudaStream_t cuda_stream);
 
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_UPDATE_IMPL_CUH_
@ -73,13 +73,23 @@ class _ScatterOp_Dynamic(PrimitiveWithCheck):
     """
     Defines Scatter operators with dynamic shape
     """
     __mindspore_signature__ = (
         sig.make_sig('x', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
         sig.make_sig('indices', dtype=sig.sig_dtype.T1),
         sig.make_sig('updates', dtype=sig.sig_dtype.T)
     )
 
     def _check_scatter_shape(self, x_shape, indices_shape, updates_shape, prim_name):
-        if np.all(np.array(x_shape) != -1):
-            if indices_shape != [-1] and updates_shape and updates_shape != indices_shape + x_shape[1:]:
-                raise ValueError(f"For '{prim_name}', "
-                                 f"updates_shape = indices_shape + x_shape[1:], but got x_shape: {x_shape}, "
-                                 f"indices_shape: {indices_shape}, updates_shape: {updates_shape}.")
+        # x_shape cannot be dynamic
+        if np.any(np.array(x_shape) == -1):
+            raise ValueError(f"x does not support dynamic shape")
+        # support indices and updates dynamic
+        if np.any(np.array(indices_shape) == -1) or np.any(np.array(updates_shape) == -1):
+            pass
+        elif indices_shape != [-1] and updates_shape and updates_shape != indices_shape + x_shape[1:]:
+            raise ValueError(f"For '{prim_name}', "
+                             f"updates_shape = indices_shape + x_shape[1:], but got x_shape: {x_shape}, "
+                             f"indices_shape: {indices_shape}, updates_shape: {updates_shape}.")
 
     @prim_attr_register
     def __init__(self, use_locking=False):
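The reworked check rejects a dynamic `x` outright, lets a dynamic `indices` or `updates` (any dimension equal to -1) through to runtime, and only enforces `updates_shape == indices_shape + x_shape[1:]` once every shape is fully known; under the old `np.all` guard a dynamic `x` was silently accepted. A standalone sketch of the same decision table (`check_scatter_shape` is a hypothetical free function, not the method itself):

import numpy as np

def check_scatter_shape(x_shape, indices_shape, updates_shape):
    # Sketch of the new validation in _ScatterOp_Dynamic._check_scatter_shape.
    if np.any(np.array(x_shape) == -1):
        raise ValueError("x does not support dynamic shape")
    if np.any(np.array(indices_shape) == -1) or np.any(np.array(updates_shape) == -1):
        return  # dynamic indices/updates are resolved at runtime instead
    if indices_shape != [-1] and updates_shape and updates_shape != indices_shape + x_shape[1:]:
        raise ValueError("updates_shape must equal indices_shape + x_shape[1:]")

check_scatter_shape([2, 3], [2, 2], [2, 2, 3])   # static and consistent: passes
check_scatter_shape([2, 3], [-1], [-1, 3])       # dynamic indices/updates: passes
# check_scatter_shape([-1, 3], [2], [2, 3])      # dynamic x: raises ValueError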
@ -3176,7 +3186,7 @@ class ScatterUpdate(_ScatterOp_Dynamic):
         Tensor, has the same shape and type as `input_x`.
 
     Supported Platforms:
-        ``Ascend``
+        ``Ascend`` ``GPU``
 
     Examples:
         >>> np_x = np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]])
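With the GPU kernel in place, the docstring now advertises ``GPU`` alongside ``Ascend``. A usage sketch in the style of this commit's test files (the imports and the small Cell are assumptions modeled on those tests, not the docstring example itself):

import numpy as np
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.ops import operations as P

class ScatterUpdateNet(nn.Cell):
    # Minimal net mirroring the test classes in this commit.
    def __init__(self, inputx):
        super(ScatterUpdateNet, self).__init__()
        self.scatter_update = P.ScatterUpdate()
        self.inputx = Parameter(inputx, name="inputx")

    def construct(self, indices, updates):
        return self.scatter_update(self.inputx, indices, updates)

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
net = ScatterUpdateNet(Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]], np.float32)))
out = net(Tensor(np.array([0, 1], np.int32)), Tensor(np.zeros((2, 3), np.float32)))
# net.inputx is now all zeros too: the parameter is updated in place.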
@ -56,8 +56,9 @@ class TestScatterAddDynamicNet(nn.Cell):
         self.updates = Parameter(updates, name="updates")
 
     def construct(self):
-        out = self.test_dynamic(self.inputx)
-        out = self.scatter_add(out, self.indices, self.updates)
+        indices = self.test_dynamic(self.indices)
+        updates = self.test_dynamic(self.updates)
+        out = self.scatter_add(self.inputx, indices, updates)
         return out
 
 def scatter_add_d_net(inputx, indices, updates):
@ -66,22 +67,24 @@ def scatter_add_d2_net(inputx, indices, updates):
     return net()
 
 class TestScatterAddDynamicNet2(nn.Cell):
-    def __init__(self):
+    def __init__(self, inputx):
         super(TestScatterAddDynamicNet2, self).__init__()
         self.scatter_add = P.ScatterAdd()
         self.test_dynamic = inner.GpuConvertToDynamicShape()
+        self.inputx = Parameter(inputx, name="inputx")
 
-    def construct(self, inputx, indices, updates):
-        out = self.test_dynamic(inputx)
-        out = self.scatter_add(out, indices, updates)
+    def construct(self, indices, updates):
+        indices = self.test_dynamic(indices)
+        updates = self.test_dynamic(updates)
+        out = self.scatter_add(self.inputx, indices, updates)
         return out
 
-def scatter_add_d2_net(inputx_1, indices_1, updates_1, inputx_2,
+def scatter_add_d2_net(inputx, indices_1, updates_1,
                        indices_2, updates_2):
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    net = TestScatterAddDynamicNet2()
-    out1 = net(inputx_1, indices_1, updates_1)
-    out2 = net(inputx_2, indices_2, updates_2)
+    net = TestScatterAddDynamicNet2(inputx)
+    out1 = net(indices_1, updates_1)
+    out2 = net(indices_2, updates_2)
     return (out1, out2)
 
 @pytest.mark.level0
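Because `inputx` is now a Parameter held by the net, calling the same net twice chains the in-place updates: the second call starts from the state the first call left behind, and the rewritten two-input tests below rely on exactly that. A minimal numpy model of the call pattern (class name and structure are illustrative):

import numpy as np

class ScatterAddModel:
    # Stand-in for TestScatterAddDynamicNet2: the "parameter" persists
    # across calls, so each call accumulates into the previous state.
    def __init__(self, inputx):
        self.inputx = inputx

    def __call__(self, indices, updates):
        np.add.at(self.inputx, indices.reshape(-1),
                  updates.reshape(indices.size, -1))
        return self.inputx.copy()

net = ScatterAddModel(np.zeros((2, 3), dtype=np.float32))
out1 = net(np.array([[0, 1], [0, 1]]), np.arange(12, dtype=np.float32).reshape(2, 2, 3))
out2 = net(np.array([0, 1]), np.zeros((2, 3), dtype=np.float32))
assert np.array_equal(out1, out2)  # the second call added zeros to the carried state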
@ -96,6 +99,20 @@ def test_scatter_add_small_float32():
                          [12., 14., 16.]])
     np.testing.assert_array_almost_equal(output.asnumpy(), expected)
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_scatter_add_input_updated():
+    inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
+    indices = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
+    updates = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float32))
+    lock = True
+    net = TestScatterAddNet(lock, inputx, indices, updates)
+    net()
+    expected = np.array([[6., 8., 10.],
+                         [12., 14., 16.]])
+    np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
+
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
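The new test asserts on `net.inputx` rather than on the op's return value: after the kernel reordering, the parameter itself must hold the scattered result. The expected numbers come from the duplicate indices accumulating; a numpy check of the arithmetic:

import numpy as np

inputx = np.zeros((2, 3), dtype=np.float32)
indices = np.array([[0, 1], [0, 1]])
updates = np.arange(12, dtype=np.float32).reshape(2, 2, 3)

# np.add.at accumulates over duplicate indices, like the kernel's atomic adds.
np.add.at(inputx, indices.reshape(-1), updates.reshape(4, 3))
print(inputx)   # [[6. 8. 10.] [12. 14. 16.]], matching `expected`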
@ -274,39 +291,16 @@ def test_scatter_add_input_less_than_1_dynamic_float32():
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_scatter_add_dynamic_two_inputs():
-    inputx_1 = Tensor(np.zeros((2, 3)).astype(np.float32))
+    inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
     indices_1 = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
     updates_1 = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float32))
-    inputx_2 = Tensor(np.ones((4, 2, 3, 4)).astype(np.float32))
-    indices_2 = Tensor(np.array([[0, 2], [3, 1]]).astype(np.int32))
-    updates_2 = Tensor(np.arange(96).reshape((2, 2, 2, 3, 4)).astype(np.float32))
-    output_1, output_2 = scatter_add_d2_net(inputx_1, indices_1, updates_1,
-                                            inputx_2, indices_2, updates_2)
+    indices_2 = Tensor(np.array([[0, 0], [1, 1], [1, 0]]).astype(np.int32))
+    updates_2 = Tensor(np.flip(np.arange(18).reshape((3, 2, 3)).astype(np.float32)))
+    output_1, output_2 = scatter_add_d2_net(inputx, indices_1, updates_1,
+                                            indices_2, updates_2)
     expected_1 = np.array([[6., 8., 10.],
                            [12., 14., 16.]])
-    expected_2 = np.array([[[[1., 2., 3., 4.],
-                             [5., 6., 7., 8.],
-                             [9., 10., 11., 12.]],
-                            [[13., 14., 15., 16.],
-                             [17., 18., 19., 20.],
-                             [21., 22., 23., 24.]]],
-                           [[[73., 74., 75., 76.],
-                             [77., 78., 79., 80.],
-                             [81., 82., 83., 84.]],
-                            [[85., 86., 87., 88.],
-                             [89., 90., 91., 92.],
-                             [93., 94., 95., 96.]]],
-                           [[[25., 26., 27., 28.],
-                             [29., 30., 31., 32.],
-                             [33., 34., 35., 36.]],
-                            [[37., 38., 39., 40.],
-                             [41., 42., 43., 44.],
-                             [45., 46., 47., 48.]]],
-                           [[[49., 50., 51., 52.],
-                             [53., 54., 55., 56.],
-                             [57., 58., 59., 60.]],
-                            [[61., 62., 63., 64.],
-                             [65., 66., 67., 68.],
-                             [69., 70., 71., 72.]]]])
+    expected_2 = np.array([[39., 38., 37.],
+                           [36., 35., 34.]])
     np.testing.assert_array_almost_equal(output_1.asnumpy(), expected_1)
     np.testing.assert_array_almost_equal(output_2.asnumpy(), expected_2)
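The rewritten expectation is worth spelling out: both calls now scatter into the same parameter, so `expected_2` includes the carry-over from the first call rather than starting from zeros. Reproducing both results with numpy:

import numpy as np

state = np.zeros((2, 3), dtype=np.float32)

# Call 1: indices [[0, 1], [0, 1]] with updates arange(12).
np.add.at(state, np.array([[0, 1], [0, 1]]).reshape(-1),
          np.arange(12, dtype=np.float32).reshape(4, 3))
# state is now [[6, 8, 10], [12, 14, 16]] == expected_1

# Call 2 starts from that state, not from zeros.
updates_2 = np.flip(np.arange(18, dtype=np.float32).reshape(3, 2, 3))
np.add.at(state, np.array([[0, 0], [1, 1], [1, 0]]).reshape(-1),
          updates_2.reshape(6, 3))
print(state)   # [[39. 38. 37.] [36. 35. 34.]] == expected_2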
@ -50,8 +50,9 @@ class TestScatterUpdateDynamicNet(nn.Cell):
         self.updates = Parameter(updates, name="updates")
 
     def construct(self):
-        out = self.test_dynamic(self.inputx)
-        out = self.scatter_update(out, self.indices, self.updates)
+        indices = self.test_dynamic(self.indices)
+        updates = self.test_dynamic(self.updates)
+        out = self.scatter_update(self.inputx, indices, updates)
         return out
 
 def scatter_update_d_net(inputx, indices, updates):
@ -60,22 +61,24 @@ def scatter_update_d_net(inputx, indices, updates):
     return net()
 
 class TestScatterUpdateDynamicNet2(nn.Cell):
-    def __init__(self):
+    def __init__(self, inputx):
         super(TestScatterUpdateDynamicNet2, self).__init__()
         self.scatter_update = P.ScatterUpdate()
         self.test_dynamic = inner.GpuConvertToDynamicShape()
+        self.inputx = Parameter(inputx, name="inputx")
 
-    def construct(self, inputx, indices, updates):
-        out = self.test_dynamic(inputx)
-        out = self.scatter_update(out, indices, updates)
+    def construct(self, indices, updates):
+        indices = self.test_dynamic(indices)
+        updates = self.test_dynamic(updates)
+        out = self.scatter_update(self.inputx, indices, updates)
         return out
 
-def scatter_update_d2_net(inputx_1, indices_1, updates_1, inputx_2,
+def scatter_update_d2_net(inputx, indices_1, updates_1,
                           indices_2, updates_2):
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-    net = TestScatterUpdateDynamicNet2()
-    out1 = net(inputx_1, indices_1, updates_1)
-    out2 = net(inputx_2, indices_2, updates_2)
+    net = TestScatterUpdateDynamicNet2(inputx)
+    out1 = net(indices_1, updates_1)
+    out2 = net(indices_2, updates_2)
     return (out1, out2)
 
 @pytest.mark.level0
@ -90,6 +93,19 @@ def test_scatter_update_small_float32():
                          [3., 4., 5.]])
     np.testing.assert_array_almost_equal(output.asnumpy(), expected)
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_scatter_update_input_updated():
+    inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
+    indices = Tensor(np.array([0, 1]).astype(np.int32))
+    updates = Tensor(np.arange(6).reshape((2, 3)).astype(np.float32))
+    net = TestScatterUpdateNet(inputx, indices, updates)
+    net()
+    expected = np.array([[0., 1., 2.],
+                         [3., 4., 5.]])
+    np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
+
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
@ -328,20 +344,16 @@ def test_scatter_update_disordered_dynamic_int32():
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_scatter_update_two_inputs():
-    inputx_1 = Tensor(np.zeros((2, 3)).astype(np.float32))
+    inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
     indices_1 = Tensor(np.array([0, 1]).astype(np.int32))
     updates_1 = Tensor(np.arange(6).reshape((2, 3)).astype(np.float32))
-    inputx_2 = Tensor(np.array([[0.214141, 0.415151, 0.51516],
-                                [0.876542, 0.451611, 0.55112],
-                                [0.111244, 0.633333, 0.34444]]).astype(np.float32))
-    indices_2 = Tensor(np.array([1, 0, 2]).astype(np.int32))
-    updates_2 = Tensor(np.arange(34, 43).reshape((3, 3)).astype(np.float32))
-    output_1, output_2 = scatter_update_d2_net(inputx_1, indices_1, updates_1,
-                                               inputx_2, indices_2, updates_2)
+    indices_2 = Tensor(np.array([1]).astype(np.int32))
+    updates_2 = Tensor(np.arange(34, 37).reshape((1, 3)).astype(np.float32))
+    output_1, output_2 = scatter_update_d2_net(inputx, indices_1, updates_1,
+                                               indices_2, updates_2)
     expected_1 = np.array([[0., 1., 2.],
-                           [3., 4., 5.]])
-    expected_2 = np.array([[37., 38., 39.],
-                           [34., 35., 36.],
-                           [40., 41., 42.]], dtype=np.float32)
+                           [3., 4., 5.]], dtype=np.float32)
+    expected_2 = np.array([[0., 1., 2.],
+                           [34., 35., 36.]], dtype=np.float32)
     np.testing.assert_array_almost_equal(output_1.asnumpy(), expected_1)
     np.testing.assert_array_almost_equal(output_2.asnumpy(), expected_2)
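As in the add test, the second call operates on the parameter left behind by the first: row 0 keeps [0, 1, 2] from call 1, and updates_2 overwrites only row 1. A numpy check:

import numpy as np

state = np.zeros((2, 3), dtype=np.float32)
state[np.array([0, 1])] = np.arange(6, dtype=np.float32).reshape(2, 3)
# state == [[0, 1, 2], [3, 4, 5]] == expected_1

state[np.array([1])] = np.arange(34, 37, dtype=np.float32).reshape(1, 3)
print(state)   # [[0. 1. 2.] [34. 35. 36.]] == expected_2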