Add ScatterSub op for GPU

xcnick 2021-06-06 22:51:13 +08:00
parent 00dee4e3bd
commit 201f85f636
18 changed files with 1134 additions and 1065 deletions
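ScatterSub applies `updates` to the first input in place: each entry of `indices` selects a row of the input, the matching slice of `updates` is subtracted from it, and duplicate indices accumulate. A minimal NumPy sketch of these semantics, mirroring the small-float32 "sub" case in the new test file below (the helper name is illustrative, not part of this commit):

import numpy as np

def scatter_sub_reference(x, indices, updates):
    # Unbuffered subtraction: duplicate indices subtract repeatedly,
    # matching the atomic accumulation done by the GPU kernel.
    out = x.copy()
    np.subtract.at(out, indices, updates)
    return out

x = np.zeros((2, 3), dtype=np.float32)
indices = np.array([[0, 1], [0, 1]], dtype=np.int32)
updates = np.arange(12).reshape((2, 2, 3)).astype(np.float32)
print(scatter_sub_reference(x, indices, updates))
# [[ -6.  -8. -10.]
#  [-12. -14. -16.]]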

View File

@@ -1,57 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/scatter_add_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
ScatterAddKernel, float)
MS_REG_GPU_KERNEL_ONE(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
ScatterAddKernel, half)
MS_REG_GPU_KERNEL_ONE(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
ScatterAddKernel, int)
MS_REG_GPU_KERNEL_ONE(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
ScatterAddKernel, int8_t)
MS_REG_GPU_KERNEL_ONE(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
ScatterAddKernel, uint8_t)
} // namespace kernel
} // namespace mindspore

View File

@@ -0,0 +1,237 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/scatter_functor_gpu_kernel.h"
namespace mindspore {
namespace kernel {
// ScatterUpdate
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
ScatterFunctorKernel, float, int)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
ScatterFunctorKernel, float, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
ScatterFunctorKernel, half, int)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
ScatterFunctorKernel, half, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
ScatterFunctorKernel, int, int)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
ScatterFunctorKernel, int, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
ScatterFunctorKernel, int8_t, int)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
ScatterFunctorKernel, int8_t, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
ScatterFunctorKernel, uint8_t, int)
MS_REG_GPU_KERNEL_TWO(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
ScatterFunctorKernel, uint8_t, int64_t)
// ScatterAdd
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
ScatterFunctorKernel, float, int)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
ScatterFunctorKernel, float, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
ScatterFunctorKernel, half, int)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
ScatterFunctorKernel, half, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
ScatterFunctorKernel, int, int)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
ScatterFunctorKernel, int, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
ScatterFunctorKernel, int8_t, int)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
ScatterFunctorKernel, int8_t, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
ScatterFunctorKernel, uint8_t, int)
MS_REG_GPU_KERNEL_TWO(ScatterAdd,
KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
ScatterFunctorKernel, uint8_t, int64_t)
// ScatterSub
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
ScatterFunctorKernel, float, int)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
ScatterFunctorKernel, float, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
ScatterFunctorKernel, half, int)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
ScatterFunctorKernel, half, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
ScatterFunctorKernel, int, int)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
ScatterFunctorKernel, int, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
ScatterFunctorKernel, int8_t, int)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
ScatterFunctorKernel, int8_t, int64_t)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
ScatterFunctorKernel, uint8_t, int)
MS_REG_GPU_KERNEL_TWO(ScatterSub,
KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
ScatterFunctorKernel, uint8_t, int64_t)
} // namespace kernel
} // namespace mindspore

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,21 +14,30 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ADD_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ADD_GPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_FUNCTOR_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_FUNCTOR_GPU_KERNEL_H_
#include <vector>
#include <string>
#include <map>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_add_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_functor_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class ScatterAddKernel : public GpuKernel {
static const std::map<std::string, ScatterFunctorType> kScatterFunctorTypeMap = {
{"ScatterUpdate", SCATTER_FUNC_UPDATE},
{"ScatterAdd", SCATTER_FUNC_ADD},
{"ScatterSub", SCATTER_FUNC_SUB},
};
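// One kernel class now serves ScatterUpdate/ScatterAdd/ScatterSub; T is the data type, S the index type.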
template <typename T, typename S>
class ScatterFunctorKernel : public GpuKernel {
public:
ScatterAddKernel() { ResetResource(); }
~ScatterAddKernel() override = default;
ScatterFunctorKernel() { ResetResource(); }
~ScatterFunctorKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -37,10 +46,12 @@ class ScatterAddKernel : public GpuKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input = GetDeviceAddress<T>(inputs, 0);
int *indices = GetDeviceAddress<int>(inputs, 1);
S *indices = GetDeviceAddress<S>(inputs, 1);
T *updates = GetDeviceAddress<T>(inputs, 2);
T *output = GetDeviceAddress<T>(outputs, 0);
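// The functor updates `input` in place; the result is then copied to `output` on the same stream.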
CalScatterAdd(inner_size_, indices_size_, indices, updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
ScatterFunc(scatter_functor_type_, inner_size_, indices_size_, indices, updates, input,
reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
@@ -49,15 +60,22 @@ class ScatterAddKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
auto iter = kScatterFunctorTypeMap.find(kernel_name);
if (iter == kScatterFunctorTypeMap.end()) {
MS_LOG(EXCEPTION) << "Scatter functor " << kernel_name << " is not supported.";
} else {
scatter_functor_type_ = iter->second;
}
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterAdd needs 3 inputs.";
MS_LOG(ERROR) << "Input number is " << input_num << ", but " << kernel_name << " needs 3 inputs.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but ScatterAdd has 1 output.";
MS_LOG(ERROR) << "Output number is " << output_num << ", but " << kernel_name << " has 1 output.";
return false;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
@@ -90,12 +108,13 @@ class ScatterAddKernel : public GpuKernel {
protected:
void InitSizeLists() override {
input_size_list_.push_back(input_size_ * sizeof(T));
input_size_list_.push_back(indices_size_ * sizeof(int));
input_size_list_.push_back(indices_size_ * sizeof(S));
input_size_list_.push_back(updates_size_ * sizeof(T));
output_size_list_.push_back(input_size_ * sizeof(T));
}
private:
ScatterFunctorType scatter_functor_type_;
size_t input_size_;
size_t inner_size_;
size_t indices_size_;
@@ -106,4 +125,4 @@ class ScatterAddKernel : public GpuKernel {
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ADD_GPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_FUNCTOR_GPU_KERNEL_H_

View File

@@ -1,57 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/scatter_update_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
ScatterUpdateKernel, float)
MS_REG_GPU_KERNEL_ONE(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
ScatterUpdateKernel, half)
MS_REG_GPU_KERNEL_ONE(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
ScatterUpdateKernel, int)
MS_REG_GPU_KERNEL_ONE(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeInt8)
.AddOutputAttr(kNumberTypeInt8),
ScatterUpdateKernel, int8_t)
MS_REG_GPU_KERNEL_ONE(ScatterUpdate,
KernelAttr()
.AddInputAttr(kNumberTypeUInt8)
.AddInputAttr(kNumberTypeInt32)
.AddInputAttr(kNumberTypeUInt8)
.AddOutputAttr(kNumberTypeUInt8),
ScatterUpdateKernel, uint8_t)
} // namespace kernel
} // namespace mindspore

View File

@@ -1,109 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_UPDATE_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_UPDATE_GPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_update_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class ScatterUpdateKernel : public GpuKernel {
public:
ScatterUpdateKernel() { ResetResource(); }
~ScatterUpdateKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input = GetDeviceAddress<T>(inputs, 0);
int *indices = GetDeviceAddress<int>(inputs, 1);
T *updates = GetDeviceAddress<T>(inputs, 2);
T *output = GetDeviceAddress<T>(outputs, 0);
CalScatterUpdate(inner_size_, indices_size_, indices, updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync output failed");
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterUpdate needs 3 inputs.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but ScatterUpdate has 1 output.";
return false;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
input_size_ = 1;
inner_size_ = 1;
for (size_t i = 1; i < input_shape.size(); i++) {
inner_size_ *= input_shape[i];
}
input_size_ = input_shape[0] * inner_size_;
auto indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
indices_size_ = 1;
for (size_t i = 0; i < indices_shape.size(); i++) {
indices_size_ *= indices_shape[i];
}
updates_size_ = indices_size_ * inner_size_;
InitSizeLists();
return true;
}
void ResetResource() noexcept override {
input_size_ = 0;
inner_size_ = 0;
indices_size_ = 0;
updates_size_ = 0;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(input_size_ * sizeof(T));
input_size_list_.push_back(indices_size_ * sizeof(int));
input_size_list_.push_back(updates_size_ * sizeof(T));
output_size_list_.push_back(input_size_ * sizeof(T));
}
private:
size_t input_size_;
size_t inner_size_;
size_t indices_size_;
size_t updates_size_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_UPDATE_GPU_KERNEL_H_

View File

@@ -1,49 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_add_impl.cuh"
template <typename T>
__global__ void ScatterAdd(const size_t inner_size, const size_t updates_size, const int *indices, const T *updates,
T *input) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
const size_t index = pos / inner_size;
const size_t offset = pos % inner_size;
const size_t current_pos = indices[index] * inner_size + offset;
MsAtomicAdd(&input[current_pos], updates[pos]);
}
}
template <typename T>
void CalScatterAdd(const size_t &inner_size, const size_t &indices_size, const int *indices, const T *updates, T *input,
cudaStream_t cuda_stream) {
const size_t updates_size = inner_size * indices_size;
ScatterAdd<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size, indices, updates,
input);
}
template void CalScatterAdd<float>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const float *updates, float *input, cudaStream_t cuda_stream);
template void CalScatterAdd<half>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const half *updates, half *input, cudaStream_t cuda_stream);
template void CalScatterAdd<int>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const int *updates, int *input, cudaStream_t cuda_stream);
template void CalScatterAdd<unsigned char>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const unsigned char *updates, unsigned char *input,
cudaStream_t cuda_stream);
template void CalScatterAdd<int8_t>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const int8_t *updates, int8_t *input, cudaStream_t cuda_stream);

View File

@@ -0,0 +1,103 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_functor_impl.cuh"
template <typename T, typename S>
__global__ void ScatterUpdateKernel(const size_t inner_size, const size_t updates_size, const S *indices,
const T *updates, T *input) {
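// Grid-stride loop over the flattened updates buffer: pos / inner_size picks the index entry,
// pos % inner_size the offset within the target row.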
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
const size_t index = pos / inner_size;
const size_t offset = pos % inner_size;
const size_t current_pos = indices[index] * inner_size + offset;
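// Plain store: when indices contains duplicates, the last thread to write wins.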
input[current_pos] = updates[pos];
}
}
template <typename T, typename S>
__global__ void ScatterAddKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates,
T *input) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
const size_t index = pos / inner_size;
const size_t offset = pos % inner_size;
const size_t current_pos = indices[index] * inner_size + offset;
MsAtomicAdd(&input[current_pos], updates[pos]);
}
}
template <typename T, typename S>
__global__ void ScatterSubKernel(const size_t inner_size, const size_t updates_size, const S *indices, const T *updates,
T *input) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
const size_t index = pos / inner_size;
const size_t offset = pos % inner_size;
const size_t current_pos = indices[index] * inner_size + offset;
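// Subtraction is an atomic add of the negated update, so duplicate indices accumulate safely.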
MsAtomicAdd(&input[current_pos], -updates[pos]);
}
}
template <typename T, typename S>
void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, const size_t &indices_size,
const S *indices, const T *updates, T *input, cudaStream_t cuda_stream) {
const size_t updates_size = inner_size * indices_size;
switch (func_type) {
case SCATTER_FUNC_UPDATE:
return ScatterUpdateKernel<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size,
indices, updates, input);
case SCATTER_FUNC_ADD:
return ScatterAddKernel<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size,
indices, updates, input);
case SCATTER_FUNC_SUB:
return ScatterSubKernel<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size,
indices, updates, input);
default:
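// Unreachable for registered kernels: ScatterFunctorKernel::Init rejects unsupported names.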
break;
}
}
template void ScatterFunc<float, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int *indices, const float *updates,
float *input, cudaStream_t cuda_stream);
template void ScatterFunc<float, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int64_t *indices, const float *updates,
float *input, cudaStream_t cuda_stream);
template void ScatterFunc<half, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int *indices, const half *updates, half *input,
cudaStream_t cuda_stream);
template void ScatterFunc<half, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int64_t *indices, const half *updates,
half *input, cudaStream_t cuda_stream);
template void ScatterFunc<int, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int *indices, const int *updates, int *input,
cudaStream_t cuda_stream);
template void ScatterFunc<int, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int64_t *indices, const int *updates,
int *input, cudaStream_t cuda_stream);
template void ScatterFunc<unsigned char, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int *indices,
const unsigned char *updates, unsigned char *input,
cudaStream_t cuda_stream);
template void ScatterFunc<unsigned char, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int64_t *indices,
const unsigned char *updates, unsigned char *input,
cudaStream_t cuda_stream);
template void ScatterFunc<int8_t, int>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int *indices, const int8_t *updates,
int8_t *input, cudaStream_t cuda_stream);
template void ScatterFunc<int8_t, int64_t>(enum ScatterFunctorType func_type, const size_t &inner_size,
const size_t &indices_size, const int64_t *indices, const int8_t *updates,
int8_t *input, cudaStream_t cuda_stream);

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,20 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ADD_IMPL_CUH_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ADD_IMPL_CUH_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void CalScatterAdd(const size_t &inner_size, const size_t &indices_size, const int *indices, const T *updates, T *input,
cudaStream_t cuda_stream);
enum ScatterFunctorType {
SCATTER_FUNC_UPDATE = 0,
SCATTER_FUNC_ADD,
SCATTER_FUNC_SUB,
SCATTER_FUNC_INVALID_TYPE = 255
};
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ADD_IMPL_CUH_
template <typename T, typename S>
void ScatterFunc(enum ScatterFunctorType func_type, const size_t &inner_size, const size_t &indices_size,
const S *indices, const T *updates, T *input, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_FUNCTOR_IMPL_CUH_

View File

@@ -1,48 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_update_impl.cuh"
template <typename T>
__global__ void ScatterUpdate(const size_t inner_size, const size_t updates_size, const int *indices, const T *updates,
T *input) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < updates_size; pos += blockDim.x * gridDim.x) {
const size_t index = pos / inner_size;
const size_t offset = pos % inner_size;
const size_t current_pos = indices[index] * inner_size + offset;
input[current_pos] = updates[pos];
}
}
template <typename T>
void CalScatterUpdate(const size_t &inner_size, const size_t &indices_size, const int *indices, const T *updates,
T *input, cudaStream_t cuda_stream) {
const size_t updates_size = inner_size * indices_size;
ScatterUpdate<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(inner_size, updates_size, indices, updates,
input);
}
template void CalScatterUpdate<float>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const float *updates, float *input, cudaStream_t cuda_stream);
template void CalScatterUpdate<half>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const half *updates, half *input, cudaStream_t cuda_stream);
template void CalScatterUpdate<int>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const int *updates, int *input, cudaStream_t cuda_stream);
template void CalScatterUpdate<unsigned char>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const unsigned char *updates, unsigned char *input,
cudaStream_t cuda_stream);
template void CalScatterUpdate<int8_t>(const size_t &inner_size, const size_t &indices_size, const int *indices,
const int8_t *updates, int8_t *input, cudaStream_t cuda_stream);

View File

@@ -1,26 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_UPDATE_IMPL_CUH_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_UPDATE_IMPL_CUH_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void CalScatterUpdate(const size_t &inner_size, const size_t &indices_size, const int *indices, const T *updates,
T *input, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_UPDATE_IMPL_CUH_

View File

@@ -185,6 +185,8 @@ AbstractBasePtr InferImplDynamicStitch(const AnalysisEnginePtr &, const Primitiv
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplScatterAdd(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplScatterSub(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplScatterUpdate(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list);
AbstractBasePtr InferImplDiv(const AnalysisEnginePtr &, const PrimitivePtr &primitive,

View File

@@ -391,6 +391,21 @@ AbstractBasePtr InferImplScatterAdd(const AnalysisEnginePtr &, const PrimitivePt
return std::make_shared<AbstractTensor>(x->element(), std::make_shared<Shape>(shape, min_shape, max_shape));
}
AbstractBasePtr InferImplScatterSub(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
constexpr auto kScatterSubInputNum = 3;
const std::string op_name = primitive->name();
CheckRequiredArgsSize(op_name, args_spec_list, kScatterSubInputNum);
auto x = CheckArg<AbstractTensor>(op_name, args_spec_list, 0);
MS_EXCEPTION_IF_NULL(x);
MS_EXCEPTION_IF_NULL(x->shape());
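// ScatterSub is shape-preserving: the output abstract carries the input's element type and shape.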
ShapeVector shape = x->shape()->shape();
ShapeVector min_shape = x->shape()->min_shape();
ShapeVector max_shape = x->shape()->max_shape();
CheckMinMaxShape(shape, &min_shape, &max_shape);
return std::make_shared<AbstractTensor>(x->element(), std::make_shared<Shape>(shape, min_shape, max_shape));
}
AbstractBasePtr InferImplScatterUpdate(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
const std::string op_name = primitive->name();

View File

@@ -92,6 +92,7 @@ PrimitiveEvalImplMap &GetPrimitiveToEvalImplMap() {
{prim::kPrimUnsortedSegmentMax, {InferImplUnsortedSegmentMax, nullptr, true}},
{prim::kPrimUnsortedSegmentMin, {InferImplUnsortedSegmentMin, nullptr, true}},
{prim::kPrimScatterAdd, {InferImplScatterAdd, nullptr, true}},
{prim::kPrimScatterSub, {InferImplScatterSub, nullptr, true}},
{prim::kPrimSubAndFilter, {InferImplSubAndFilter, nullptr, true}},
{prim::kPrimScatterUpdate, {InferImplScatterUpdate, nullptr, true}},
{prim::kPrimMapCacheIdx, {InferImplMapCacheIdx, nullptr, true}},

View File

@@ -214,6 +214,7 @@ inline const PrimitivePtr kPrimDynamicRNNGrad = std::make_shared<Primitive>("Dyn
inline const PrimitivePtr kPrimDynamicGRUV2 = std::make_shared<Primitive>("DynamicGRUV2");
inline const PrimitivePtr kPrimDynamicGRUV2Grad = std::make_shared<Primitive>("DynamicGRUV2Grad");
inline const PrimitivePtr kPrimScatterAdd = std::make_shared<Primitive>("ScatterAdd");
inline const PrimitivePtr kPrimScatterSub = std::make_shared<Primitive>("ScatterSub");
inline const PrimitivePtr kPrimScatterUpdate = std::make_shared<Primitive>("ScatterUpdate");
inline const PrimitivePtr kPrimTensorCopySlices = std::make_shared<Primitive>("TensorCopySlices");
inline const PrimitivePtr kPrimMapUniform = std::make_shared<Primitive>("MapUniform");

View File

@@ -4129,7 +4129,7 @@ class ScatterAdd(_ScatterOpDynamic):
self.add_prim_attr('side_effect_mem', True)
class ScatterSub(_ScatterOp):
class ScatterSub(_ScatterOpDynamic):
r"""
Updates the value of the input tensor through the subtraction operation.
@@ -4230,6 +4230,13 @@ class ScatterSub(_ScatterOp):
[-12. -12. -12.]]
"""
@prim_attr_register
def __init__(self, use_locking=False):
"""Initialize ScatterSub"""
validator.check_value_type('use_locking', use_locking, [bool], self.name)
self.init_prim_io_names(inputs=['x', 'indices', 'updates'], outputs=['y'])
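# side_effect_mem keeps the in-place parameter update from being optimized away or reordered.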
self.add_prim_attr('side_effect_mem', True)
class ScatterMul(_ScatterOp):
r"""

View File

@@ -1,338 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.ops import operations as P
from mindspore.ops.operations import _inner_ops as inner
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
# all cases tested against dchip
class TestScatterAddNet(nn.Cell):
def __init__(self, lock, inputx, indices, updates):
super(TestScatterAddNet, self).__init__()
self.scatter_add = P.ScatterAdd(use_locking=lock)
self.inputx = Parameter(inputx, name="inputx")
self.indices = Parameter(indices, name="indices")
self.updates = Parameter(updates, name="updates")
def construct(self):
out = self.scatter_add(self.inputx, self.indices, self.updates)
return out
def scatter_add_net(inputx, indices, updates):
lock = True
net = TestScatterAddNet(lock, inputx, indices, updates)
return net()
def scatter_add_use_locking_false_net(inputx, indices, updates):
lock = False
net = TestScatterAddNet(lock, inputx, indices, updates)
return net()
class TestScatterAddDynamicNet(nn.Cell):
def __init__(self, inputx, indices, updates):
super(TestScatterAddDynamicNet, self).__init__()
self.scatter_add = P.ScatterAdd()
self.test_dynamic = inner.GpuConvertToDynamicShape()
self.inputx = Parameter(inputx, name="inputx")
self.indices = Parameter(indices, name="indices")
self.updates = Parameter(updates, name="updates")
def construct(self):
indices = self.test_dynamic(self.indices)
updates = self.test_dynamic(self.updates)
out = self.scatter_add(self.inputx, indices, updates)
return out
def scatter_add_d_net(inputx, indices, updates):
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
net = TestScatterAddDynamicNet(inputx, indices, updates)
return net()
class TestScatterAddDynamicNet2(nn.Cell):
def __init__(self, inputx):
super(TestScatterAddDynamicNet2, self).__init__()
self.scatter_add = P.ScatterAdd()
self.test_dynamic = inner.GpuConvertToDynamicShape()
self.inputx = Parameter(inputx, name="inputx")
def construct(self, indices, updates):
indices = self.test_dynamic(indices)
updates = self.test_dynamic(updates)
out = self.scatter_add(self.inputx, indices, updates)
return out
def scatter_add_d2_net(inputx, indices_1, updates_1,
indices_2, updates_2):
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
net = TestScatterAddDynamicNet2(inputx)
out1 = net(indices_1, updates_1)
out2 = net(indices_2, updates_2)
return (out1, out2)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_small_float32():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
updates = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float32))
output = scatter_add_net(inputx, indices, updates)
expected = np.array([[6., 8., 10.],
[12., 14., 16.]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_input_updated():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
updates = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float32))
lock = True
net = TestScatterAddNet(lock, inputx, indices, updates)
net()
expected = np.array([[6., 8., 10.],
[12., 14., 16.]])
np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_large_shape_float32():
inputx = Tensor(np.ones((4, 2, 3, 4)).astype(np.float32))
indices = Tensor(np.array([[0, 2], [3, 1]]).astype(np.int32))
updates = Tensor(np.arange(96).reshape((2, 2, 2, 3, 4)).astype(np.float32))
output = scatter_add_net(inputx, indices, updates)
expected = np.array([[[[1., 2., 3., 4.],
[5., 6., 7., 8.],
[9., 10., 11., 12.]],
[[13., 14., 15., 16.],
[17., 18., 19., 20.],
[21., 22., 23., 24.]]],
[[[73., 74., 75., 76.],
[77., 78., 79., 80.],
[81., 82., 83., 84.]],
[[85., 86., 87., 88.],
[89., 90., 91., 92.],
[93., 94., 95., 96.]]],
[[[25., 26., 27., 28.],
[29., 30., 31., 32.],
[33., 34., 35., 36.]],
[[37., 38., 39., 40.],
[41., 42., 43., 44.],
[45., 46., 47., 48.]]],
[[[49., 50., 51., 52.],
[53., 54., 55., 56.],
[57., 58., 59., 60.]],
[[61., 62., 63., 64.],
[65., 66., 67., 68.],
[69., 70., 71., 72.]]]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_small_float32_use_locking_false():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices = Tensor(np.array([1, 0]).astype(np.int32))
updates = Tensor(np.arange(6).reshape((2, 3)).astype(np.float32))
output = scatter_add_use_locking_false_net(inputx, indices, updates)
expected = np.array([[3., 4., 5.],
[0., 1., 2.]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_input_less_than_1_float32():
inputx = Tensor(np.array([[0.214141, 0.415151, 0.51516],
[0.876542, 0.451611, 0.55112],
[0.111244, 0.633333, 0.34444]]).astype(np.float32))
indices = Tensor(np.array([[[1, 0, 2],
[2, 2, 0]],
[[1, 0, 1],
[2, 1, 2]]]).astype(np.int32))
updates = Tensor(np.arange(34, 70).reshape((2, 2, 3, 3)).astype(np.float32))
output = scatter_add_net(inputx, indices, updates)
expected = np.array([[141.21414, 144.41515, 147.51517],
[208.87654, 212.45161, 216.55112],
[257.11124, 262.63333, 267.34442]], dtype=np.float32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_float16():
inputx = Tensor(np.zeros((2, 3)).astype(np.float16))
indices = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
updates = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float16))
output = scatter_add_net(inputx, indices, updates)
expected = np.array([[6., 8., 10.],
[12., 14., 16.]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_large_float16():
inputx = Tensor(np.zeros((2, 3, 4)).astype(np.float16))
indices = Tensor(np.array([[0, 0], [1, 1]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.float16))
output = scatter_add_net(inputx, indices, updates)
expected = np.array([[[138., 140., 142., 144.],
[146., 148., 150., 152.],
[154., 156., 158., 160.]],
[[186., 188., 190., 192.],
[194., 196., 198., 200.],
[202., 204., 206., 208.]]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_disordered_float16():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.float16)))
indices = Tensor(np.array([[[0, 1, 2],
[2, 1, 0]],
[[0, 0, 0],
[2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.float16))
output = scatter_add_net(inputx, indices, updates)
expected = np.array([[464., 468., 472., 476.],
[187., 188., 189., 190.],
[492., 496., 500., 504.]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_large_int32():
inputx = Tensor(np.zeros((2, 3, 4)).astype(np.int32))
indices = Tensor(np.array([[0, 0], [1, 1]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.int32))
output = scatter_add_net(inputx, indices, updates)
expected = np.array([[[138., 140., 142., 144.],
[146., 148., 150., 152.],
[154., 156., 158., 160.]],
[[186., 188., 190., 192.],
[194., 196., 198., 200.],
[202., 204., 206., 208.]]]).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_disordered_int32():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int32)))
indices = Tensor(np.array([[[0, 1, 2],
[2, 1, 0]],
[[0, 0, 0],
[2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.int32))
output = scatter_add_net(inputx, indices, updates)
expected = np.array([[464., 468., 472., 476.],
[187., 188., 189., 190.],
[492., 496., 500., 504.]]).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_disordered_dynamic_int32():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int32)))
indices = Tensor(np.array([[[0, 1, 2],
[2, 1, 0]],
[[0, 0, 0],
[2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.int32))
output = scatter_add_d_net(inputx, indices, updates)
expected = np.array([[464., 468., 472., 476.],
[187., 188., 189., 190.],
[492., 496., 500., 504.]]).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_disordered_dynamic_int8():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int8)))
indices = Tensor(np.array([[[0, 1, 2],
[2, 1, 0]],
[[0, 0, 0],
[2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.int8))
output = scatter_add_d_net(inputx, indices, updates)
expected = np.array([[464., 468., 472., 476.],
[187., 188., 189., 190.],
[492., 496., 500., 504.]]).astype(np.int8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_disordered_dynamic_uint8():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.uint8)))
indices = Tensor(np.array([[[0, 1, 2],
[2, 1, 0]],
[[0, 0, 0],
[2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.uint8))
output = scatter_add_d_net(inputx, indices, updates)
expected = np.array([[464., 468., 472., 476.],
[187., 188., 189., 190.],
[492., 496., 500., 504.]]).astype(np.uint8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_input_less_than_1_dynamic_float32():
inputx = Tensor(np.array([[0.214141, 0.415151, 0.51516],
[0.876542, 0.451611, 0.55112],
[0.111244, 0.633333, 0.34444]]).astype(np.float32))
indices = Tensor(np.array([[[1, 0, 2],
[2, 2, 0]],
[[1, 0, 1],
[2, 1, 2]]]).astype(np.int32))
updates = Tensor(np.arange(34, 70).reshape((2, 2, 3, 3)).astype(np.float32))
output = scatter_add_d_net(inputx, indices, updates)
expected = np.array([[141.21414, 144.41515, 147.51517],
[208.87654, 212.45161, 216.55112],
[257.11124, 262.63333, 267.34442]], dtype=np.float32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_add_dynamic_two_inputs():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices_1 = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
updates_1 = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float32))
indices_2 = Tensor(np.array([[0, 0], [1, 1], [1, 0]]).astype(np.int32))
updates_2 = Tensor(np.flip(np.arange(18).reshape((3, 2, 3)).astype(np.float32)))
output_1, output_2 = scatter_add_d2_net(inputx, indices_1, updates_1,
indices_2, updates_2)
expected_1 = np.array([[6., 8., 10.],
[12., 14., 16.]])
expected_2 = np.array([[39., 38., 37.],
[36., 35., 34.]])
np.testing.assert_array_almost_equal(output_1.asnumpy(), expected_1)
np.testing.assert_array_almost_equal(output_2.asnumpy(), expected_2)

View File

@@ -0,0 +1,720 @@
# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.ops import operations as P
from mindspore.ops.operations import _inner_ops as inner
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
# all cases tested against dchip
func_map = {
"update": P.ScatterUpdate,
"add": P.ScatterAdd,
"sub": P.ScatterSub,
}
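# Each test below runs the same scenario against update/add/sub through this dispatch table.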
class TestScatterFuncNet(nn.Cell):
def __init__(self, func, lock, inputx, indices, updates):
super(TestScatterFuncNet, self).__init__()
self.scatter_func = func_map[func](use_locking=lock)
self.inputx = Parameter(inputx, name="inputx")
self.indices = Parameter(indices, name="indices")
self.updates = Parameter(updates, name="updates")
def construct(self):
out = self.scatter_func(self.inputx, self.indices, self.updates)
return out
def scatter_func_net(func, inputx, indices, updates):
lock = True
net = TestScatterFuncNet(func, lock, inputx, indices, updates)
return net()
def scatter_func_use_locking_false_net(func, inputx, indices, updates):
lock = False
net = TestScatterFuncNet(func, lock, inputx, indices, updates)
return net()
class TestScatterFuncDynamicNet(nn.Cell):
def __init__(self, func, inputx, indices, updates):
super(TestScatterFuncDynamicNet, self).__init__()
self.scatter_func = func_map[func]()
self.test_dynamic = inner.GpuConvertToDynamicShape()
self.inputx = Parameter(inputx, name="inputx")
self.indices = Parameter(indices, name="indices")
self.updates = Parameter(updates, name="updates")
def construct(self):
indices = self.test_dynamic(self.indices)
updates = self.test_dynamic(self.updates)
out = self.scatter_func(self.inputx, indices, updates)
return out
def scatter_func_d_net(func, inputx, indices, updates):
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
net = TestScatterFuncDynamicNet(func, inputx, indices, updates)
return net()
class TestScatterFuncDynamicNet2(nn.Cell):
def __init__(self, func, inputx):
super(TestScatterFuncDynamicNet2, self).__init__()
self.scatter_func = func_map[func]()
self.test_dynamic = inner.GpuConvertToDynamicShape()
self.inputx = Parameter(inputx, name="inputx")
def construct(self, indices, updates):
indices = self.test_dynamic(indices)
updates = self.test_dynamic(updates)
out = self.scatter_func(self.inputx, indices, updates)
return out
def scatter_func_d2_net(func, inputx, indices_1, updates_1, indices_2, updates_2):
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
net = TestScatterFuncDynamicNet2(func, inputx)
out1 = net(indices_1, updates_1)
out2 = net(indices_2, updates_2)
return (out1, out2)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_small_float32():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
updates = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float32))
# update
output = scatter_func_net("update", inputx, indices, updates)
expected = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_net("add", inputx, indices, updates)
expected = np.array([[6.0, 8.0, 10.0], [12.0, 14.0, 16.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_net("sub", inputx, indices, updates)
expected = np.array([[-6.0, -8.0, -10.0], [-12.0, -14.0, -16.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_input_updated():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
updates = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float32))
lock = True
# update
net = TestScatterFuncNet("update", lock, inputx, indices, updates)
net()
expected = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])
np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
# add
net = TestScatterFuncNet("add", lock, inputx, indices, updates)
net()
expected = np.array([[6.0, 8.0, 10.0], [12.0, 14.0, 16.0]])
np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
# sub
net = TestScatterFuncNet("sub", lock, inputx, indices, updates)
net()
expected = np.array([[-6.0, -8.0, -10.0], [-12.0, -14.0, -16.0]])
np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_large_shape_float32():
inputx = Tensor(np.ones((4, 2, 3, 4)).astype(np.float32))
indices = Tensor(np.array([[0, 2], [3, 1]]).astype(np.int32))
updates = Tensor(np.arange(96).reshape((2, 2, 2, 3, 4)).astype(np.float32))
# update
output = scatter_func_net("update", inputx, indices, updates)
expected = np.array(
[
[
[[0.0, 1.0, 2.0, 3.0], [4.0, 5.0, 6.0, 7.0], [8.0, 9.0, 10.0, 11.0]],
[[12.0, 13.0, 14.0, 15.0], [16.0, 17.0, 18.0, 19.0], [20.0, 21.0, 22.0, 23.0]],
],
[
[[72.0, 73.0, 74.0, 75.0], [76.0, 77.0, 78.0, 79.0], [80.0, 81.0, 82.0, 83.0]],
[[84.0, 85.0, 86.0, 87.0], [88.0, 89.0, 90.0, 91.0], [92.0, 93.0, 94.0, 95.0]],
],
[
[[24.0, 25.0, 26.0, 27.0], [28.0, 29.0, 30.0, 31.0], [32.0, 33.0, 34.0, 35.0]],
[[36.0, 37.0, 38.0, 39.0], [40.0, 41.0, 42.0, 43.0], [44.0, 45.0, 46.0, 47.0]],
],
[
[[48.0, 49.0, 50.0, 51.0], [52.0, 53.0, 54.0, 55.0], [56.0, 57.0, 58.0, 59.0]],
[[60.0, 61.0, 62.0, 63.0], [64.0, 65.0, 66.0, 67.0], [68.0, 69.0, 70.0, 71.0]],
],
]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_net("add", inputx, indices, updates)
expected = np.array(
[
[
[[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]],
[[13.0, 14.0, 15.0, 16.0], [17.0, 18.0, 19.0, 20.0], [21.0, 22.0, 23.0, 24.0]],
],
[
[[73.0, 74.0, 75.0, 76.0], [77.0, 78.0, 79.0, 80.0], [81.0, 82.0, 83.0, 84.0]],
[[85.0, 86.0, 87.0, 88.0], [89.0, 90.0, 91.0, 92.0], [93.0, 94.0, 95.0, 96.0]],
],
[
[[25.0, 26.0, 27.0, 28.0], [29.0, 30.0, 31.0, 32.0], [33.0, 34.0, 35.0, 36.0]],
[[37.0, 38.0, 39.0, 40.0], [41.0, 42.0, 43.0, 44.0], [45.0, 46.0, 47.0, 48.0]],
],
[
[[49.0, 50.0, 51.0, 52.0], [53.0, 54.0, 55.0, 56.0], [57.0, 58.0, 59.0, 60.0]],
[[61.0, 62.0, 63.0, 64.0], [65.0, 66.0, 67.0, 68.0], [69.0, 70.0, 71.0, 72.0]],
],
]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_net("sub", inputx, indices, updates)
expected = np.array(
[
[
[[1.0, 0.0, -1.0, -2.0], [-3.0, -4.0, -5.0, -6.0], [-7.0, -8.0, -9.0, -10.0]],
[
[-11.0, -12.0, -13.0, -14.0],
[-15.0, -16.0, -17.0, -18.0],
[-19.0, -20.0, -21.0, -22.0],
],
],
[
[
[-71.0, -72.0, -73.0, -74.0],
[-75.0, -76.0, -77.0, -78.0],
[-79.0, -80.0, -81.0, -82.0],
],
[
[-83.0, -84.0, -85.0, -86.0],
[-87.0, -88.0, -89.0, -90.0],
[-91.0, -92.0, -93.0, -94.0],
],
],
[
[
[-23.0, -24.0, -25.0, -26.0],
[-27.0, -28.0, -29.0, -30.0],
[-31.0, -32.0, -33.0, -34.0],
],
[
[-35.0, -36.0, -37.0, -38.0],
[-39.0, -40.0, -41.0, -42.0],
[-43.0, -44.0, -45.0, -46.0],
],
],
[
[
[-47.0, -48.0, -49.0, -50.0],
[-51.0, -52.0, -53.0, -54.0],
[-55.0, -56.0, -57.0, -58.0],
],
[
[-59.0, -60.0, -61.0, -62.0],
[-63.0, -64.0, -65.0, -66.0],
[-67.0, -68.0, -69.0, -70.0],
],
],
]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_small_float32_use_locking_false():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices = Tensor(np.array([1, 0]).astype(np.int32))
updates = Tensor(np.arange(6).reshape((2, 3)).astype(np.float32))
# update
output = scatter_func_use_locking_false_net("update", inputx, indices, updates)
expected = np.array([[3.0, 4.0, 5.0], [0.0, 1.0, 2.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_use_locking_false_net("add", inputx, indices, updates)
expected = np.array([[3.0, 4.0, 5.0], [0.0, 1.0, 2.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_use_locking_false_net("sub", inputx, indices, updates)
expected = np.array([[-3.0, -4.0, -5.0], [0.0, -1.0, -2.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_input_less_than_1_float32():
inputx = Tensor(
np.array(
[
[0.214141, 0.415151, 0.51516],
[0.876542, 0.451611, 0.55112],
[0.111244, 0.633333, 0.34444],
]
).astype(np.float32)
)
indices = Tensor(np.array([[[1, 0, 2], [2, 2, 0]], [[1, 0, 1], [2, 1, 2]]]).astype(np.int32))
updates = Tensor(np.arange(34, 70).reshape((2, 2, 3, 3)).astype(np.float32))
# update
output = scatter_func_net("update", inputx, indices, updates)
expected = np.array(
[[37.0, 38.0, 39.0], [34.0, 35.0, 66.0], [67.0, 68.0, 69.0],], dtype=np.float32,
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_net("add", inputx, indices, updates)
expected = np.array(
[
[141.21414, 144.41515, 147.51517],
[208.87654, 212.45161, 216.55112],
[257.11124, 262.63333, 267.34442],
],
dtype=np.float32,
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_net("sub", inputx, indices, updates)
expected = np.array(
[
[-140.78586, -143.58485, -146.48483],
[-207.12346, -211.54839, -215.44888],
[-256.88876, -261.36667, -266.65558],
],
dtype=np.float32,
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_float16():
inputx = Tensor(np.zeros((2, 3)).astype(np.float16))
indices = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
updates = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float16))
# update
output = scatter_func_net("update", inputx, indices, updates)
expected = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_net("add", inputx, indices, updates)
expected = np.array([[6.0, 8.0, 10.0], [12.0, 14.0, 16.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_net("sub", inputx, indices, updates)
expected = np.array([[-6.0, -8.0, -10.0], [-12.0, -14.0, -16.0]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_large_float16():
inputx = Tensor(np.zeros((2, 3, 4)).astype(np.float16))
indices = Tensor(np.array([[0, 0], [1, 1]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.float16))
# update
output = scatter_func_net("update", inputx, indices, updates)
expected = np.array(
[
[[63.0, 64.0, 65.0, 66.0], [67.0, 68.0, 69.0, 70.0], [71.0, 72.0, 73.0, 74.0],],
[[99.0, 100.0, 101.0, 102.0], [103.0, 104.0, 105.0, 106.0], [95.0, 96.0, 97.0, 98.0],],
]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_net("add", inputx, indices, updates)
expected = np.array(
[
[
[138.0, 140.0, 142.0, 144.0],
[146.0, 148.0, 150.0, 152.0],
[154.0, 156.0, 158.0, 160.0],
],
[
[186.0, 188.0, 190.0, 192.0],
[194.0, 196.0, 198.0, 200.0],
[202.0, 204.0, 206.0, 208.0],
],
]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_net("sub", inputx, indices, updates)
expected = np.array(
[
[
[-138.0, -140.0, -142.0, -144.0],
[-146.0, -148.0, -150.0, -152.0],
[-154.0, -156.0, -158.0, -160.0],
],
[
[-186.0, -188.0, -190.0, -192.0],
[-194.0, -196.0, -198.0, -200.0],
[-202.0, -204.0, -206.0, -208.0],
],
]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_disordered_float16():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.float16)))
indices = Tensor(np.array([[[0, 1, 2], [2, 1, 0]], [[0, 0, 0], [2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.float16))
# update
output = scatter_func_net("update", inputx, indices, updates)
expected = np.array(
[[95.0, 96.0, 97.0, 98.0], [67.0, 68.0, 69.0, 70.0], [99.0, 100.0, 101.0, 102.0]]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_net("add", inputx, indices, updates)
expected = np.array(
[[464.0, 468.0, 472.0, 476.0], [187.0, 188.0, 189.0, 190.0], [492.0, 496.0, 500.0, 504.0]]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_net("sub", inputx, indices, updates)
expected = np.array(
[
[-374.0, -380.0, -386.0, -392.0],
[-105.0, -108.0, -111.0, -114.0],
[-418.0, -424.0, -430.0, -436.0],
]
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_large_int32():
inputx = Tensor(np.zeros((2, 3, 4)).astype(np.int32))
indices = Tensor(np.array([[0, 0], [1, 1]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.int32))
# update
output = scatter_func_net("update", inputx, indices, updates)
expected = np.array(
[
[[63.0, 64.0, 65.0, 66.0], [67.0, 68.0, 69.0, 70.0], [71.0, 72.0, 73.0, 74.0],],
[[99.0, 100.0, 101.0, 102.0], [103.0, 104.0, 105.0, 106.0], [95.0, 96.0, 97.0, 98.0],],
]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_net("add", inputx, indices, updates)
expected = np.array(
[
[
[138.0, 140.0, 142.0, 144.0],
[146.0, 148.0, 150.0, 152.0],
[154.0, 156.0, 158.0, 160.0],
],
[
[186.0, 188.0, 190.0, 192.0],
[194.0, 196.0, 198.0, 200.0],
[202.0, 204.0, 206.0, 208.0],
],
]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_net("sub", inputx, indices, updates)
expected = np.array(
[
[
[-138.0, -140.0, -142.0, -144.0],
[-146.0, -148.0, -150.0, -152.0],
[-154.0, -156.0, -158.0, -160.0],
],
[
[-186.0, -188.0, -190.0, -192.0],
[-194.0, -196.0, -198.0, -200.0],
[-202.0, -204.0, -206.0, -208.0],
],
]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_disordered_int32():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int32)))
indices = Tensor(np.array([[[0, 1, 2], [2, 1, 0]], [[0, 0, 0], [2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.int32))
# update
output = scatter_func_net("update", inputx, indices, updates)
expected = np.array(
[[95.0, 96.0, 97.0, 98.0], [67.0, 68.0, 69.0, 70.0], [99.0, 100.0, 101.0, 102.0]]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_net("add", inputx, indices, updates)
expected = np.array(
[[464.0, 468.0, 472.0, 476.0], [187.0, 188.0, 189.0, 190.0], [492.0, 496.0, 500.0, 504.0]]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_net("sub", inputx, indices, updates)
expected = np.array(
[
[-374.0, -380.0, -386.0, -392.0],
[-105.0, -108.0, -111.0, -114.0],
[-418.0, -424.0, -430.0, -436.0],
]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_disordered_dynamic_int32():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int32)))
indices = Tensor(np.array([[[0, 1, 2], [2, 1, 0]], [[0, 0, 0], [2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.int32))
# update
output = scatter_func_d_net("update", inputx, indices, updates)
expected = np.array(
[[95.0, 96.0, 97.0, 98.0], [67.0, 68.0, 69.0, 70.0], [99.0, 100.0, 101.0, 102.0]]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_d_net("add", inputx, indices, updates)
expected = np.array(
[[464.0, 468.0, 472.0, 476.0], [187.0, 188.0, 189.0, 190.0], [492.0, 496.0, 500.0, 504.0]]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_d_net("sub", inputx, indices, updates)
expected = np.array(
[
[-374.0, -380.0, -386.0, -392.0],
[-105.0, -108.0, -111.0, -114.0],
[-418.0, -424.0, -430.0, -436.0],
]
).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_disordered_dynamic_int8():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int8)))
indices = Tensor(np.array([[[0, 1, 2], [2, 1, 0]], [[0, 0, 0], [2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.int8))
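    # The add/sub results overflow int8; the expected arrays below are the
    # int32 results wrapped into int8 by .astype(np.int8).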
# update
output = scatter_func_d_net("update", inputx, indices, updates)
expected = np.array(
[[95.0, 96.0, 97.0, 98.0], [67.0, 68.0, 69.0, 70.0], [99.0, 100.0, 101.0, 102.0]]
).astype(np.int8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_d_net("add", inputx, indices, updates)
expected = np.array(
[[464.0, 468.0, 472.0, 476.0], [187.0, 188.0, 189.0, 190.0], [492.0, 496.0, 500.0, 504.0]]
).astype(np.int8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_d_net("sub", inputx, indices, updates)
expected = np.array(
[
[-118.0, -124.0, 126.0, 120.0],
[-105.0, -108.0, -111.0, -114.0],
[94.0, 88.0, 82.0, 76.0],
]
).astype(np.int8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_disordered_dynamic_uint8():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.uint8)))
indices = Tensor(np.array([[[0, 1, 2], [2, 1, 0]], [[0, 0, 0], [2, 2, 2]]]).astype(np.int32))
updates = Tensor(np.arange(63, 111).reshape((2, 2, 3, 4)).astype(np.uint8))
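    # The add results exceed the uint8 range and the sub results are negative;
    # the expected arrays below rely on the modulo-256 wraparound applied by
    # .astype(np.uint8).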
# update
output = scatter_func_d_net("update", inputx, indices, updates)
expected = np.array(
[[95.0, 96.0, 97.0, 98.0], [67.0, 68.0, 69.0, 70.0], [99.0, 100.0, 101.0, 102.0]]
).astype(np.uint8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_d_net("add", inputx, indices, updates)
expected = np.array(
[[464.0, 468.0, 472.0, 476.0], [187.0, 188.0, 189.0, 190.0], [492.0, 496.0, 500.0, 504.0]]
).astype(np.uint8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_d_net("sub", inputx, indices, updates)
expected = np.array(
[[138.0, 132.0, 126.0, 120.0], [151.0, 148.0, 145.0, 142.0], [94.0, 88.0, 82.0, 76.0]]
).astype(np.uint8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_input_less_than_1_dynamic_float32():
inputx = Tensor(
np.array(
[
[0.214141, 0.415151, 0.51516],
[0.876542, 0.451611, 0.55112],
[0.111244, 0.633333, 0.34444],
]
).astype(np.float32)
)
indices = Tensor(np.array([[[1, 0, 2], [2, 2, 0]], [[1, 0, 1], [2, 1, 2]]]).astype(np.int32))
updates = Tensor(np.arange(34, 70).reshape((2, 2, 3, 3)).astype(np.float32))
# update
output = scatter_func_d_net("update", inputx, indices, updates)
expected = np.array(
[[37.0, 38.0, 39.0], [34.0, 35.0, 66.0], [67.0, 68.0, 69.0],], dtype=np.float32,
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# add
output = scatter_func_d_net("add", inputx, indices, updates)
expected = np.array(
[
[141.21414, 144.41515, 147.51517],
[208.87654, 212.45161, 216.55112],
[257.11124, 262.63333, 267.34442],
],
dtype=np.float32,
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)
# sub
output = scatter_func_d_net("sub", inputx, indices, updates)
expected = np.array(
[
[-140.78586, -143.58485, -146.48483],
[-207.12346, -211.54839, -215.44888],
[-256.88876, -261.36667, -266.65558],
],
dtype=np.float32,
)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_func_dynamic_two_inputs():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices_1 = Tensor(np.array([[0, 1], [0, 1]]).astype(np.int32))
updates_1 = Tensor(np.arange(12).reshape((2, 2, 3)).astype(np.float32))
indices_2 = Tensor(np.array([[0, 0], [1, 1], [1, 0]]).astype(np.int32))
updates_2 = Tensor(np.flip(np.arange(18).reshape((3, 2, 3)).astype(np.float32)))
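    # indices_2 contains duplicate row indices, so the "update" case is
    # order-dependent; the expected values below match the write order the
    # GPU kernel actually produces.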
# update
output_1, output_2 = scatter_func_d2_net(
"update", inputx, indices_1, updates_1, indices_2, updates_2
)
expected_1 = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])
expected_2 = np.array([[17.0, 16.0, 15.0], [11.0, 10.0, 9.0]])
np.testing.assert_array_almost_equal(output_1.asnumpy(), expected_1)
np.testing.assert_array_almost_equal(output_2.asnumpy(), expected_2)
# add
output_1, output_2 = scatter_func_d2_net(
"add", inputx, indices_1, updates_1, indices_2, updates_2
)
expected_1 = np.array([[6.0, 8.0, 10.0], [12.0, 14.0, 16.0]])
expected_2 = np.array([[39.0, 38.0, 37.0], [36.0, 35.0, 34.0]])
np.testing.assert_array_almost_equal(output_1.asnumpy(), expected_1)
np.testing.assert_array_almost_equal(output_2.asnumpy(), expected_2)
# sub
output_1, output_2 = scatter_func_d2_net(
"sub", inputx, indices_1, updates_1, indices_2, updates_2
)
expected_1 = np.array([[-6.0, -8.0, -10.0], [-12.0, -14.0, -16.0]])
expected_2 = np.array([[-39.0, -38.0, -37.0], [-36.0, -35.0, -34.0]])
np.testing.assert_array_almost_equal(output_1.asnumpy(), expected_1)
np.testing.assert_array_almost_equal(output_2.asnumpy(), expected_2)

View File

@ -1,359 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor, Parameter
from mindspore.ops import operations as P
from mindspore.ops.operations import _inner_ops as inner

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
# all cases tested against dchip (Ascend) results


class TestScatterUpdateNet(nn.Cell):
def __init__(self, inputx, indices, updates):
super(TestScatterUpdateNet, self).__init__()
self.scatter_update = P.ScatterUpdate()
self.inputx = Parameter(inputx, name="inputx")
self.indices = Parameter(indices, name="indices")
self.updates = Parameter(updates, name="updates")

    def construct(self):
out = self.scatter_update(self.inputx, self.indices, self.updates)
return out


def scatter_update_net(inputx, indices, updates):
net = TestScatterUpdateNet(inputx, indices, updates)
return net()


class TestScatterUpdateDynamicNet(nn.Cell):
def __init__(self, inputx, indices, updates):
super(TestScatterUpdateDynamicNet, self).__init__()
self.scatter_update = P.ScatterUpdate()
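        # GpuConvertToDynamicShape re-tags a tensor as dynamic-shape at
        # runtime, so construct() below exercises the dynamic-shape kernel.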
self.test_dynamic = inner.GpuConvertToDynamicShape()
self.inputx = Parameter(inputx, name="inputx")
self.indices = Parameter(indices, name="indices")
self.updates = Parameter(updates, name="updates")

    def construct(self):
indices = self.test_dynamic(self.indices)
updates = self.test_dynamic(self.updates)
out = self.scatter_update(self.inputx, indices, updates)
return out


def scatter_update_d_net(inputx, indices, updates):
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
net = TestScatterUpdateDynamicNet(inputx, indices, updates)
return net()


class TestScatterUpdateDynamicNet2(nn.Cell):
def __init__(self, inputx):
super(TestScatterUpdateDynamicNet2, self).__init__()
self.scatter_update = P.ScatterUpdate()
self.test_dynamic = inner.GpuConvertToDynamicShape()
self.inputx = Parameter(inputx, name="inputx")

    def construct(self, indices, updates):
indices = self.test_dynamic(indices)
updates = self.test_dynamic(updates)
out = self.scatter_update(self.inputx, indices, updates)
return out


def scatter_update_d2_net(inputx, indices_1, updates_1,
indices_2, updates_2):
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
net = TestScatterUpdateDynamicNet2(inputx)
out1 = net(indices_1, updates_1)
out2 = net(indices_2, updates_2)
return (out1, out2)
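
# The three wrappers above cover static-shape, dynamic-shape, and repeated
# dynamic-shape invocations of ScatterUpdate; the dtype-specific tests below
# reuse them.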


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_small_float32():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices = Tensor(np.array([0, 1]).astype(np.int32))
updates = Tensor(np.arange(6).reshape((2, 3)).astype(np.float32))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[0., 1., 2.],
[3., 4., 5.]])
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_input_updated():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices = Tensor(np.array([0, 1]).astype(np.int32))
updates = Tensor(np.arange(6).reshape((2, 3)).astype(np.float32))
net = TestScatterUpdateNet(inputx, indices, updates)
net()
expected = np.array([[0., 1., 2.],
[3., 4., 5.]])
np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_input_less_than_1_float32():
inputx = Tensor(np.array([[0.214141, 0.415151, 0.51516],
[0.876542, 0.451611, 0.55112],
[0.111244, 0.633333, 0.34444]]).astype(np.float32))
indices = Tensor(np.array([1, 0, 2]).astype(np.int32))
updates = Tensor(np.arange(34, 43).reshape((3, 3)).astype(np.float32))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[37., 38., 39.],
[34., 35., 36.],
[40., 41., 42.]], dtype=np.float32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_float16():
inputx = Tensor(np.zeros((2, 3)).astype(np.float16))
indices = Tensor(np.array([0, 1]).astype(np.int32))
updates = Tensor(np.arange(6).reshape((2, 3)).astype(np.float16))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[0., 1., 2.],
[3., 4., 5.]]).astype(np.float16)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_int32():
inputx = Tensor(np.zeros((2, 3)).astype(np.int32))
indices = Tensor(np.array([0, 1]).astype(np.int32))
updates = Tensor(np.arange(6).reshape((2, 3)).astype(np.int32))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[0., 1., 2.],
[3., 4., 5.]]).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_large_float16():
inputx = Tensor(np.zeros((4, 3)).astype(np.float16))
indices = Tensor(np.array([[2, 1], [0, 3]]).astype(np.int32))
updates = Tensor(np.arange(63, 75).reshape((2, 2, 3)).astype(np.float16))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[69., 70., 71.],
[66., 67., 68.],
[63., 64., 65.],
[72., 73., 74.]]).astype(np.float16)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_disordered_float16():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.float16)))
indices = Tensor(np.array([1, 2]).astype(np.int32))
updates = Tensor(np.arange(63, 71).reshape((2, 4)).astype(np.float16))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[45., 44., 43., 42.],
[63., 64., 65., 66.],
[67., 68., 69., 70.]]).astype(np.float16)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_disordered_int32():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int32)))
indices = Tensor(np.array([1, 2]).astype(np.int32))
updates = Tensor(np.arange(63, 71).reshape((2, 4)).astype(np.int32))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[45., 44., 43., 42.],
[63., 64., 65., 66.],
[67., 68., 69., 70.]]).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_large_shape_float16():
inputx = Tensor(np.arange(96).reshape((4, 2, 3, 4)).astype(np.float16))
indices = Tensor(np.array([1, 0]).astype(np.int32))
updates = Tensor(np.flip(np.arange(48).reshape((2, 2, 3, 4)).astype(np.float16)))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[[[23., 22., 21., 20.],
[19., 18., 17., 16.],
[15., 14., 13., 12.]],
[[11., 10., 9., 8.],
[7., 6., 5., 4.],
[3., 2., 1., 0.]]],
[[[47., 46., 45., 44.],
[43., 42., 41., 40.],
[39., 38., 37., 36.]],
[[35., 34., 33., 32.],
[31., 30., 29., 28.],
[27., 26., 25., 24.]]],
[[[48., 49., 50., 51.],
[52., 53., 54., 55.],
[56., 57., 58., 59.]],
[[60., 61., 62., 63.],
[64., 65., 66., 67.],
[68., 69., 70., 71.]]],
[[[72., 73., 74., 75.],
[76., 77., 78., 79.],
[80., 81., 82., 83.]],
[[84., 85., 86., 87.],
[88., 89., 90., 91.],
[92., 93., 94., 95.]]]]).astype(np.float16)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_disordered_int8():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int8)))
indices = Tensor(np.array([1, 2]).astype(np.int32))
updates = Tensor(np.arange(63, 71).reshape((2, 4)).astype(np.int8))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[45., 44., 43., 42.],
[63., 64., 65., 66.],
[67., 68., 69., 70.]]).astype(np.int8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_large_shape_int8():
inputx = Tensor(np.arange(96).reshape((4, 2, 3, 4)).astype(np.int8))
indices = Tensor(np.array([1, 0]).astype(np.int32))
updates = Tensor(np.flip(np.arange(48).reshape((2, 2, 3, 4)).astype(np.int8)))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[[[23., 22., 21., 20.],
[19., 18., 17., 16.],
[15., 14., 13., 12.]],
[[11., 10., 9., 8.],
[7., 6., 5., 4.],
[3., 2., 1., 0.]]],
[[[47., 46., 45., 44.],
[43., 42., 41., 40.],
[39., 38., 37., 36.]],
[[35., 34., 33., 32.],
[31., 30., 29., 28.],
[27., 26., 25., 24.]]],
[[[48., 49., 50., 51.],
[52., 53., 54., 55.],
[56., 57., 58., 59.]],
[[60., 61., 62., 63.],
[64., 65., 66., 67.],
[68., 69., 70., 71.]]],
[[[72., 73., 74., 75.],
[76., 77., 78., 79.],
[80., 81., 82., 83.]],
[[84., 85., 86., 87.],
[88., 89., 90., 91.],
[92., 93., 94., 95.]]]]).astype(np.int8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_large_uint8():
inputx = Tensor(np.zeros((4, 3)).astype(np.uint8))
indices = Tensor(np.array([[2, 1], [0, 3]]).astype(np.int32))
updates = Tensor(np.arange(63, 75).reshape((2, 2, 3)).astype(np.uint8))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[69., 70., 71.],
[66., 67., 68.],
[63., 64., 65.],
[72., 73., 74.]]).astype(np.uint8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_disordered_uint8():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.uint8)))
indices = Tensor(np.array([1, 2]).astype(np.int32))
updates = Tensor(np.arange(63, 71).reshape((2, 4)).astype(np.uint8))
output = scatter_update_net(inputx, indices, updates)
expected = np.array([[45., 44., 43., 42.],
[63., 64., 65., 66.],
[67., 68., 69., 70.]]).astype(np.uint8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_large_shape_dynamic_int8():
inputx = Tensor(np.arange(96).reshape((4, 2, 3, 4)).astype(np.int8))
indices = Tensor(np.array([1, 0]).astype(np.int32))
updates = Tensor(np.flip(np.arange(48).reshape((2, 2, 3, 4)).astype(np.int8)))
output = scatter_update_d_net(inputx, indices, updates)
expected = np.array([[[[23., 22., 21., 20.],
[19., 18., 17., 16.],
[15., 14., 13., 12.]],
[[11., 10., 9., 8.],
[7., 6., 5., 4.],
[3., 2., 1., 0.]]],
[[[47., 46., 45., 44.],
[43., 42., 41., 40.],
[39., 38., 37., 36.]],
[[35., 34., 33., 32.],
[31., 30., 29., 28.],
[27., 26., 25., 24.]]],
[[[48., 49., 50., 51.],
[52., 53., 54., 55.],
[56., 57., 58., 59.]],
[[60., 61., 62., 63.],
[64., 65., 66., 67.],
[68., 69., 70., 71.]]],
[[[72., 73., 74., 75.],
[76., 77., 78., 79.],
[80., 81., 82., 83.]],
[[84., 85., 86., 87.],
[88., 89., 90., 91.],
[92., 93., 94., 95.]]]]).astype(np.int8)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_disordered_dynamic_int32():
inputx = Tensor(np.flip(np.arange(34, 46).reshape(3, 4).astype(np.int32)))
indices = Tensor(np.array([1, 2]).astype(np.int32))
updates = Tensor(np.arange(63, 71).reshape((2, 4)).astype(np.int32))
output = scatter_update_d_net(inputx, indices, updates)
expected = np.array([[45., 44., 43., 42.],
[63., 64., 65., 66.],
[67., 68., 69., 70.]]).astype(np.int32)
np.testing.assert_array_almost_equal(output.asnumpy(), expected)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatter_update_two_inputs():
inputx = Tensor(np.zeros((2, 3)).astype(np.float32))
indices_1 = Tensor(np.array([0, 1]).astype(np.int32))
updates_1 = Tensor(np.arange(6).reshape((2, 3)).astype(np.float32))
indices_2 = Tensor(np.array([1]).astype(np.int32))
updates_2 = Tensor(np.arange(34, 37).reshape((1, 3)).astype(np.float32))
output_1, output_2 = scatter_update_d2_net(inputx, indices_1, updates_1,
indices_2, updates_2)
expected_1 = np.array([[0., 1., 2.],
[3., 4., 5.]], dtype=np.float32)
expected_2 = np.array([[0., 1., 2.],
[34., 35., 36.]], dtype=np.float32)
np.testing.assert_array_almost_equal(output_1.asnumpy(), expected_1)
np.testing.assert_array_almost_equal(output_2.asnumpy(), expected_2)