add scatter_nd_add and scatter_nd_sub op for gpu

2021-06-27 11:36:36 +08:00 · 2021-06-27 11:36:36 +08:00 · d8d906df3a
parent 00dee4e3bd
commit d8d906df3a
9 changed files with 823 additions and 401 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_functor_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_functor_gpu_kernel.cc
@ -0,0 +1,363 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/arrays/scatter_nd_functor_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+// ScatterNdUpdate
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddOutputAttr(kNumberTypeFloat64),
+                      ScatterNdFunctorKernel, double, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddOutputAttr(kNumberTypeFloat64),
+                      ScatterNdFunctorKernel, double, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      ScatterNdFunctorKernel, float, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      ScatterNdFunctorKernel, float, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      ScatterNdFunctorKernel, half, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      ScatterNdFunctorKernel, half, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeInt32),
+                      ScatterNdFunctorKernel, int, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeInt32),
+                      ScatterNdFunctorKernel, int, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddOutputAttr(kNumberTypeInt16),
+                      ScatterNdFunctorKernel, int16_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddOutputAttr(kNumberTypeInt16),
+                      ScatterNdFunctorKernel, int16_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddOutputAttr(kNumberTypeUInt8),
+                      ScatterNdFunctorKernel, uint8_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddOutputAttr(kNumberTypeUInt8),
+                      ScatterNdFunctorKernel, uint8_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddOutputAttr(kNumberTypeInt8),
+                      ScatterNdFunctorKernel, int8_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddOutputAttr(kNumberTypeInt8),
+                      ScatterNdFunctorKernel, int8_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddOutputAttr(kNumberTypeBool),
+                      ScatterNdFunctorKernel, bool, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddOutputAttr(kNumberTypeBool),
+                      ScatterNdFunctorKernel, bool, int64_t)
+
+// ScatterNdAdd
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddOutputAttr(kNumberTypeFloat64),
+                      ScatterNdFunctorKernel, double, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddOutputAttr(kNumberTypeFloat64),
+                      ScatterNdFunctorKernel, double, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      ScatterNdFunctorKernel, float, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      ScatterNdFunctorKernel, float, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      ScatterNdFunctorKernel, half, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      ScatterNdFunctorKernel, half, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeInt32),
+                      ScatterNdFunctorKernel, int, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeInt32),
+                      ScatterNdFunctorKernel, int, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddOutputAttr(kNumberTypeInt16),
+                      ScatterNdFunctorKernel, int16_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddOutputAttr(kNumberTypeInt16),
+                      ScatterNdFunctorKernel, int16_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddOutputAttr(kNumberTypeUInt8),
+                      ScatterNdFunctorKernel, uint8_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddOutputAttr(kNumberTypeUInt8),
+                      ScatterNdFunctorKernel, uint8_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddOutputAttr(kNumberTypeInt8),
+                      ScatterNdFunctorKernel, int8_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddOutputAttr(kNumberTypeInt8),
+                      ScatterNdFunctorKernel, int8_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddOutputAttr(kNumberTypeBool),
+                      ScatterNdFunctorKernel, bool, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdAdd,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddOutputAttr(kNumberTypeBool),
+                      ScatterNdFunctorKernel, bool, int64_t)
+
+// ScatterNdSub
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddOutputAttr(kNumberTypeFloat64),
+                      ScatterNdFunctorKernel, double, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat64)
+                        .AddOutputAttr(kNumberTypeFloat64),
+                      ScatterNdFunctorKernel, double, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      ScatterNdFunctorKernel, float, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
+                      ScatterNdFunctorKernel, float, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      ScatterNdFunctorKernel, half, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddOutputAttr(kNumberTypeFloat16),
+                      ScatterNdFunctorKernel, half, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeInt32),
+                      ScatterNdFunctorKernel, int, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeInt32),
+                      ScatterNdFunctorKernel, int, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddOutputAttr(kNumberTypeInt16),
+                      ScatterNdFunctorKernel, int16_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt16)
+                        .AddOutputAttr(kNumberTypeInt16),
+                      ScatterNdFunctorKernel, int16_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddOutputAttr(kNumberTypeUInt8),
+                      ScatterNdFunctorKernel, uint8_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeUInt8)
+                        .AddOutputAttr(kNumberTypeUInt8),
+                      ScatterNdFunctorKernel, uint8_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddOutputAttr(kNumberTypeInt8),
+                      ScatterNdFunctorKernel, int8_t, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeInt8)
+                        .AddOutputAttr(kNumberTypeInt8),
+                      ScatterNdFunctorKernel, int8_t, int64_t)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddOutputAttr(kNumberTypeBool),
+                      ScatterNdFunctorKernel, bool, int)
+MS_REG_GPU_KERNEL_TWO(ScatterNdSub,
+                      KernelAttr()
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddInputAttr(kNumberTypeInt64)
+                        .AddInputAttr(kNumberTypeBool)
+                        .AddOutputAttr(kNumberTypeBool),
+                      ScatterNdFunctorKernel, bool, int64_t)
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_functor_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_functor_gpu_kernel.h
@ -14,21 +14,30 @@
 * limitations under the License.
 */

-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ND_UPDATE_GPU_KERNEL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ND_UPDATE_GPU_KERNEL_H_
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ND_FUNCTOR_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ND_FUNCTOR_GPU_KERNEL_H_

 #include <vector>
+#include <string>
+#include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
-#include "backend/kernel_compiler/gpu/cuda_impl/scatter_nd_update_impl.cuh"
+#include "backend/kernel_compiler/gpu/cuda_impl/scatter_nd_functor_impl.cuh"

 namespace mindspore {
 namespace kernel {
+
+static const std::map<std::string, ScatterNdFunctorType> kScatterNdFunctorTypeMap = {
+  {"ScatterNdUpdate", SCATTER_ND_FUNC_UPDATE},
+  {"ScatterNdAdd", SCATTER_ND_FUNC_ADD},
+  {"ScatterNdSub", SCATTER_ND_FUNC_SUB},
+};
+
 template <typename T, typename S>
-class ScatterNdUpdateKernel : public GpuKernel {
+class ScatterNdFunctorKernel : public GpuKernel {
 public:
-  ScatterNdUpdateKernel() { ResetResource(); }
-  ~ScatterNdUpdateKernel() {
+  ScatterNdFunctorKernel() { ResetResource(); }
+  ~ScatterNdFunctorKernel() {
    if (indices_stride_ != nullptr) {
      device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(static_cast<void *>(indices_stride_));
    }
@ -54,9 +63,9 @@ class ScatterNdUpdateKernel : public GpuKernel {
    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                               cudaMemcpyAsync(indices_stride_, &out_strides_[0], indices_len, cudaMemcpyHostToDevice,
                                               reinterpret_cast<cudaStream_t>(stream_ptr)),
-                               "cudaMemcpyAsync failed in ScatterNdUpdateGpuFwdKernel::Launch.");
-    CalScatterNdUpdate(unit_size_, num_units_, index_depth_, indices_stride_, indices, updates, input,
-                       reinterpret_cast<cudaStream_t>(stream_ptr));
+                               "cudaMemcpyAsync failed in ScatterNdFunctorGpuFwdKernel::Launch.");
+    CalScatterNdFunctor(scatter_nd_functor_type_, unit_size_, num_units_, index_depth_, indices_stride_, indices,
+                        updates, input, reinterpret_cast<cudaStream_t>(stream_ptr));
    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                               cudaMemcpyAsync(&output[0], &input[0], input_size_ * sizeof(T), cudaMemcpyDeviceToDevice,
                                               reinterpret_cast<cudaStream_t>(stream_ptr)),
@ -65,15 +74,22 @@ class ScatterNdUpdateKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
+    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
+    auto iter = kScatterNdFunctorTypeMap.find(kernel_name);
+    if (iter == kScatterNdFunctorTypeMap.end()) {
+      MS_LOG(EXCEPTION) << "ScatterNd functor " << kernel_name << " is not supported.";
+    } else {
+      scatter_nd_functor_type_ = iter->second;
+    }
    kernel_node_ = kernel_node;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 3) {
-      MS_LOG(ERROR) << "Input number is " << input_num << ", but ScatterNdUpdate needs 3 inputs.";
+      MS_LOG(ERROR) << "Input number is " << input_num << ", but " << kernel_name << " needs 3 inputs.";
      return false;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
-      MS_LOG(ERROR) << "Output number is " << output_num << ", but ScatterNdUpdate has 1 output.";
+      MS_LOG(ERROR) << "Output number is " << output_num << ", but " << kernel_name << " has 1 output.";
      return false;
    }

@ -151,6 +167,7 @@ class ScatterNdUpdateKernel : public GpuKernel {
  }

 private:
+  ScatterNdFunctorType scatter_nd_functor_type_;
  size_t input_size_;
  size_t indices_size_;
  size_t updates_size_;
@ -167,4 +184,4 @@ class ScatterNdUpdateKernel : public GpuKernel {
 };
 }  // namespace kernel
 }  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ND_UPDATE_GPU_KERNEL_H_
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SCATTER_ND_FUNCTOR_GPU_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_update_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_update_gpu_kernel.cc
@ -1,134 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "backend/kernel_compiler/gpu/arrays/scatter_nd_update_gpu_kernel.h"
-
-namespace mindspore {
-namespace kernel {
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat64)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeFloat64)
-                        .AddOutputAttr(kNumberTypeFloat64),
-                      ScatterNdUpdateKernel, double, int)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat64)
-                        .AddInputAttr(kNumberTypeInt64)
-                        .AddInputAttr(kNumberTypeFloat64)
-                        .AddOutputAttr(kNumberTypeFloat64),
-                      ScatterNdUpdateKernel, double, int64_t)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddOutputAttr(kNumberTypeFloat32),
-                      ScatterNdUpdateKernel, float, int)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeInt64)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddOutputAttr(kNumberTypeFloat32),
-                      ScatterNdUpdateKernel, float, int64_t)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16),
-                      ScatterNdUpdateKernel, half, int)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeInt64)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16),
-                      ScatterNdUpdateKernel, half, int64_t)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddOutputAttr(kNumberTypeInt32),
-                      ScatterNdUpdateKernel, int, int)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt64)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddOutputAttr(kNumberTypeInt32),
-                      ScatterNdUpdateKernel, int, int64_t)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeInt16)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt16)
-                        .AddOutputAttr(kNumberTypeInt16),
-                      ScatterNdUpdateKernel, int16_t, int)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeInt16)
-                        .AddInputAttr(kNumberTypeInt64)
-                        .AddInputAttr(kNumberTypeInt16)
-                        .AddOutputAttr(kNumberTypeInt16),
-                      ScatterNdUpdateKernel, int16_t, int64_t)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeUInt8)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeUInt8)
-                        .AddOutputAttr(kNumberTypeUInt8),
-                      ScatterNdUpdateKernel, uint8_t, int)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeUInt8)
-                        .AddInputAttr(kNumberTypeInt64)
-                        .AddInputAttr(kNumberTypeUInt8)
-                        .AddOutputAttr(kNumberTypeUInt8),
-                      ScatterNdUpdateKernel, uint8_t, int64_t)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeInt8)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeInt8)
-                        .AddOutputAttr(kNumberTypeInt8),
-                      ScatterNdUpdateKernel, int8_t, int)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeInt8)
-                        .AddInputAttr(kNumberTypeInt64)
-                        .AddInputAttr(kNumberTypeInt8)
-                        .AddOutputAttr(kNumberTypeInt8),
-                      ScatterNdUpdateKernel, int8_t, int64_t)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeBool)
-                        .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeBool)
-                        .AddOutputAttr(kNumberTypeBool),
-                      ScatterNdUpdateKernel, bool, int)
-MS_REG_GPU_KERNEL_TWO(ScatterNdUpdate,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeBool)
-                        .AddInputAttr(kNumberTypeInt64)
-                        .AddInputAttr(kNumberTypeBool)
-                        .AddOutputAttr(kNumberTypeBool),
-                      ScatterNdUpdateKernel, bool, int64_t)
-}  // namespace kernel
-}  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd_functor_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd_functor_impl.cu
@ -0,0 +1,181 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
+#include "backend/kernel_compiler/gpu/cuda_impl/scatter_nd_functor_impl.cuh"
+
+template <typename T, typename S>
+__global__ void ScatterNdUpdate(const size_t unit_size, const size_t index_depth, const size_t updates_size,
+                                const S *out_strides, const S *indices, const T *updates, T *input) {
+  int i, j;
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+
+    i = read_index / unit_size;
+    j = read_index % unit_size;
+
+    for (size_t k = 0; k < index_depth; k++) {
+      S indices_i = indices[i * index_depth + k];
+      out_bound |= indices_i < 0;
+      write_index += indices_i * out_strides[k] * unit_size;
+    }
+
+    write_index += j;
+
+    if (!out_bound) {
+      input[write_index] = updates[read_index];
+    }
+  }
+}
+
+template <typename T, typename S>
+__global__ void ScatterNdAdd(const size_t unit_size, const size_t index_depth, const size_t updates_size,
+                             const S *out_strides, const S *indices, const T *updates, T *input) {
+  int i, j;
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+
+    i = read_index / unit_size;
+    j = read_index % unit_size;
+
+    for (size_t k = 0; k < index_depth; k++) {
+      S indices_i = indices[i * index_depth + k];
+      out_bound |= indices_i < 0;
+      write_index += indices_i * out_strides[k] * unit_size;
+    }
+
+    write_index += j;
+
+    if (!out_bound) {
+      MsAtomicAdd(&input[write_index], updates[read_index]);
+    }
+  }
+}
+
+template <typename T, typename S>
+__global__ void ScatterNdSub(const size_t unit_size, const size_t index_depth, const size_t updates_size,
+                             const S *out_strides, const S *indices, const T *updates, T *input) {
+  int i, j;
+  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
+       read_index += blockDim.x * gridDim.x) {
+    size_t write_index = 0;
+    bool out_bound = false;
+
+    i = read_index / unit_size;
+    j = read_index % unit_size;
+
+    for (size_t k = 0; k < index_depth; k++) {
+      S indices_i = indices[i * index_depth + k];
+      out_bound |= indices_i < 0;
+      write_index += indices_i * out_strides[k] * unit_size;
+    }
+
+    write_index += j;
+
+    if (!out_bound) {
+      MsAtomicAdd(&input[write_index], -updates[read_index]);
+    }
+  }
+}
+
+template <typename T, typename S>
+void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, const size_t &num_units,
+                         const size_t &index_depth, const S *out_strides, const S *indices, const T *updates, T *input,
+                         cudaStream_t cuda_stream) {
+  const size_t updates_size = unit_size * num_units;
+  switch (func_type) {
+    case SCATTER_ND_FUNC_UPDATE:
+      return ScatterNdUpdate<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
+        unit_size, index_depth, updates_size, out_strides, indices, updates, input);
+    case SCATTER_ND_FUNC_ADD:
+      return ScatterNdAdd<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
+        unit_size, index_depth, updates_size, out_strides, indices, updates, input);
+    case SCATTER_ND_FUNC_SUB:
+      return ScatterNdSub<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(
+        unit_size, index_depth, updates_size, out_strides, indices, updates, input);
+    default:
+      break;
+  }
+}
+
+template void CalScatterNdFunctor<double, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                   const size_t &num_units, const size_t &index_depth,
+                                                   const int64_t *out_strides, const int64_t *indices,
+                                                   const double *updates, double *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<double, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                   const size_t &num_units, const size_t &index_depth,
+                                                   const int32_t *out_strides, const int32_t *indices,
+                                                   const double *updates, double *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<float, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                  const size_t &num_units, const size_t &index_depth,
+                                                  const int64_t *out_strides, const int64_t *indices,
+                                                  const float *updates, float *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<float, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                  const size_t &num_units, const size_t &index_depth,
+                                                  const int32_t *out_strides, const int32_t *indices,
+                                                  const float *updates, float *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<half, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                 const size_t &num_units, const size_t &index_depth,
+                                                 const int64_t *out_strides, const int64_t *indices,
+                                                 const half *updates, half *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<half, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                 const size_t &num_units, const size_t &index_depth,
+                                                 const int32_t *out_strides, const int32_t *indices,
+                                                 const half *updates, half *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<int32_t, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                    const size_t &num_units, const size_t &index_depth,
+                                                    const int64_t *out_strides, const int64_t *indices,
+                                                    const int32_t *updates, int32_t *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<int32_t, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                    const size_t &num_units, const size_t &index_depth,
+                                                    const int32_t *out_strides, const int32_t *indices,
+                                                    const int32_t *updates, int32_t *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<int16_t, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                    const size_t &num_units, const size_t &index_depth,
+                                                    const int64_t *out_strides, const int64_t *indices,
+                                                    const int16_t *updates, int16_t *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<int16_t, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                    const size_t &num_units, const size_t &index_depth,
+                                                    const int32_t *out_strides, const int32_t *indices,
+                                                    const int16_t *updates, int16_t *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<uint8_t, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                    const size_t &num_units, const size_t &index_depth,
+                                                    const int64_t *out_strides, const int64_t *indices,
+                                                    const uint8_t *updates, uint8_t *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<uint8_t, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                    const size_t &num_units, const size_t &index_depth,
+                                                    const int32_t *out_strides, const int32_t *indices,
+                                                    const uint8_t *updates, uint8_t *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<int8_t, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                   const size_t &num_units, const size_t &index_depth,
+                                                   const int64_t *out_strides, const int64_t *indices,
+                                                   const int8_t *updates, int8_t *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<int8_t, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                   const size_t &num_units, const size_t &index_depth,
+                                                   const int32_t *out_strides, const int32_t *indices,
+                                                   const int8_t *updates, int8_t *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<bool, int64_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                 const size_t &num_units, const size_t &index_depth,
+                                                 const int64_t *out_strides, const int64_t *indices,
+                                                 const bool *updates, bool *input, cudaStream_t cuda_stream);
+template void CalScatterNdFunctor<bool, int32_t>(enum ScatterNdFunctorType func_type, const size_t &unit_size,
+                                                 const size_t &num_units, const size_t &index_depth,
+                                                 const int32_t *out_strides, const int32_t *indices,
+                                                 const bool *updates, bool *input, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd_functor_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd_functor_impl.cuh
@ -14,13 +14,21 @@
 * limitations under the License.
 */

-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_UPDATE_IMPL_CUH_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_UPDATE_IMPL_CUH_
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_

 #include "runtime/device/gpu/cuda_common.h"

-template <typename T, typename S>
-void CalScatterNdUpdate(const size_t &unit_size, const size_t &num_units, const size_t &index_depth,
-                        const S *out_strides, const S *indices, const T *updates, T *input, cudaStream_t cuda_stream);
+enum ScatterNdFunctorType {
+  SCATTER_ND_FUNC_UPDATE = 0,
+  SCATTER_ND_FUNC_ADD,
+  SCATTER_ND_FUNC_SUB,
+  SCATTER_ND_FUNC_INVALID_TYPE = 255
+};

-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_UPDATE_IMPL_CUH_
+template <typename T, typename S>
+void CalScatterNdFunctor(enum ScatterNdFunctorType func_type, const size_t &unit_size, const size_t &num_units,
+                         const size_t &index_depth, const S *out_strides, const S *indices, const T *updates, T *input,
+                         cudaStream_t cuda_stream);
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_SCATTER_ND_FUNCTOR_IMPL_CUH_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd_update_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd_update_impl.cu
@ -1,116 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "backend/kernel_compiler/gpu/cuda_impl/scatter_nd_update_impl.cuh"
-
-template <typename T, typename S>
-__global__ void ScatterNdUpdate(const size_t unit_size, const size_t index_depth, const size_t updates_size,
-                                const S *out_strides, const S *indices, const T *updates, T *input) {
-  int i, j;
-  for (size_t read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < (updates_size);
-       read_index += blockDim.x * gridDim.x) {
-    size_t write_index = 0;
-    bool out_bound = false;
-
-    i = read_index / unit_size;
-    j = read_index % unit_size;
-
-    for (size_t k = 0; k < index_depth; k++) {
-      S indices_i = indices[i * index_depth + k];
-      out_bound |= indices_i < 0;
-      write_index += indices_i * out_strides[k] * unit_size;
-    }
-
-    write_index += j;
-
-    if (!out_bound) {
-      input[write_index] = updates[read_index];
-    }
-  }
-}
-
-template <typename T, typename S>
-void CalScatterNdUpdate(const size_t &unit_size, const size_t &num_units, const size_t &index_depth,
-                        const S *out_strides, const S *indices, const T *updates, T *input, cudaStream_t cuda_stream) {
-  const size_t updates_size = unit_size * num_units;
-  ScatterNdUpdate<<<GET_BLOCKS(updates_size), GET_THREADS, 0, cuda_stream>>>(unit_size, index_depth, updates_size,
-                                                                             out_strides, indices, updates, input);
-}
-
-template void CalScatterNdUpdate<double, int64_t>(const size_t &unit_size, const size_t &num_units,
-                                                  const size_t &index_depth, const int64_t *out_strides,
-                                                  const int64_t *indices, const double *updates, double *input,
-                                                  cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<double, int32_t>(const size_t &unit_size, const size_t &num_units,
-                                                  const size_t &index_depth, const int32_t *out_strides,
-                                                  const int32_t *indices, const double *updates, double *input,
-                                                  cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<float, int64_t>(const size_t &unit_size, const size_t &num_units,
-                                                 const size_t &index_depth, const int64_t *out_strides,
-                                                 const int64_t *indices, const float *updates, float *input,
-                                                 cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<float, int32_t>(const size_t &unit_size, const size_t &num_units,
-                                                 const size_t &index_depth, const int32_t *out_strides,
-                                                 const int32_t *indices, const float *updates, float *input,
-                                                 cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<half, int64_t>(const size_t &unit_size, const size_t &num_units,
-                                                const size_t &index_depth, const int64_t *out_strides,
-                                                const int64_t *indices, const half *updates, half *input,
-                                                cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<half, int32_t>(const size_t &unit_size, const size_t &num_units,
-                                                const size_t &index_depth, const int32_t *out_strides,
-                                                const int32_t *indices, const half *updates, half *input,
-                                                cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<int32_t, int64_t>(const size_t &unit_size, const size_t &num_units,
-                                                   const size_t &index_depth, const int64_t *out_strides,
-                                                   const int64_t *indices, const int32_t *updates, int32_t *input,
-                                                   cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<int32_t, int32_t>(const size_t &unit_size, const size_t &num_units,
-                                                   const size_t &index_depth, const int32_t *out_strides,
-                                                   const int32_t *indices, const int32_t *updates, int32_t *input,
-                                                   cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<int16_t, int64_t>(const size_t &unit_size, const size_t &num_units,
-                                                   const size_t &index_depth, const int64_t *out_strides,
-                                                   const int64_t *indices, const int16_t *updates, int16_t *input,
-                                                   cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<int16_t, int32_t>(const size_t &unit_size, const size_t &num_units,
-                                                   const size_t &index_depth, const int32_t *out_strides,
-                                                   const int32_t *indices, const int16_t *updates, int16_t *input,
-                                                   cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<uint8_t, int64_t>(const size_t &unit_size, const size_t &num_units,
-                                                   const size_t &index_depth, const int64_t *out_strides,
-                                                   const int64_t *indices, const uint8_t *updates, uint8_t *input,
-                                                   cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<uint8_t, int32_t>(const size_t &unit_size, const size_t &num_units,
-                                                   const size_t &index_depth, const int32_t *out_strides,
-                                                   const int32_t *indices, const uint8_t *updates, uint8_t *input,
-                                                   cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<int8_t, int64_t>(const size_t &unit_size, const size_t &num_units,
-                                                  const size_t &index_depth, const int64_t *out_strides,
-                                                  const int64_t *indices, const int8_t *updates, int8_t *input,
-                                                  cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<int8_t, int32_t>(const size_t &unit_size, const size_t &num_units,
-                                                  const size_t &index_depth, const int32_t *out_strides,
-                                                  const int32_t *indices, const int8_t *updates, int8_t *input,
-                                                  cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<bool, int64_t>(const size_t &unit_size, const size_t &num_units,
-                                                const size_t &index_depth, const int64_t *out_strides,
-                                                const int64_t *indices, const bool *updates, bool *input,
-                                                cudaStream_t cuda_stream);
-template void CalScatterNdUpdate<bool, int32_t>(const size_t &unit_size, const size_t &num_units,
-                                                const size_t &index_depth, const int32_t *out_strides,
-                                                const int32_t *indices, const bool *updates, bool *input,
-                                                cudaStream_t cuda_stream);
--- a/mindspore/ops/operations/array_ops.py
+++ b/mindspore/ops/operations/array_ops.py
@ -4479,7 +4479,7 @@ class ScatterNdAdd(_ScatterNdOp):
        ValueError: If the shape of `updates` is not equal to `indices_shape[:-1] + x_shape[indices_shape[-1]:]`.

    Supported Platforms:
-        ``Ascend``
+        ``Ascend`` ``GPU``

    Examples:
        >>> input_x = Parameter(Tensor(np.array([1, 2, 3, 4, 5, 6, 7, 8]), mindspore.float32), name="x")
@ -4556,7 +4556,7 @@ class ScatterNdSub(_ScatterNdOp):
        ValueError: If the shape of `updates` is not equal to `indices_shape[:-1] + x_shape[indices_shape[-1]:]`.

    Supported Platforms:
-        ``Ascend``
+        ``Ascend`` ``GPU``

    Examples:
        >>> input_x = Parameter(Tensor(np.array([1, 2, 3, 4, 5, 6, 7, 8]), mindspore.float32), name="x")
--- a/tests/st/ops/gpu/test_scatter_nd_func_op.py
+++ b/tests/st/ops/gpu/test_scatter_nd_func_op.py
@ -0,0 +1,234 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor, Parameter
+import mindspore.common.dtype as mstype
+import mindspore.ops as ops
+
+context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+
+func_map = {
+    "update": ops.ScatterNdUpdate,
+    "add": ops.ScatterNdAdd,
+    "sub": ops.ScatterNdSub,
+}
+
+
+class TestScatterNdFuncNet(nn.Cell):
+    def __init__(self, func, lock, inputx, indices, updates):
+        super(TestScatterNdFuncNet, self).__init__()
+
+        self.scatter_func = func_map[func](use_locking=lock)
+        self.inputx = Parameter(inputx, name="inputx")
+        self.indices = Parameter(indices, name="indices")
+        self.updates = Parameter(updates, name="updates")
+
+    def construct(self):
+        out = self.scatter_func(self.inputx, self.indices, self.updates)
+        return out
+
+
+def scatter_nd_func_net(func, inputx, indices, updates):
+    lock = True
+    net = TestScatterNdFuncNet(func, lock, inputx, indices, updates)
+    return net()
+
+
+def scatter_nd_func_use_locking_false_net(func, inputx, indices, updates):
+    lock = False
+    net = TestScatterNdFuncNet(func, lock, inputx, indices, updates)
+    return net()
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_traning
+@pytest.mark.env_onecard
+def test_scatter_nd_func_small_float32():
+    inputx = Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mstype.float32)
+    indices = Tensor(np.array([[0, 0], [1, 1]]), mstype.int32)
+    updates = Tensor(np.array([1.0, 2.2]), mstype.float32)
+
+    # update
+    output = scatter_nd_func_net("update", inputx, indices, updates)
+    expected = np.array([[1.0, 0.3, 3.6], [0.4, 2.2, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # add
+    output = scatter_nd_func_net("add", inputx, indices, updates)
+    expected = np.array([[0.9, 0.3, 3.6], [0.4, 2.7, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # sub
+    output = scatter_nd_func_net("sub", inputx, indices, updates)
+    expected = np.array([[-1.1, 0.3, 3.6], [0.4, -1.7, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_scatter_nd_func_input_updated():
+    inputx = Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mstype.float32)
+    indices = Tensor(np.array([[0, 0], [1, 1]]), mstype.int32)
+    updates = Tensor(np.array([1.0, 2.2]), mstype.float32)
+    lock = True
+
+    # update
+    net = TestScatterNdFuncNet("update", lock, inputx, indices, updates)
+    net()
+    expected = np.array([[1.0, 0.3, 3.6], [0.4, 2.2, -3.2]])
+    np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
+
+    # add
+    net = TestScatterNdFuncNet("add", lock, inputx, indices, updates)
+    net()
+    expected = np.array([[0.9, 0.3, 3.6], [0.4, 2.7, -3.2]])
+    np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
+
+    # sub
+    net = TestScatterNdFuncNet("sub", lock, inputx, indices, updates)
+    net()
+    expected = np.array([[-1.1, 0.3, 3.6], [0.4, -1.7, -3.2]])
+    np.testing.assert_array_almost_equal(net.inputx.asnumpy(), expected)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_traning
+@pytest.mark.env_onecard
+def test_scatter_nd_func_small_float32_using_locking_false():
+    inputx = Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mstype.float32)
+    indices = Tensor(np.array([[0, 0], [1, 1]]), mstype.int32)
+    updates = Tensor(np.array([1.0, 2.2]), mstype.float32)
+
+    # update
+    output = scatter_nd_func_use_locking_false_net("update", inputx, indices, updates)
+    expected = np.array([[1.0, 0.3, 3.6], [0.4, 2.2, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # add
+    output = scatter_nd_func_use_locking_false_net("add", inputx, indices, updates)
+    expected = np.array([[0.9, 0.3, 3.6], [0.4, 2.7, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # sub
+    output = scatter_nd_func_use_locking_false_net("sub", inputx, indices, updates)
+    expected = np.array([[-1.1, 0.3, 3.6], [0.4, -1.7, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_traning
+@pytest.mark.env_onecard
+def test_scatter_nd_func_small_int32():
+    inputx = Tensor(np.array([1, 2, 3, 4, 5, 6, 7, 8]), mstype.float32)
+    indices = Tensor(np.array([[4], [3], [1], [7]]), mstype.int32)
+    updates = Tensor(np.array([9, 10, 11, 12]), mstype.float32)
+
+    # update
+    output = scatter_nd_func_net("update", inputx, indices, updates)
+    expected = np.array([1, 11, 3, 10, 9, 6, 7, 12])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # add
+    output = scatter_nd_func_net("add", inputx, indices, updates)
+    expected = np.array([1, 13, 3, 14, 14, 6, 7, 20])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # sub
+    output = scatter_nd_func_net("sub", inputx, indices, updates)
+    expected = np.array([1, -9, 3, -6, -4, 6, 7, -4])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_traning
+@pytest.mark.env_onecard
+def test_scatter_nd_func_multi_dims():
+    inputx = Tensor(np.zeros((4, 4, 4)), mstype.float32)
+    indices = Tensor(np.array([[0], [2]]), mstype.int32)
+    updates = Tensor(
+        np.array(
+            [
+                [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+                [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+            ]
+        ),
+        mstype.float32,
+    )
+
+    # update
+    output = scatter_nd_func_net("update", inputx, indices, updates)
+    expected = np.array(
+        [
+            [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+            [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+            [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+            [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+        ]
+    )
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # add
+    output = scatter_nd_func_net("add", inputx, indices, updates)
+    expected = np.array(
+        [
+            [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+            [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+            [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+            [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+        ]
+    )
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # sub
+    output = scatter_nd_func_net("sub", inputx, indices, updates)
+    expected = np.array(
+        [
+            [[-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7], [-8, -8, -8, -8]],
+            [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+            [[-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7], [-8, -8, -8, -8]],
+            [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+        ]
+    )
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_traning
+@pytest.mark.env_onecard
+def test_scatter_nd_func_one_value():
+    inputx = Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mstype.float32)
+    indices = Tensor(np.array([[0, 1]]), mstype.int32)
+    updates = Tensor(np.array([1.0]), mstype.float32)
+
+    # update
+    output = scatter_nd_func_net("update", inputx, indices, updates)
+    expected = np.array([[-0.1, 1.0, 3.6], [0.4, 0.5, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # add
+    output = scatter_nd_func_net("add", inputx, indices, updates)
+    expected = np.array([[-0.1, 1.3, 3.6], [0.4, 0.5, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
+
+    # sub
+    output = scatter_nd_func_net("sub", inputx, indices, updates)
+    expected = np.array([[-0.1, -0.7, 3.6], [0.4, 0.5, -3.2]])
+    np.testing.assert_array_almost_equal(output.asnumpy(), expected)
--- a/tests/st/ops/gpu/test_scatter_nd_update_op.py
+++ b/tests/st/ops/gpu/test_scatter_nd_update_op.py
@ -1,131 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-import numpy as np
-import pytest
-
-import mindspore.context as context
-import mindspore.nn as nn
-from mindspore import Tensor, Parameter
-import mindspore.common.dtype as mstype
-import mindspore.ops as ops
-
-context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
-
-
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_traning
-@pytest.mark.env_onecard
-def test_op1():
-    class ScatterNdUpdate(nn.Cell):
-        def __init__(self):
-            super(ScatterNdUpdate, self).__init__()
-            self.scatter_nd_update = ops.ScatterNdUpdate()
-            self.x = Parameter(
-                Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mstype.float32), name="x"
-            )
-
-        def construct(self, indices, update):
-            return self.scatter_nd_update(self.x, indices, update)
-
-    indices = Tensor(np.array([[0, 0], [1, 1]]), mstype.int32)
-    update = Tensor(np.array([1.0, 2.2]), mstype.float32)
-
-    scatter_nd_update = ScatterNdUpdate()
-    scatter_nd_update(indices, update)
-    expect = [[1.0, 0.3, 3.6], [0.4, 2.2, -3.2]]
-    assert np.allclose(scatter_nd_update.x.data.asnumpy(), np.array(expect, np.float))
-
-
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_traning
-@pytest.mark.env_onecard
-def test_op2():
-    class ScatterNdUpdate(nn.Cell):
-        def __init__(self):
-            super(ScatterNdUpdate, self).__init__()
-            self.scatter_nd_update = ops.ScatterNdUpdate()
-            self.x = Parameter(Tensor(np.array([1, 2, 3, 4, 5, 6, 7, 8]), mstype.float32), name="x")
-
-        def construct(self, indices, update):
-            return self.scatter_nd_update(self.x, indices, update)
-
-    indices = Tensor(np.array([[4], [3], [1], [7]]), mstype.int32)
-    update = Tensor(np.array([9, 10, 11, 12]), mstype.float32)
-
-    scatter_nd_update = ScatterNdUpdate()
-    scatter_nd_update(indices, update)
-    expect = [1, 11, 3, 10, 9, 6, 7, 12]
-    assert np.allclose(scatter_nd_update.x.data.asnumpy(), np.array(expect, dtype=float))
-
-
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_traning
-@pytest.mark.env_onecard
-def test_op3():
-    class ScatterNdUpdate(nn.Cell):
-        def __init__(self):
-            super(ScatterNdUpdate, self).__init__()
-            self.scatter_nd_update = ops.ScatterNdUpdate()
-            self.x = Parameter(Tensor(np.zeros((4, 4, 4)), mstype.float32), name="x")
-
-        def construct(self, indices, update):
-            return self.scatter_nd_update(self.x, indices, update)
-
-    indices = Tensor(np.array([[0], [2]]), mstype.int32)
-    update = Tensor(
-        np.array(
-            [
-                [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-                [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-            ]
-        ),
-        mstype.float32,
-    )
-
-    scatter_nd_update = ScatterNdUpdate()
-    scatter_nd_update(indices, update)
-    expect = [
-        [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-        [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-        [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-        [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-    ]
-    assert np.allclose(scatter_nd_update.x.data.asnumpy(), np.array(expect, dtype=float))
-
-
-@pytest.mark.level0
-@pytest.mark.platform_x86_gpu_traning
-@pytest.mark.env_onecard
-def test_op4():
-    class ScatterNdUpdate(nn.Cell):
-        def __init__(self):
-            super(ScatterNdUpdate, self).__init__()
-            self.scatter_nd_update = ops.ScatterNdUpdate()
-            self.x = Parameter(
-                Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]), mstype.float32), name="x"
-            )
-
-        def construct(self, indices, update):
-            return self.scatter_nd_update(self.x, indices, update)
-
-    indices = Tensor(np.array([[0, 1]]), mstype.int32)
-    update = Tensor(np.array([1.0]), mstype.float32)
-
-    scatter_nd_update = ScatterNdUpdate()
-    out = scatter_nd_update(indices, update)
-    assert np.allclose(out.asnumpy(), scatter_nd_update.x.data.asnumpy())
-    expect = [[-0.1, 1.0, 3.6], [0.4, 0.5, -3.2]]
-    assert np.allclose(out.asnumpy(), np.array(expect, np.float))