!31271 Reconstruct the GPU invoking mode

Merge pull request !31271 from zong_shuai/reconstruct_gpu_kernel
2022-03-21 15:05:28 +00:00 · 2022-03-21 15:05:28 +00:00 · 27e417de6a
parent 75d85e6183 d42daf0dcb
commit 27e417de6a
10 changed files with 667 additions and 260 deletions
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/batchtospace_gpu_kernel.h
@ -19,15 +19,13 @@

 #include <vector>
 #include <string>
+#include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/batchtospace_helper.h"

 namespace mindspore {
 namespace kernel {
-constexpr size_t SHAPE_SIZE = 4;
-constexpr size_t CROPS_SHAPE_0 = 2;
-constexpr size_t CROPS_SHAPE_1 = 2;
 template <typename T>
 class BatchToSpaceGpuKernelMod : public NativeGpuKernelMod {
 public:
@ -36,139 +34,55 @@ class BatchToSpaceGpuKernelMod : public NativeGpuKernelMod {

+  // Runs the kernel by delegating to the CUDA helper object created in Init().
+  // The three address lists are converted with ConvertPtrs() and handed to
+  // helper_ptr_->Process() in the order (inputs, outputs, workspace).
+  // Returns false when Process() reports a non-zero status, true otherwise.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    T *input = GetDeviceAddress<T>(inputs, 0);
-    T *output = GetDeviceAddress<T>(outputs, 0);
-
-    size_t size = output_size_ / sizeof(T);
-
-    CalBatchToSpace<T>(size, input, in_, ih_, iw_, ic_, on_, oh_, ow_, oc_, crops_[0][0], crops_[0][1], crops_[1][0],
-                       crops_[1][1], block_size_, output, reinterpret_cast<cudaStream_t>(stream_ptr));
+    std::vector<void *> input_addrs = ConvertPtrs(inputs);
+    std::vector<void *> work_addrs = ConvertPtrs(workspace);
+    std::vector<void *> output_addrs = ConvertPtrs(outputs);
+    // NOTE(review): helper_ptr_ is only assigned in Init(); this assumes Init() ran first - confirm.
+    int flag = helper_ptr_->Process(input_addrs, output_addrs, work_addrs, stream_ptr);
+    if (flag != 0) {
+      return false;
+    }
    return true;
  }

+  // Builds the CUDA helper for this node, validates the operator attributes and
+  // computes the input/output buffer sizes.
+  // Returns false when CheckKernelParam() or CalMemSize() reports a non-zero status.
  bool Init(const CNodePtr &kernel_node) override {
-    kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
-    (void)CheckParam(kernel_node);
-    input_size_ = sizeof(T);
-    for (size_t idx = 0; idx < input_shape_.size(); ++idx) {
-      input_size_ *= input_shape_[idx];
+    kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
+
+    // A fresh helper is created on every Init() call and reset before use.
+    helper_ptr_ = std::make_unique<cukernel::BatchToSpaceHelperGpuKernel<T>>(kernel_name_);
+    helper_ptr_->ResetResource();
+
+    std::vector<std::vector<size_t>> input_shapes;
+    std::vector<std::vector<size_t>> output_shapes;
+    auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
+    auto output_shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, 0);
+    input_shapes.emplace_back(input_shape);
+    output_shapes.emplace_back(output_shape);
+    // NOTE(review): block_size is read as int64_t but stored in a size_t field, so a
+    // negative attribute would wrap to a large value and pass the '< 1' check - confirm.
+    attr_.block_size = GetAttr<int64_t>(kernel_node, "block_size");
+    attr_.crops = GetAttr<std::vector<std::vector<int64_t>>>(kernel_node, "crops");
+    attr_.input_shape = input_shape;
+    // The helper keeps the address of attr_, which is a member of this object.
+    int flag = helper_ptr_->CheckKernelParam(&attr_);
+    if (flag != 0) {
+      return false;
    }

-    in_ = input_shape_[0];
-    ic_ = input_shape_[1];
-    ih_ = input_shape_[2];
-    iw_ = input_shape_[3];
-
-    on_ = in_ / (block_size_ * block_size_);
-    oc_ = ic_;
-    oh_ = ih_ * block_size_ - crops_[0][0] - crops_[0][1];
-    ow_ = iw_ * block_size_ - crops_[1][0] - crops_[1][1];
-    output_size_ = on_ * oc_ * oh_ * ow_ * sizeof(T);
+    // Byte sizes are computed by the helper and copied out by InitSizeLists().
+    flag = helper_ptr_->CalMemSize(input_shapes, output_shapes);
+    if (flag != 0) {
+      return false;
+    }
    InitSizeLists();
    return true;
  }
-  void ResetResource() noexcept override {
-    in_ = 0;
-    ic_ = 0;
-    ih_ = 0;
-    iw_ = 0;
-    on_ = 0;
-    oc_ = 0;
-    oh_ = 0;
-    ow_ = 0;
-    kernel_name_ = "BatchToSpace";
-    input_size_list_.clear();
-    output_size_list_.clear();
-    crops_.clear();
-    input_shape_.clear();
-  }

 protected:
  void InitSizeLists() override {
-    input_size_list_.push_back(input_size_);
-    output_size_list_.push_back(output_size_);
-  }
-
-  void CheckParam(const CNodePtr &kernel_node) {
-    block_size_ = GetAttr<int64_t>(kernel_node, "block_size");
-    if (block_size_ < 1) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the 'block_size' cannot be less than 1, but got "
-                        << block_size_;
-    }
-    size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
-    if (input_num != 1) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
-    }
-    size_t output_num = common::AnfAlgo::GetOutputTensorNum(kernel_node);
-    if (output_num != 1) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
-    }
-
-    // check input_shape
-    auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
-    if (input_shape.size() != SHAPE_SIZE) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input should be 4, but got "
-                        << input_shape.size();
-    }
-    if ((input_shape[0] % (block_size_ * block_size_)) != 0) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_
-                        << "', input_shape[0] should be divisible by product of block_shape, but got input_shape[0]: "
-                        << input_shape[0] << ", block_shape: " << block_size_;
-    }
-    for (size_t idx = 0; idx < SHAPE_SIZE; ++idx) {
-      if (input_shape[idx] < 1) {
-        MS_LOG(EXCEPTION) << "For '" << kernel_name_
-                          << "', the element of shape of input cannot be less than 1, but got "
-                          << CONVERT_VECTOR_TO_STRING(input_shape);
-      }
-    }
-    input_shape_.assign(input_shape.begin(), input_shape.end());
-
-    // check crops
-    crops_ = (GetAttr<std::vector<std::vector<int64_t>>>(kernel_node, "crops"));
-
-    if (crops_.size() != CROPS_SHAPE_0) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the size of 'crops' should be " << CROPS_SHAPE_0
-                        << ", but got " << crops_.size();
-    }
-    if (crops_[0].size() != CROPS_SHAPE_1 || crops_[1].size() != CROPS_SHAPE_1) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the size of element of 'crops' should be " << CROPS_SHAPE_1
-                        << ", but got the size of crops[0]: " << crops_[0].size()
-                        << ", the size of crops[1]: " << crops_[1].size();
-    } else {
-      for (size_t idx_i = 0; idx_i < CROPS_SHAPE_0; ++idx_i) {
-        for (size_t idx_j = 0; idx_j < CROPS_SHAPE_1; ++idx_j) {
-          if (crops_[idx_i][idx_j] < 0) {
-            MS_LOG(EXCEPTION) << "For '" << kernel_name_
-                              << "', the element of 'crops' should be greater than or equal to 0, but got crops["
-                              << idx_i << "][" << idx_j << "]: " << crops_[idx_i][idx_j];
-          }
-        }
-        auto tmp_shape = input_shape[idx_i + CROPS_SHAPE_1] * block_size_ - crops_[idx_i][0] - crops_[idx_i][1];
-        if (tmp_shape <= 0) {
-          MS_LOG(EXCEPTION) << "For '" << kernel_name_
-                            << "', the element of shape of output should be greater than 0, but got " << tmp_shape;
-        }
-      }
-    }
+    input_size_list_ = helper_ptr_->GetInputSizeList();
+    output_size_list_ = helper_ptr_->GetOutputSizeList();
  }

 private:
-  std::vector<std::vector<int64_t>> crops_;
-  std::vector<size_t> input_shape_;
-  size_t block_size_;
-  size_t input_size_;
-  size_t output_size_;
-  size_t in_;
-  size_t ic_;
-  size_t ih_;
-  size_t iw_;
-  size_t on_;
-  size_t oc_;
-  size_t oh_;
-  size_t ow_;
  std::string kernel_name_;
+  std::unique_ptr<cukernel::BatchToSpaceHelperGpuKernel<T>> helper_ptr_ = nullptr;
+  cukernel::BatchToSpaceAttr attr_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/unique_gpu_kernel.h
@ -18,9 +18,10 @@
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_UNIQUE_GPU_KERNEL_H_

 #include <vector>
+#include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/unique_helper.h"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename S>
@ -34,32 +35,31 @@ class UniqueGpuKernelMod : public NativeGpuKernelMod {
    if (is_null_input_) {
      return true;
    }
-    T *input = GetDeviceAddress<T>(inputs, 0);
-    S *input_index = GetDeviceAddress<S>(workspace, 0);
-    S *sorted_index = GetDeviceAddress<S>(workspace, 1);
-    T *output = GetDeviceAddress<T>(outputs, 0);
-    S *index = GetDeviceAddress<S>(outputs, 1);
    stream_ptr_ = stream_ptr;
-    post_output_size_ = CalUnique(input, num_elements_, input_index, sorted_index, output, index,
-                                  reinterpret_cast<cudaStream_t>(stream_ptr));
+    std::vector<void *> input_ptrs = ConvertPtrs(inputs);
+    std::vector<void *> work_ptrs = ConvertPtrs(workspace);
+    std::vector<void *> output_ptrs = ConvertPtrs(outputs);
+    if (helper_ptr_->Process(input_ptrs, output_ptrs, work_ptrs, stream_ptr) != 0) {
+      return false;
+    }
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
-    auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
+    auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
+    helper_ptr_ = std::make_unique<cukernel::UniqueHelperGpuKernel<T, S>>(kernel_name);
+    helper_ptr_->ResetResource();
+    std::vector<std::vector<size_t>> input_shapes;
+    std::vector<std::vector<size_t>> output_shapes;
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
    is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name, "input");
    if (is_null_input_) {
      InitSizeLists();
      return true;
    }
-    for (auto x : shape) {
-      num_elements_ *= x;
-    }
-    input_size_ = num_elements_ * sizeof(T);
-    output_size_ = input_size_;
-    workspace_size_ = num_elements_ * sizeof(S);
+    input_shapes.emplace_back(shape);
+    helper_ptr_->CalMemSize(input_shapes, output_shapes);
    InitSizeLists();
    return true;
  }
@ -73,7 +73,7 @@ class UniqueGpuKernelMod : public NativeGpuKernelMod {
    for (size_t i = 0; i < output_num; ++i) {
      std::vector<size_t> shape = common::AnfAlgo::GetOutputInferShape(kernel_node_.lock(), i);
      if (i == 0) {
-        shape[0] = post_output_size_;
+        shape[0] = helper_ptr_->GetOutSize();
      }
      TypeId type_id = common::AnfAlgo::GetOutputInferDataType(kernel_node_.lock(), i);
      type_ids.emplace_back(type_id);
@ -83,11 +83,6 @@ class UniqueGpuKernelMod : public NativeGpuKernelMod {
  }

  void ResetResource() noexcept override {
-    input_size_ = 0;
-    output_size_ = 0;
-    workspace_size_ = 0;
-    num_elements_ = 1;
-    post_output_size_ = 0;
    is_null_input_ = false;
    stream_ptr_ = nullptr;
    input_size_list_.clear();
@ -97,21 +92,15 @@ class UniqueGpuKernelMod : public NativeGpuKernelMod {

 protected:
  void InitSizeLists() override {
-    input_size_list_.push_back(input_size_);
-    output_size_list_.push_back(output_size_);
-    output_size_list_.push_back(num_elements_ * sizeof(S));
-    workspace_size_list_.push_back(workspace_size_);
-    workspace_size_list_.push_back(workspace_size_);
+    input_size_list_ = helper_ptr_->GetInputSizeList();
+    output_size_list_ = helper_ptr_->GetOutputSizeList();
+    workspace_size_list_ = helper_ptr_->GetWorkSizeList();
  }

 private:
  void *stream_ptr_;
-  size_t input_size_;
-  size_t output_size_;
-  size_t workspace_size_;
-  int num_elements_;
-  int post_output_size_;
  bool is_null_input_;
+  std::unique_ptr<cukernel::UniqueHelperGpuKernel<T, S>> helper_ptr_ = nullptr;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/batchtospace_helper.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/batchtospace_helper.h
@ -0,0 +1,161 @@
+/**
+ * Copyright 2019-2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_BATCHTOSPACE_HELPER_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_BATCHTOSPACE_HELPER_H_
+#include <string>
+#include <vector>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/helper_base.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh"
+
+namespace mindspore {
+namespace cukernel {
+// Expected number of input / output shapes passed to CalMemSize().
+// NOTE(review): unique_helper.h declares constants with these same names in
+// namespace cukernel; including both headers in one translation unit would
+// redefine them - confirm.
+constexpr size_t INPUT_NUM = 1;
+constexpr size_t OUTPUT_NUM = 1;
+// Required rank of the input shape checked by CheckKernelParam().
+constexpr size_t SHAPE_SIZE = 4;
+// Required outer and inner sizes of the 'crops' attribute (a 2 x 2 list).
+constexpr size_t CROPS_SHAPE_0 = 2;
+constexpr size_t CROPS_SHAPE_1 = 2;
+
+// Attributes of the BatchToSpace operator that the helper validates in
+// CheckKernelParam() and reads again in Process().
+struct BatchToSpaceAttr : public GpuKernelAttrBase {
+  // Crop amounts; CheckKernelParam() requires a 2 x 2 list of non-negative values.
+  std::vector<std::vector<int64_t>> crops;
+  // Shape of the input tensor; CheckKernelParam() requires exactly 4 dimensions.
+  std::vector<size_t> input_shape;
+  // Block size; zero-initialised so an unset attribute is rejected by the '< 1' check
+  // instead of being read as an indeterminate value.
+  size_t block_size{0};
+};
+
+// CUDA helper for the BatchToSpace operator.
+//
+// Usage order, as driven by the kernel mod: ResetResource(), CheckKernelParam(),
+// CalMemSize(), then Process() for every launch. All status-returning methods
+// return 0 on success and a non-zero value on failure.
+template <typename T>
+class BatchToSpaceHelperGpuKernel : public GpuKernelHelperBase {
+ public:
+  explicit BatchToSpaceHelperGpuKernel(std::string &kernel_name) : GpuKernelHelperBase(kernel_name) {}
+  virtual ~BatchToSpaceHelperGpuKernel() = default;
+
+  // Computes the byte size of the single input and the single output and stores the
+  // number of output elements in kernel_size_.
+  int CalMemSize(const std::vector<std::vector<size_t>> &input_shapes,
+                 const std::vector<std::vector<size_t>> &output_shapes) override {
+    int flag = CalShapesSizeInBytes<T>(input_shapes, INPUT_NUM, kernel_name_, "input_shapes", &input_size_list_);
+    if (flag != 0) {
+      return flag;
+    }
+    flag = CalShapesSizeInBytes<T>(output_shapes, OUTPUT_NUM, kernel_name_, "output_shapes", &output_size_list_);
+    if (flag != 0) {
+      return flag;
+    }
+    kernel_size_ = output_size_list_[0] / sizeof(T);
+    return 0;
+  }
+
+  // Launches CalBatchToSpace on the given stream using the attributes accepted by
+  // CheckKernelParam(). work_ptrs is not used by this operator.
+  int Process(const std::vector<void *> &input_ptrs, const std::vector<void *> &output_ptrs,
+              const std::vector<void *> &work_ptrs, void *cuda_stream) override {
+    // Without validated attributes the shape reads below would dereference a null pointer.
+    if (attr_ptr_ == nullptr) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', CheckKernelParam must succeed before Process is called";
+      return -1;
+    }
+    const size_t block_size = attr_ptr_->block_size;
+    const auto &crops = attr_ptr_->crops;
+
+    // The input shape is read in the order (n, c, h, w).
+    size_t in = attr_ptr_->input_shape[0];
+    size_t ic = attr_ptr_->input_shape[1];
+    size_t ih = attr_ptr_->input_shape[2];
+    size_t iw = attr_ptr_->input_shape[3];
+
+    size_t on = in / (block_size * block_size);
+    size_t oc = ic;
+    size_t oh = ih * block_size - crops[0][0] - crops[0][1];
+    size_t ow = iw * block_size - crops[1][0] - crops[1][1];
+
+    T *input_ptr = nullptr;
+    T *output_ptr = nullptr;
+    int flag = GetDeviceAddress<T>(input_ptrs, 0, kernel_name_, &input_ptr);
+    if (flag != 0) {
+      return flag;
+    }
+
+    flag = GetDeviceAddress<T>(output_ptrs, 0, kernel_name_, &output_ptr);
+    if (flag != 0) {
+      return flag;
+    }
+
+    CalBatchToSpace<T>(kernel_size_, input_ptr, in, ih, iw, ic, on, oh, ow, oc, crops[0][0], crops[0][1],
+                       crops[1][0], crops[1][1], block_size, output_ptr,
+                       reinterpret_cast<cudaStream_t>(cuda_stream));
+
+    return 0;
+  }
+
+  // Clears the computed sizes so the helper can be sized again.
+  void ResetResource() override {
+    kernel_size_ = 0;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    work_size_list_.clear();
+  }
+
+  // Validates the operator attributes and remembers them for Process().
+  // kernel_attr must point to a BatchToSpaceAttr; only the pointer is stored, so the
+  // attribute object has to stay alive for as long as Process() may be called.
+  int CheckKernelParam(GpuKernelAttrBase *kernel_attr) override {
+    attr_ptr_ = dynamic_cast<BatchToSpaceAttr *>(kernel_attr);
+    // dynamic_cast yields nullptr for a null argument or an attribute of another type.
+    if (attr_ptr_ == nullptr) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the kernel attribute is null or is not a BatchToSpaceAttr";
+      return -1;
+    }
+    if (attr_ptr_->block_size < 1) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the 'block_size' cannot be less than 1, but got "
+                    << attr_ptr_->block_size;
+      return -1;
+    }
+
+    // check input_shape
+    if (attr_ptr_->input_shape.size() != SHAPE_SIZE) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input should be 4, but got "
+                    << attr_ptr_->input_shape.size();
+      return -1;
+    }
+    if ((attr_ptr_->input_shape[0] % (attr_ptr_->block_size * attr_ptr_->block_size)) != 0) {
+      MS_LOG(ERROR) << "For '" << kernel_name_
+                    << "', input_shape[0] should be divisible by product of block_shape, but got input_shape[0]: "
+                    << attr_ptr_->input_shape[0] << ", block_shape: " << attr_ptr_->block_size;
+      return -1;
+    }
+    for (size_t idx = 0; idx < SHAPE_SIZE; ++idx) {
+      if (attr_ptr_->input_shape[idx] < 1) {
+        MS_LOG(ERROR) << "For '" << kernel_name_ << "', the element of shape of input cannot be less than 1, but got "
+                      << ConvertVectorToString(attr_ptr_->input_shape);
+        return -1;
+      }
+    }
+
+    // check crops
+    if (attr_ptr_->crops.size() != CROPS_SHAPE_0) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the size of 'crops' should be " << CROPS_SHAPE_0 << ", but got "
+                    << attr_ptr_->crops.size();
+      return -1;
+    }
+    if (attr_ptr_->crops[0].size() != CROPS_SHAPE_1 || attr_ptr_->crops[1].size() != CROPS_SHAPE_1) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the size of element of 'crops' should be " << CROPS_SHAPE_1
+                    << ", but got the size of crops[0]: " << attr_ptr_->crops[0].size()
+                    << ", the size of crops[1]: " << attr_ptr_->crops[1].size();
+      return -1;
+    }
+    for (size_t idx_i = 0; idx_i < CROPS_SHAPE_0; ++idx_i) {
+      for (size_t idx_j = 0; idx_j < CROPS_SHAPE_1; ++idx_j) {
+        if (attr_ptr_->crops[idx_i][idx_j] < 0) {
+          MS_LOG(ERROR) << "For '" << kernel_name_
+                        << "', the element of 'crops' should be greater than or equal to 0, but got crops[" << idx_i
+                        << "][" << idx_j << "]: " << attr_ptr_->crops[idx_i][idx_j];
+          return -1;
+        }
+      }
+      // Evaluate the cropped extent in signed arithmetic: with unsigned operands an
+      // over-crop wraps around to a huge positive value and the '<= 0' test never fires.
+      const int64_t scaled_extent =
+        static_cast<int64_t>(attr_ptr_->input_shape[idx_i + CROPS_SHAPE_1] * attr_ptr_->block_size);
+      const int64_t tmp_shape = scaled_extent - attr_ptr_->crops[idx_i][0] - attr_ptr_->crops[idx_i][1];
+      if (tmp_shape <= 0) {
+        MS_LOG(ERROR) << "For '" << kernel_name_
+                      << "', the element of shape of output should be greater than 0, but got " << tmp_shape;
+        return -1;
+      }
+    }
+    return 0;
+  }
+
+ private:
+  // Non-owning pointer to the attributes accepted by CheckKernelParam().
+  BatchToSpaceAttr *attr_ptr_{nullptr};
+  // Number of output elements, set by CalMemSize().
+  size_t kernel_size_{0};
+};
+}  // namespace cukernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_BATCHTOSPACE_HELPER_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/cuda_class_common.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/cuda_class_common.h
@ -0,0 +1,89 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_COMMON_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_COMMON_H_
+
+#include <string>
+#include <vector>
+#include "mindspore/core/utils/log_adapter.h"
+namespace mindspore {
+namespace cukernel {
+// 1. Refine the error codes.
+
+// Formats a shape as "(d0, d1, ...)"; an empty vector yields "()".
+// Built on std::string so the header needs no stream include.
+inline std::string ConvertVectorToString(const std::vector<size_t> &value) {
+  std::string text = "(";
+  for (size_t idx = 0; idx < value.size(); ++idx) {
+    if (idx != 0) {
+      text += ", ";
+    }
+    text += std::to_string(value[idx]);
+  }
+  text += ")";
+  return text;
+}
+
+// Appends the byte size (element count * sizeof(T)) of every shape to *shapes_size.
+//
+// shapes       shapes to measure; must contain exactly shape_num entries.
+// shape_num    expected number of shapes.
+// kernel_name  operator name used in log messages.
+// param_name   name of the shape list used in log messages.
+// shapes_size  output list; one entry is appended per shape.
+//
+// Returns 0 on success and -1 when the number of shapes is wrong or when a shape
+// contains a zero dimension. In the zero-dimension case the (zero) size is still
+// appended, so the output list stays aligned with the input shapes.
+template <typename T>
+int CalShapesSizeInBytes(const std::vector<std::vector<size_t>> &shapes, const size_t shape_num,
+                         const std::string &kernel_name, const std::string &param_name,
+                         std::vector<size_t> *shapes_size) {
+  if (shape_num != shapes.size()) {
+    MS_LOG(ERROR) << "For '" << kernel_name << "', the number of " << param_name << " should be equal to "
+                  << shape_num << ", but got " << shapes.size();
+    return -1;
+  }
+  // Kept as int so that -1 is returned as-is instead of going through an unsigned value.
+  int return_flag = 0;
+  for (size_t idx = 0; idx < shape_num; ++idx) {
+    size_t cur_size = sizeof(T);
+    if (shapes[idx].empty()) {
+      // An empty shape is a scalar: exactly one element.
+      MS_LOG(WARNING) << "For '" << kernel_name << "', the shapes[" << idx << "] is ( )";
+      shapes_size->emplace_back(cur_size);
+      continue;
+    }
+    for (const auto &val : shapes[idx]) {
+      cur_size *= val;
+    }
+    if (cur_size == 0) {
+      MS_LOG(WARNING) << "For '" << kernel_name << "', the shape cannot contain zero, but got shapes[" << idx << "] is "
+                      << ConvertVectorToString(shapes[idx]);
+      return_flag = -1;
+    }
+    shapes_size->emplace_back(cur_size);
+  }
+  return return_flag;
+}
+
+// Fetches addr_list[index] as a T pointer.
+//
+// addr_list    list of untyped addresses.
+// index        position to read.
+// kernel_name  operator name used in the null-address log message.
+// out_ptr      receives the typed pointer on success; left untouched on failure.
+//
+// Returns 0 on success and -1 when index is out of range or the entry is null.
+template <typename T>
+inline int GetDeviceAddress(const std::vector<void *> &addr_list, const size_t index, const std::string &kernel_name,
+                            T **out_ptr) {
+  if (index >= addr_list.size()) {
+    MS_LOG(ERROR) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
+    return -1;
+  }
+
+  void *raw_addr = addr_list[index];
+  if (raw_addr == nullptr) {
+    MS_LOG(ERROR) << "The device address is empty, address index: " << index << ", op name is: " << kernel_name;
+    return -1;
+  }
+  // void * to T * is a plain static_cast; no reinterpretation is needed.
+  *out_ptr = static_cast<T *>(raw_addr);
+  return 0;
+}
+}  // namespace cukernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_COMMON_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/helper_base.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/helper_base.h
@ -0,0 +1,63 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_HELPER_BASE_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_HELPER_BASE_H_
+
+#include <string>
+#include <vector>
+#include "mindspore/core/utils/log_adapter.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/cuda_class_common.h"
+namespace mindspore {
+namespace cukernel {
+// Polymorphic base for operator attribute structs passed to
+// GpuKernelHelperBase::CheckKernelParam(). The virtual destructor makes the type
+// polymorphic, which is what allows helpers to dynamic_cast to their own attribute type.
+struct GpuKernelAttrBase {
+  virtual ~GpuKernelAttrBase() = default;
+};
+
+// Base class of the per-operator CUDA helpers.
+//
+// A helper computes the byte sizes of its input / output / workspace buffers in
+// CalMemSize() and performs the actual work in Process(). Status-returning methods
+// use 0 for success and a non-zero value for failure.
+class GpuKernelHelperBase {
+ public:
+  // The name is copied, so a const reference is sufficient and also accepts temporaries.
+  explicit GpuKernelHelperBase(const std::string &kernel_name) : kernel_name_(kernel_name) {}
+  // The member vectors release their storage themselves.
+  virtual ~GpuKernelHelperBase() = default;
+
+  // Fills the size lists from the given shapes.
+  virtual int CalMemSize(const std::vector<std::vector<size_t>> &input_shapes,
+                         const std::vector<std::vector<size_t>> &output_shapes) = 0;
+
+  // Runs the operator on the given addresses and stream.
+  virtual int Process(const std::vector<void *> &input_ptrs, const std::vector<void *> &output_ptrs,
+                      const std::vector<void *> &work_ptrs, void *cuda_stream) = 0;
+
+  // Default implementation only logs an error; helpers are expected to override it.
+  virtual void ResetResource() {
+    MS_LOG(ERROR) << "kernel must override the `ResetResource()` method when dynamic shape";
+  }
+
+  // Size lists in bytes, returned by value.
+  std::vector<size_t> GetInputSizeList() const { return input_size_list_; }
+  std::vector<size_t> GetOutputSizeList() const { return output_size_list_; }
+  std::vector<size_t> GetWorkSizeList() const { return work_size_list_; }
+
+  // Validates operator attributes; the default accepts everything.
+  virtual int CheckKernelParam(GpuKernelAttrBase *kernel_attr) { return 0; }
+
+ protected:
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> work_size_list_;
+  std::string kernel_name_;
+};
+}  // namespace cukernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_HELPER_BASE_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/unary_helper.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/unary_helper.h
@ -0,0 +1,147 @@
+/**
+ * Copyright 2019-2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNARY_HELPER_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNARY_HELPER_H_
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/helper_base.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh"
+
+namespace mindspore {
+namespace cukernel {
+// Identifier of an element-wise unary operator. Values are consecutive starting at 0;
+// UNARY_OP_INVALID_TYPE (255) marks "not set" and is what ResetResource() assigns.
+enum UnaryOptype {
+  UNARY_OP_EXP = 0,
+  UNARY_OP_EXPM1,
+  UNARY_OP_LOG,
+  UNARY_OP_LOG1P,
+  UNARY_OP_ERF,
+  UNARY_OP_ERFC,
+  UNARY_OP_NEG,
+  UNARY_OP_RECIPROCAL,
+  UNARY_OP_SQUARE,
+  UNARY_OP_SQRT,
+  UNARY_OP_RSQRT,
+  UNARY_OP_SIN,
+  UNARY_OP_COS,
+  UNARY_OP_ASIN,
+  UNARY_OP_ACOS,
+  UNARY_OP_ATAN,
+  UNARY_OP_ASINH,
+  UNARY_OP_ACOSH,
+  UNARY_OP_ABS,
+  UNARY_OP_FLOOR,
+  UNARY_OP_RINT,
+  UNARY_OP_ROUND,
+  UNARY_OP_SIGN,
+  UNARY_OP_REAL,
+  UNARY_OP_IMAG,
+  UNARY_OP_CONJ,
+  UNARY_OP_INVALID_TYPE = 255
+};
+
+// Lookup table from operator name to its UnaryOptype, listed in enum order.
+static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {
+  {"Exp", UNARY_OP_EXP},
+  {"Expm1", UNARY_OP_EXPM1},
+  {"Log", UNARY_OP_LOG},
+  {"Log1p", UNARY_OP_LOG1P},
+  {"Erf", UNARY_OP_ERF},
+  {"Erfc", UNARY_OP_ERFC},
+  {"Neg", UNARY_OP_NEG},
+  {"Reciprocal", UNARY_OP_RECIPROCAL},
+  {"Square", UNARY_OP_SQUARE},
+  {"Sqrt", UNARY_OP_SQRT},
+  {"Rsqrt", UNARY_OP_RSQRT},
+  {"Sin", UNARY_OP_SIN},
+  {"Cos", UNARY_OP_COS},
+  {"Asin", UNARY_OP_ASIN},
+  {"ACos", UNARY_OP_ACOS},
+  {"Atan", UNARY_OP_ATAN},
+  {"Asinh", UNARY_OP_ASINH},
+  {"Acosh", UNARY_OP_ACOSH},
+  {"Abs", UNARY_OP_ABS},
+  {"Floor", UNARY_OP_FLOOR},
+  {"Rint", UNARY_OP_RINT},
+  {"Round", UNARY_OP_ROUND},
+  {"Sign", UNARY_OP_SIGN},
+  {"Real", UNARY_OP_REAL},
+  {"Imag", UNARY_OP_IMAG},
+  {"Conj", UNARY_OP_CONJ},
+};
+
+// CUDA helper for element-wise unary operators. The operator is selected by the
+// kernel name passed to the constructor and resolved in CalMemSize().
+// Status-returning methods use 0 for success and a non-zero value for failure.
+template <typename T>
+class UnaryHelperGpuKernel : public GpuKernelHelperBase {
+ public:
+  explicit UnaryHelperGpuKernel(std::string &kernel_name) : GpuKernelHelperBase(kernel_name) {}
+  virtual ~UnaryHelperGpuKernel() = default;
+
+  // Resolves the operator type from the kernel name and sizes the single input.
+  // The output size list is a copy of the input size list; output_shapes is not read.
+  int CalMemSize(const std::vector<std::vector<size_t>> &input_shapes,
+                 const std::vector<std::vector<size_t>> &output_shapes) override {
+    auto iter = kUnaryOpTypeMap.find(kernel_name_);
+    if (iter == kUnaryOpTypeMap.end()) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
+                    << " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
+                    << "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << kernel_name_;
+      return -1;
+    }
+
+    unary_op_type_ = iter->second;
+    int flag = CalShapesSizeInBytes<T>(input_shapes, 1, kernel_name_, "input_shapes", &input_size_list_);
+    // Copied before the status check on purpose: the lists stay in step even on failure.
+    output_size_list_ = input_size_list_;
+    if (flag != 0) {
+      return flag;
+    }
+    return 0;
+  }
+
+  // Dispatches to the CUDA implementation of the resolved operator.
+  // work_ptrs is not used. Real, Imag and Conj have no entry in the dispatch table
+  // and are therefore rejected here.
+  int Process(const std::vector<void *> &input_ptrs, const std::vector<void *> &output_ptrs,
+              const std::vector<void *> &work_ptrs, void *cuda_stream) override {
+    // The table is never modified after construction, hence const.
+    static const std::map<UnaryOptype, std::function<void(const T *, T *, const size_t, cudaStream_t)>> func_map = {
+      {UNARY_OP_EXP, Exponential<T>}, {UNARY_OP_EXPM1, Expm1<T>},
+      {UNARY_OP_LOG, Logarithm<T>},   {UNARY_OP_LOG1P, Log1p<T>},
+      {UNARY_OP_ERF, Erf<T>},         {UNARY_OP_ERFC, Erfc<T>},
+      {UNARY_OP_NEG, Negative<T>},    {UNARY_OP_RECIPROCAL, Reciprocal<T>},
+      {UNARY_OP_SQUARE, Square<T>},   {UNARY_OP_SQRT, Sqrt<T>},
+      {UNARY_OP_RSQRT, Rsqrt<T>},     {UNARY_OP_SIN, Sin<T>},
+      {UNARY_OP_COS, Cos<T>},         {UNARY_OP_ASIN, Asin<T>},
+      {UNARY_OP_ACOS, ACos<T>},       {UNARY_OP_ATAN, Atan<T>},
+      {UNARY_OP_ASINH, Asinh<T>},     {UNARY_OP_ACOSH, Acosh<T>},
+      {UNARY_OP_ABS, Abs<T>},         {UNARY_OP_FLOOR, Floor<T>},
+      {UNARY_OP_RINT, Rint<T>},       {UNARY_OP_ROUND, Round<T>},
+      {UNARY_OP_SIGN, Sign<T>}};
+
+    auto iter = func_map.find(unary_op_type_);
+    if (iter == func_map.end()) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
+                    << " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
+                    << "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << unary_op_type_;
+      return -1;
+    }
+    // The element count comes from the first input size; refuse to index an empty list.
+    if (input_size_list_.empty()) {
+      MS_LOG(ERROR) << "For '" << kernel_name_ << "', CalMemSize must succeed before Process is called";
+      return -1;
+    }
+
+    T *input_addr = nullptr;
+    T *output_addr = nullptr;
+    int flag = GetDeviceAddress<T>(input_ptrs, 0, kernel_name_, &input_addr);
+    if (flag != 0) {
+      return flag;
+    }
+    flag = GetDeviceAddress<T>(output_ptrs, 0, kernel_name_, &output_addr);
+    if (flag != 0) {
+      return flag;
+    }
+    iter->second(input_addr, output_addr, input_size_list_[0] / sizeof(T),
+                 reinterpret_cast<cudaStream_t>(cuda_stream));
+    return 0;
+  }
+
+  // Forgets the resolved operator type and the computed sizes.
+  void ResetResource() override {
+    unary_op_type_ = UNARY_OP_INVALID_TYPE;
+    input_size_list_.clear();
+    output_size_list_.clear();
+  }
+
+ private:
+  // Starts as invalid so Process() fails cleanly when CalMemSize() was never called.
+  UnaryOptype unary_op_type_{UNARY_OP_INVALID_TYPE};
+};
+}  // namespace cukernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNARY_HELPER_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/unique_helper.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_class/unique_helper.h
@ -0,0 +1,105 @@
+/**
+ * Copyright 2019-2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNIQUE_HELPER_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNIQUE_HELPER_H_
+#include <string>
+#include <vector>
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/helper_base.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh"
+
+namespace mindspore {
+namespace cukernel {
+constexpr size_t INPUT_NUM = 1;  // input count passed to CalShapesSizeInBytes in CalMemSize
+constexpr size_t OUTPUT_NUM = 1;  // NOTE(review): unused in this header; CalMemSize registers two outputs - confirm
+constexpr size_t WORK_NUM = 0;  // NOTE(review): unused in this header; CalMemSize registers two workspaces - confirm
+constexpr size_t SHAPE_SIZE = 4;  // NOTE(review): not referenced in this header - confirm it is needed here
+constexpr size_t CROPS_SHAPE_0 = 2;  // NOTE(review): not referenced in this header - confirm it is needed here
+constexpr size_t CROPS_SHAPE_1 = 2;  // NOTE(review): generic cukernel-scope names may clash across headers
+
+// Helper that sizes the buffers for, and launches, CalUnique.
+// T is the element type of the input and of the first output; S is the element type of the two index
+// workspaces and of the second output.
+template <typename T, typename S>
+class UniqueHelperGpuKernel : public GpuKernelHelperBase {
+ public:
+  explicit UniqueHelperGpuKernel(std::string &kernel_name) : GpuKernelHelperBase(kernel_name) {}
+  virtual ~UniqueHelperGpuKernel() = default;
+
+  // Derives all buffer sizes (in bytes) from the input shapes.
+  // Registers two workspaces of num_elements * sizeof(S) bytes, a first output as large as the input and a
+  // second output of num_elements * sizeof(S) bytes. output_shapes is not consulted.
+  // Returns 0 on success, otherwise the non-zero flag reported by CalShapesSizeInBytes.
+  int CalMemSize(const std::vector<std::vector<size_t>> &input_shapes,
+                 const std::vector<std::vector<size_t>> &output_shapes) override {
+    int flag = CalShapesSizeInBytes<T>(input_shapes, INPUT_NUM, kernel_name_, "input_shapes", &input_size_list_);
+    if (flag != 0) {
+      return flag;
+    }
+    const auto input_bytes = input_size_list_[0];
+    // num_elements_ is an int member, so the narrowing from the byte-size arithmetic is spelled out.
+    // NOTE(review): no range check is done here; confirm inputs cannot exceed INT_MAX elements.
+    num_elements_ = static_cast<int>(input_bytes / sizeof(T));
+    const size_t index_bytes = num_elements_ * sizeof(S);
+    work_size_list_.emplace_back(index_bytes);
+    work_size_list_.emplace_back(index_bytes);
+    output_size_list_.emplace_back(input_bytes);
+    output_size_list_.emplace_back(index_bytes);
+    return 0;
+  }
+
+  // Resolves the typed device pointers (input 0, workspaces 0-1, outputs 0-1) and runs CalUnique on the stream.
+  // The value returned by CalUnique is kept in post_output_size_ and exposed through GetOutSize()
+  // (presumably the number of unique elements - TODO confirm against unique_impl.cuh).
+  // Returns 0 on success, otherwise the first non-zero flag reported by GetDeviceAddress.
+  int Process(const std::vector<void *> &input_ptrs, const std::vector<void *> &output_ptrs,
+              const std::vector<void *> &work_ptrs, void *cuda_stream) override {
+    T *t_input_ptr = nullptr;
+    S *s_input_index = nullptr;
+    S *s_sorted_index = nullptr;
+    T *t_output_ptr = nullptr;
+    S *s_output_index = nullptr;
+    int flag = GetDeviceAddress<T>(input_ptrs, 0, kernel_name_, &t_input_ptr);
+    if (flag != 0) {
+      return flag;
+    }
+
+    flag = GetDeviceAddress<S>(work_ptrs, 0, kernel_name_, &s_input_index);
+    if (flag != 0) {
+      return flag;
+    }
+
+    flag = GetDeviceAddress<S>(work_ptrs, 1, kernel_name_, &s_sorted_index);
+    if (flag != 0) {
+      return flag;
+    }
+
+    flag = GetDeviceAddress<T>(output_ptrs, 0, kernel_name_, &t_output_ptr);
+    if (flag != 0) {
+      return flag;
+    }
+
+    flag = GetDeviceAddress<S>(output_ptrs, 1, kernel_name_, &s_output_index);
+    if (flag != 0) {
+      return flag;
+    }
+
+    post_output_size_ = CalUnique(t_input_ptr, num_elements_, s_input_index, s_sorted_index, t_output_ptr,
+                                  s_output_index, reinterpret_cast<cudaStream_t>(cuda_stream));
+    return 0;
+  }
+
+  // Restores the default element count / result size and empties every cached size list.
+  void ResetResource() override {
+    num_elements_ = 1;
+    post_output_size_ = 0;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    work_size_list_.clear();
+  }
+
+  // Value returned by the most recent CalUnique call in Process(); 0 if Process() has not run since
+  // construction or the last ResetResource().
+  int GetOutSize() const { return post_output_size_; }
+
+ private:
+  // In-class defaults match ResetResource(), so the members are never read while indeterminate.
+  int num_elements_{1};
+  int post_output_size_{0};
+};
+}  // namespace cukernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNIQUE_HELPER_H_
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/gpu_kernel.h
@ -113,6 +113,14 @@ class NativeGpuKernelMod : public GpuKernelMod {
    return reinterpret_cast<T *>(addr_list[index]->addr);
  }

+  // Flattens a list of AddressPtr into the raw `addr` pointers they carry, preserving order.
+  // A null AddressPtr entry is forwarded as nullptr instead of being dereferenced.
+  // NOTE(review): assumes consumers of the returned list check entries for nullptr - confirm.
+  std::vector<void *> ConvertPtrs(const std::vector<AddressPtr> &input_ptrs) {
+    std::vector<void *> out_ptrs;
+    out_ptrs.reserve(input_ptrs.size());
+    for (const auto &cur_addr : input_ptrs) {
+      out_ptrs.emplace_back(cur_addr == nullptr ? nullptr : cur_addr->addr);
+    }
+    return out_ptrs;
+  }
+
  template <typename T>
  inline T *GetPossiblyNullDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
    if (index >= addr_list.size()) {
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_complex_gpu_kernel.h
@ -39,21 +39,21 @@ class UnaryOpComplexGpuKernelMod : public NativeGpuKernelMod {

    S *output_addr = GetDeviceAddress<S>(outputs, 0);
    switch (unary_op_type_) {
-      case UNARY_OP_REAL: {
+      case cukernel::UNARY_OP_REAL: {
        if constexpr (!std::is_same<S, utils::Complex<float>>::value &&
                      !std::is_same<S, utils::Complex<double>>::value) {
          Real(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
        }
        break;
      }
-      case UNARY_OP_IMAG: {
+      case cukernel::UNARY_OP_IMAG: {
        if constexpr (!std::is_same<S, utils::Complex<float>>::value &&
                      !std::is_same<S, utils::Complex<double>>::value) {
          Imag(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
        }
        break;
      }
-      case UNARY_OP_CONJ: {
+      case cukernel::UNARY_OP_CONJ: {
        if constexpr (std::is_same<T, S>::value && !std::is_same<T, bool>::value) {
          Conj(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
        }
@ -112,8 +112,8 @@ class UnaryOpComplexGpuKernelMod : public NativeGpuKernelMod {
 private:
  void GetOpType(const CNodePtr &kernel_node) {
    std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
-    static std::map<std::string, UnaryOptype> kComplexSupportedTypeMap = {
-      {"Real", UNARY_OP_REAL}, {"Imag", UNARY_OP_IMAG}, {"Conj", UNARY_OP_CONJ}};
+    static std::map<std::string, cukernel::UnaryOptype> kComplexSupportedTypeMap = {
+      {"Real", cukernel::UNARY_OP_REAL}, {"Imag", cukernel::UNARY_OP_IMAG}, {"Conj", cukernel::UNARY_OP_CONJ}};
    auto iter = kComplexSupportedTypeMap.find(kernel_name);
    if (iter != kComplexSupportedTypeMap.end()) {
      unary_op_type_ = iter->second;
@ -128,7 +128,7 @@ class UnaryOpComplexGpuKernelMod : public NativeGpuKernelMod {
  size_t output_size_;
  size_t workspace_size_;
  bool is_null_input_;
-  UnaryOptype unary_op_type_;
+  cukernel::UnaryOptype unary_op_type_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/math/unary_op_gpu_kernel.h
@ -21,58 +21,13 @@
 #include <functional>
 #include <vector>
 #include <string>
-#include <map>
+#include <memory>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
-#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/unary_helper.h"

 namespace mindspore {
 namespace kernel {
-enum UnaryOptype {
-  UNARY_OP_EXP = 0,
-  UNARY_OP_EXPM1,
-  UNARY_OP_LOG,
-  UNARY_OP_LOG1P,
-  UNARY_OP_ERF,
-  UNARY_OP_ERFC,
-  UNARY_OP_NEG,
-  UNARY_OP_RECIPROCAL,
-  UNARY_OP_SQUARE,
-  UNARY_OP_SQRT,
-  UNARY_OP_RSQRT,
-  UNARY_OP_SIN,
-  UNARY_OP_COS,
-  UNARY_OP_ASIN,
-  UNARY_OP_ACOS,
-  UNARY_OP_ATAN,
-  UNARY_OP_ASINH,
-  UNARY_OP_ACOSH,
-  UNARY_OP_ABS,
-  UNARY_OP_FLOOR,
-  UNARY_OP_RINT,
-  UNARY_OP_ROUND,
-  UNARY_OP_SIGN,
-  UNARY_OP_REAL,
-  UNARY_OP_IMAG,
-  UNARY_OP_CONJ,
-  UNARY_OP_INVALID_TYPE = 255
-};
-
-static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {
-  {"Exp", UNARY_OP_EXP},       {"Expm1", UNARY_OP_EXPM1},
-  {"Log", UNARY_OP_LOG},       {"Log1p", UNARY_OP_LOG1P},
-  {"Erf", UNARY_OP_ERF},       {"Erfc", UNARY_OP_ERFC},
-  {"Neg", UNARY_OP_NEG},       {"Reciprocal", UNARY_OP_RECIPROCAL},
-  {"Square", UNARY_OP_SQUARE}, {"Sqrt", UNARY_OP_SQRT},
-  {"Rsqrt", UNARY_OP_RSQRT},   {"Sin", UNARY_OP_SIN},
-  {"Cos", UNARY_OP_COS},       {"Asin", UNARY_OP_ASIN},
-  {"ACos", UNARY_OP_ACOS},     {"Atan", UNARY_OP_ATAN},
-  {"Asinh", UNARY_OP_ASINH},   {"Acosh", UNARY_OP_ACOSH},
-  {"Abs", UNARY_OP_ABS},       {"Floor", UNARY_OP_FLOOR},
-  {"Rint", UNARY_OP_RINT},     {"Round", UNARY_OP_ROUND},
-  {"Real", UNARY_OP_REAL},     {"Imag", UNARY_OP_IMAG},
-  {"Sign", UNARY_OP_SIGN},     {"Conj", UNARY_OP_CONJ}};
-
 template <typename T>
 class UnaryOpGpuKernelMod : public NativeGpuKernelMod {
 public:
@ -84,72 +39,50 @@ class UnaryOpGpuKernelMod : public NativeGpuKernelMod {
    if (is_null_input_) {
      return true;
    }
-
-    static std::map<UnaryOptype, std::function<void(const T *, T *, const size_t, cudaStream_t)>> func_map = {
-      {UNARY_OP_EXP, Exponential<T>}, {UNARY_OP_EXPM1, Expm1<T>},
-      {UNARY_OP_LOG, Logarithm<T>},   {UNARY_OP_LOG1P, Log1p<T>},
-      {UNARY_OP_ERF, Erf<T>},         {UNARY_OP_ERFC, Erfc<T>},
-      {UNARY_OP_NEG, Negative<T>},    {UNARY_OP_RECIPROCAL, Reciprocal<T>},
-      {UNARY_OP_SQUARE, Square<T>},   {UNARY_OP_SQRT, Sqrt<T>},
-      {UNARY_OP_RSQRT, Rsqrt<T>},     {UNARY_OP_SIN, Sin<T>},
-      {UNARY_OP_COS, Cos<T>},         {UNARY_OP_ASIN, Asin<T>},
-      {UNARY_OP_ACOS, ACos<T>},       {UNARY_OP_ATAN, Atan<T>},
-      {UNARY_OP_ASINH, Asinh<T>},     {UNARY_OP_ACOSH, Acosh<T>},
-      {UNARY_OP_ABS, Abs<T>},         {UNARY_OP_FLOOR, Floor<T>},
-      {UNARY_OP_RINT, Rint<T>},       {UNARY_OP_ROUND, Round<T>},
-      {UNARY_OP_SIGN, Sign<T>}};
-
-    auto iter = func_map.find(unary_op_type_);
-    if (iter != func_map.end()) {
-      T *input_addr = GetDeviceAddress<T>(inputs, 0);
-      T *output_addr = GetDeviceAddress<T>(outputs, 0);
-      iter->second(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
-    } else {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
-                        << " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
-                        << "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << unary_op_type_;
+    std::vector<void *> input_addrs;
+    std::vector<void *> output_addrs;
+    std::vector<void *> work_addrs;
+    for (size_t idx = 0; idx < inputs.size(); ++idx) {
+      void *cur_ptr = reinterpret_cast<void *>(GetDeviceAddress<T>(inputs, idx));
+      input_addrs.emplace_back(cur_ptr);
+    }
+    for (size_t idx = 0; idx < outputs.size(); ++idx) {
+      void *cur_ptr = reinterpret_cast<void *>(GetDeviceAddress<T>(outputs, idx));
+      output_addrs.emplace_back(cur_ptr);
+    }
+    int flag = helper_ptr_->Process(input_addrs, output_addrs, work_addrs, stream_ptr);
+    if (flag != 0) {
+      return false;
    }
-
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
-    std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
-    auto iter = kUnaryOpTypeMap.find(kernel_name);
-    if (iter == kUnaryOpTypeMap.end()) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
-                        << " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
-                        << "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << kernel_name;
-    }
-    unary_op_type_ = iter->second;
-    size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
-    if (input_num != 1) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
-    }
-    size_t output_num = common::AnfAlgo::GetOutputTensorNum(kernel_node);
-    if (output_num != 1) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
-    }
+    std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);  // Init: build helper, cache its sizes
+    helper_ptr_ = std::make_unique<cukernel::UnaryHelperGpuKernel<T>>(kernel_name);  // new helper on every Init
+    helper_ptr_->ResetResource();  // helper starts with an invalid op type and empty size lists
+    std::vector<std::vector<size_t>> input_shapes;
+    std::vector<std::vector<size_t>> output_shapes;
    auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
+    auto output_shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, 0);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
-      InitSizeLists();
+      input_size_list_.emplace_back(0);  // null input: report zero-sized buffers and skip CalMemSize
+      output_size_list_.emplace_back(0);
      return true;
    }
-    for (size_t i = 0; i < input_shape.size(); i++) {
-      input_size_ *= input_shape[i];
+    input_shapes.emplace_back(input_shape);  // only tensor 0 of each side is handed to the helper
+    output_shapes.emplace_back(output_shape);
+    int flag = helper_ptr_->CalMemSize(input_shapes, output_shapes);  // non-zero flag means failure
+    if (flag != 0) {
+      return false;  // Init fails when the helper could not compute its sizes
    }
-    output_size_ = input_size_;
    InitSizeLists();
    return true;
  }
+
  void ResetResource() noexcept override {
-    unary_op_type_ = UNARY_OP_INVALID_TYPE;
-    input_size_ = sizeof(T);
-    output_size_ = sizeof(T);
-    workspace_size_ = 0;
-    is_null_input_ = false;
    input_size_list_.clear();
    output_size_list_.clear();
    workspace_size_list_.clear();
@ -157,15 +90,13 @@ class UnaryOpGpuKernelMod : public NativeGpuKernelMod {

 protected:
  void InitSizeLists() override {
-    input_size_list_.push_back(input_size_);
-    output_size_list_.push_back(output_size_);
+    input_size_list_ = helper_ptr_->GetInputSizeList();  // take the helper's sizes; Init() sets helper_ptr_ first
+    output_size_list_ = helper_ptr_->GetOutputSizeList();
+    workspace_size_list_ = helper_ptr_->GetWorkSizeList();  // helper's work list becomes the workspace list
  }

 private:
-  UnaryOptype unary_op_type_;
-  size_t input_size_;
-  size_t output_size_;
-  size_t workspace_size_;
+  std::unique_ptr<cukernel::UnaryHelperGpuKernel<T>> helper_ptr_ = nullptr;
  bool is_null_input_;
 };
 }  // namespace kernel