!3495 GPU StridedSlice performance optimization
Merge pull request !3495 from chenweifeng/strided_slice
Commit: c22792aab1
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/slice_gpu_kernel.cc
@@ -24,11 +24,5 @@ MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutp
                       SliceGpuFwdKernel, int)
 MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
                       SliceGpuFwdKernel, half)
-MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-                      SliceGpuFwdKernel, float)
-MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-                      SliceGpuFwdKernel, half)
-MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-                      SliceGpuFwdKernel, int)
 }  // namespace kernel
 }  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/slice_gpu_kernel.h
@@ -41,14 +41,9 @@ class SliceGpuFwdKernel : public GpuKernel {
     }
     T *input = GetDeviceAddress<T>(inputs, 0);
     T *output = GetDeviceAddress<T>(outputs, 0);
-    if (is_strided_slice_) {
-      CalStridedSlice(output_size_ / sizeof(T), input, input_shape_, begin_, size_, strides_, output,
-                      reinterpret_cast<cudaStream_t>(stream_ptr));
-    } else {
-      Slice4DKernel(begin_[0], begin_[1], begin_[2], begin_[3], size_[0], size_[1], size_[2], size_[3], input_shape_[0],
-                    input_shape_[1], input_shape_[2], input_shape_[3], input, output,
-                    reinterpret_cast<cudaStream_t>(stream_ptr));
-    }
+    Slice4DKernel(begin_[0], begin_[1], begin_[2], begin_[3], size_[0], size_[1], size_[2], size_[3], input_shape_[0],
+                  input_shape_[1], input_shape_[2], input_shape_[3], input, output,
+                  reinterpret_cast<cudaStream_t>(stream_ptr));
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
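With the dispatch branch removed, SliceGpuFwdKernel always takes the Slice4D path; StridedSlice now has its own kernel (added below). The index arithmetic Slice4D relies on is easiest to check on the host. The following is an illustrative C++ reference of the same gather — hypothetical helper names, not code from this PR:

#include <vector>

// Host-side sketch of the gather Slice4D performs on the GPU.
// s* = slice begin per axis, l* = slice length per axis, d* = input dims.
template <typename T>
void Slice4DReference(int s1, int s2, int s3, int s4, int l1, int l2, int l3, int l4,
                      int d1, int d2, int d3, int d4, const std::vector<T> &input, std::vector<T> *output) {
  // d1 is unused, just as in the CUDA kernel: the leading dim never enters the stride math.
  for (int pos = 0; pos < l1 * l2 * l3 * l4; ++pos) {
    int i = pos / (l2 * l3 * l4) % l1;  // coordinate on axis 0
    int j = pos / (l3 * l4) % l2;       // coordinate on axis 1
    int k = pos / l4 % l3;              // coordinate on axis 2
    int o = pos % l4;                   // coordinate on axis 3
    // Shift by the slice origin and re-linearize against the *input* dims.
    int offset = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4);
    (*output)[pos] = input[offset];
  }
}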
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/slice_grad_gpu_kernel.cc
@@ -29,11 +29,5 @@ MS_REG_GPU_KERNEL_ONE(
   SliceGrad,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
   SliceGradGpuKernel, half)
-MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-                      SliceGradGpuKernel, float)
-MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-                      SliceGradGpuKernel, int)
-MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-                      SliceGradGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/slice_grad_gpu_kernel.h
@@ -38,13 +38,8 @@ class SliceGradGpuKernel : public GpuKernel {
     T *dy = GetDeviceAddress<T>(inputs, 0);
     T *dx = GetDeviceAddress<T>(outputs, 0);
     FillDeviceArray(outputs[0]->size / sizeof(T), dx, 0.f, reinterpret_cast<cudaStream_t>(stream_ptr));
-    if (is_strided_slice_) {
-      CalStridedSliceGrad(output_size_ / sizeof(T), dy, input_shape_, begin_, size_, strides_, dx,
-                          reinterpret_cast<cudaStream_t>(stream_ptr));
-    } else {
-      CalSliceGrad(output_size_ / sizeof(T), dy, input_shape_, begin_, size_, dx,
-                   reinterpret_cast<cudaStream_t>(stream_ptr));
-    }
+    CalSliceGrad(output_size_ / sizeof(T), dy, input_shape_, begin_, size_, dx,
+                 reinterpret_cast<cudaStream_t>(stream_ptr));
     return true;
   }
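The grad kernel zero-fills dx first because a slice's gradient is zero everywhere outside the sliced window; CalSliceGrad then copies each contiguous innermost row of dy back to its source offset. A hedged host-side reference of that semantics (illustrative helper, not MindSpore code):

#include <vector>

// dx is the zero-filled input-shaped buffer; each innermost row of dy
// is copied back to where it was read from in the forward pass.
template <typename T>
void SliceGradReference(const std::vector<T> &dy, const std::vector<int> &in_shape,
                        const std::vector<int> &begin, const std::vector<int> &size, std::vector<T> *dx) {
  dx->assign(static_cast<size_t>(in_shape[0]) * in_shape[1] * in_shape[2] * in_shape[3], T(0));
  int block = in_shape[1] * in_shape[2] * in_shape[3];  // linear stride of axis 0
  int map = in_shape[2] * in_shape[3];                  // linear stride of axis 1
  int w = in_shape[3];                                  // linear stride of axis 2
  int p = 0;                                            // read cursor into dy
  for (int i = begin[0]; i < begin[0] + size[0]; ++i) {
    for (int j = begin[1]; j < begin[1] + size[1]; ++j) {
      for (int k = begin[2]; k < begin[2] + size[2]; ++k) {
        int start = i * block + j * map + k * w + begin[3];
        for (int o = 0; o < size[3]; ++o) {
          (*dx)[start + o] = dy[p++];  // innermost axis is contiguous
        }
      }
    }
  }
}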
@@ -140,7 +135,7 @@ class SliceGradGpuKernel : public GpuKernel {
   size_t input_size_;
   size_t output_size_;
   size_t workspace_size_;
 };
-};  // namespace kernel
+}  // namespace kernel
 }  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_gpu_kernel.cc
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/arrays/strided_slice_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+                      StridedSliceGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+                      StridedSliceGpuKernel, half)
+MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+                      StridedSliceGpuKernel, int)
+}  // namespace kernel
+}  // namespace mindspore
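MS_REG_GPU_KERNEL_ONE ties an op name plus a dtype signature to a kernel class in a factory. The macro's internals are not shown in this diff; a generic sketch of the static-registration pattern such macros typically expand to (all names here are illustrative, not MindSpore's):

#include <functional>
#include <map>
#include <memory>
#include <string>

struct KernelBase { virtual ~KernelBase() = default; };

// Illustrative factory: (op name, type tag) -> kernel constructor.
class KernelFactory {
 public:
  static KernelFactory &Instance() { static KernelFactory f; return f; }
  void Register(const std::string &key, std::function<std::unique_ptr<KernelBase>()> creator) {
    creators_[key] = std::move(creator);
  }
  std::unique_ptr<KernelBase> Create(const std::string &key) { return creators_.at(key)(); }
 private:
  std::map<std::string, std::function<std::unique_ptr<KernelBase>()>> creators_;
};

// A registrar object whose constructor runs at static-initialization time.
// An MS_REG_*-style macro would expand to one such static registrar per
// (op, dtype) pair, which is why each line above stands alone at file scope.
struct KernelRegistrar {
  KernelRegistrar(const std::string &key, std::function<std::unique_ptr<KernelBase>()> creator) {
    KernelFactory::Instance().Register(key, std::move(creator));
  }
};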
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_gpu_kernel.h
@@ -0,0 +1,190 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_STRIDED_SLICE_GPU_KERNEL_H
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_STRIDED_SLICE_GPU_KERNEL_H
+
+#include <vector>
+#include <bitset>
+#include <algorithm>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/slice_impl.cuh"
+
+namespace mindspore {
+namespace kernel {
+constexpr int MAX_DIMS = 4;
+template <typename T>
+class StridedSliceGpuKernel : public GpuKernel {
+ public:
+  StridedSliceGpuKernel() : null_output_(false) {}
+  ~StridedSliceGpuKernel() override = default;
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    if (null_output_) {
+      return true;
+    }
+
+    T *input = GetDeviceAddress<T>(inputs, 0);
+    T *output = GetDeviceAddress<T>(outputs, 0);
+    StridedSlice(input_shape_, begin_, strides_, output_shape_, input, output,
+                 reinterpret_cast<cudaStream_t>(stream_ptr));
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    if (input_shape_.size() > MAX_DIMS) {
+      MS_LOG(ERROR) << "StridedSlice supports at most " << MAX_DIMS << " dimensions, but got " << input_shape_.size();
+      return false;
+    }
+
+    FillEmptyDims(kernel_node);
+    ParseMasks(kernel_node);
+    FillOutputDim();
+    null_output_ = IsNullOutput();
+    InitSizeLists();
+    return true;
+  }
+
+ protected:
+  void InitSizeLists() override {
+    input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
+    output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
+  }
+
+ private:
+  void FillEmptyDims(const CNodePtr &kernel_node) {
+    begin_ = GetAttr<std::vector<int>>(kernel_node, "begin");
+    end_ = GetAttr<std::vector<int>>(kernel_node, "end");
+    strides_ = GetAttr<std::vector<int>>(kernel_node, "strides");
+
+    for (size_t i = 0; i < MAX_DIMS; i++) {
+      if (i < begin_.size()) {
+        begin_[i] =
+          std::min(begin_[i] < 0 ? SizeToInt(begin_[i] + input_shape_[i]) : begin_[i], SizeToInt(input_shape_[i] - 1));
+      } else {
+        begin_.push_back(0);
+      }
+
+      if (i < end_.size()) {
+        end_[i] = std::max(end_[i] < 0 ? end_[i] + SizeToInt(input_shape_[i]) : end_[i], -1);
+      } else {
+        end_.push_back(i < input_shape_.size() ? input_shape_[i] : 1);
+      }
+
+      if (i >= strides_.size()) {
+        strides_.push_back(1);
+      }
+
+      if (i >= input_shape_.size()) {
+        input_shape_.push_back(1);
+      }
+    }
+  }
+
+  void ParseMasks(const CNodePtr &kernel_node) {
+    auto begin_mask_int = GetAttr<int>(kernel_node, "begin_mask");
+    auto begin_mask = Dec2Bin(begin_mask_int);
+    for (size_t i = 0; i < begin_mask.size(); i++) {
+      if (begin_mask[i]) {
+        begin_[i] = 0;
+      }
+    }
+
+    auto end_mask_int = GetAttr<int>(kernel_node, "end_mask");
+    auto end_mask = Dec2Bin(end_mask_int);
+    for (size_t j = 0; j < end_mask.size(); j++) {
+      if (end_mask[j]) {
+        end_[j] = input_shape_[j];
+      }
+    }
+
+    auto ellipsis_mask_int = GetAttr<int>(kernel_node, "ellipsis_mask");
+    auto ellipsis_mask = Dec2Bin(ellipsis_mask_int);
+    for (size_t k = 0; k < ellipsis_mask.size(); k++) {
+      if (ellipsis_mask[k]) {
+        begin_[k] = 0;
+        end_[k] = input_shape_[k];
+        strides_[k] = 1;
+      }
+    }
+
+    auto shrink_axis_mask_str = GetAttr<int>(kernel_node, "shrink_axis_mask");
+    auto shrink_axis_mask = Dec2Bin(shrink_axis_mask_str);
+    for (size_t l = 0; l < shrink_axis_mask.size(); l++) {
+      if (shrink_axis_mask[l]) {
+        end_[l] = end_[l] > begin_[l] ? begin_[l] + 1 : begin_[l] - 1;
+        strides_[l] = end_[l] > begin_[l] ? 1 : -1;
+      }
+    }
+  }
+
+  std::vector<bool> Dec2Bin(const int &mask) {
+    auto mask_str = std::bitset<MAX_DIMS>(mask).to_string();
+    int dim_idx = 0;
+    std::vector<bool> result = {false, false, false, false};
+    for (int i = mask_str.size() - 1; i >= 0; i--) {
+      if (mask_str[i] == '1') {
+        result[dim_idx] = true;
+      }
+      dim_idx++;
+    }
+    return result;
+  }
+
+  void FillOutputDim() {
+    for (int i = 0; i < MAX_DIMS; i++) {
+      if (begin_[i] <= end_[i] && strides_[i] > 0) {
+        output_shape_.push_back((end_[i] - 1 - begin_[i]) / strides_[i] + 1);
+      } else if (begin_[i] > end_[i] && strides_[i] < 0) {
+        output_shape_.push_back((end_[i] - begin_[i] + 1) / strides_[i] + 1);
+      } else {
+        output_shape_.push_back(0);
+      }
+    }
+  }
+
+  bool IsNullOutput() {
+    for (int i = 0; i < MAX_DIMS; i++) {
+      if (begin_[i] >= end_[i] && strides_[i] > 0) {
+        return true;
+      }
+      if (begin_[i] < end_[i] && strides_[i] < 0) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  std::vector<int> begin_;
+  std::vector<int> end_;
+  std::vector<int> strides_;
+  std::vector<size_t> input_shape_;
+  std::vector<int> output_shape_;
+  bool null_output_;
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_STRIDED_SLICE_GPU_KERNEL_H
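Init above normalizes begin/end/strides to exactly MAX_DIMS entries, applies the begin/end/ellipsis/shrink-axis masks, derives the output shape, and short-circuits empty results. The per-axis arithmetic is compact enough to check in isolation; a condensed host-side sketch mirroring FillEmptyDims/FillOutputDim (hypothetical helper, not the class itself):

#include <algorithm>
#include <bitset>
#include <cstdio>

// Wrap negative indices, clamp, then count the selected elements on one axis.
int SliceAxisLength(int begin, int end, int stride, int dim) {
  begin = std::min(begin < 0 ? begin + dim : begin, dim - 1);  // wrap, clamp to last index
  end = std::max(end < 0 ? end + dim : end, -1);               // wrap, allow -1 as "before first"
  if (begin <= end && stride > 0) return (end - 1 - begin) / stride + 1;  // forward walk
  if (begin > end && stride < 0) return (end - begin + 1) / stride + 1;   // backward walk
  return 0;  // empty selection -> null output
}

int main() {
  // dim = 5: x[0:5:2] -> 3 elements; x[-1:-6:-1] walks the whole axis backward -> 5; x[3:3] -> 0.
  std::printf("%d\n", SliceAxisLength(0, 5, 2, 5));     // 3
  std::printf("%d\n", SliceAxisLength(-1, -6, -1, 5));  // 5: begin wraps to 4, end to -1
  std::printf("%d\n", SliceAxisLength(3, 3, 1, 5));     // 0
  // Mask decoding (cf. Dec2Bin): bit i of a mask selects axis i, LSB first,
  // so begin_mask = 0b1000 resets begin on axis 3 - as the tests below exercise.
  std::bitset<4> begin_mask(0b1000);
  std::printf("%d\n", static_cast<int>(begin_mask[3]));  // 1
  return 0;
}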
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_grad_gpu_kernel.cc
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/gpu/arrays/strided_slice_grad_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+                      StridedSliceGradGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+                      StridedSliceGradGpuKernel, half)
+MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+                      StridedSliceGradGpuKernel, int)
+}  // namespace kernel
+}  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_grad_gpu_kernel.h
@@ -0,0 +1,191 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_STRIDED_SLICE_GRAD_GPU_KERNEL_H
+#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_STRIDED_SLICE_GRAD_GPU_KERNEL_H
+
+#include <vector>
+#include <bitset>
+#include <algorithm>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/slice_impl.cuh"
+
+namespace mindspore {
+namespace kernel {
+constexpr int MAX_DIMS = 4;
+template <typename T>
+class StridedSliceGradGpuKernel : public GpuKernel {
+ public:
+  StridedSliceGradGpuKernel() : null_output_(false) {}
+  ~StridedSliceGradGpuKernel() override = default;
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    T *dy = GetDeviceAddress<T>(inputs, 0);
+    T *dx = GetDeviceAddress<T>(outputs, 0);
+
+    FillDeviceArray(outputs[0]->size / sizeof(T), dx, 0.f, reinterpret_cast<cudaStream_t>(stream_ptr));
+    if (null_output_) {
+      return true;
+    }
+
+    StridedSliceGrad(output_shape_, begin_, strides_, input_shape_, dy, dx, reinterpret_cast<cudaStream_t>(stream_ptr));
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    input_shape_ = GetAttr<std::vector<int>>(kernel_node, "shapex");
+    if (input_shape_.size() > MAX_DIMS) {
+      MS_LOG(ERROR) << "StridedSliceGrad supports at most " << MAX_DIMS << " dimensions, but got " << input_shape_.size();
+      return false;
+    }
+
+    FillEmptyDims(kernel_node);
+    ParseMasks(kernel_node);
+    FillOutputDim();
+    null_output_ = IsNullOutput();
+    InitSizeLists();
+    return true;
+  }
+
+ protected:
+  void InitSizeLists() override {
+    input_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
+    output_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
+  }
+
+ private:
+  void FillEmptyDims(const CNodePtr &kernel_node) {
+    begin_ = GetAttr<std::vector<int>>(kernel_node, "begin");
+    end_ = GetAttr<std::vector<int>>(kernel_node, "end");
+    strides_ = GetAttr<std::vector<int>>(kernel_node, "strides");
+
+    for (size_t i = 0; i < MAX_DIMS; i++) {
+      if (i < begin_.size()) {
+        begin_[i] =
+          std::min(begin_[i] < 0 ? SizeToInt(begin_[i] + input_shape_[i]) : begin_[i], SizeToInt(input_shape_[i] - 1));
+      } else {
+        begin_.push_back(0);
+      }
+
+      if (i < end_.size()) {
+        end_[i] = std::max(end_[i] < 0 ? end_[i] + SizeToInt(input_shape_[i]) : end_[i], -1);
+      } else {
+        end_.push_back(i < input_shape_.size() ? input_shape_[i] : 1);
+      }
+
+      if (i >= strides_.size()) {
+        strides_.push_back(1);
+      }
+
+      if (i >= input_shape_.size()) {
+        input_shape_.push_back(1);
+      }
+    }
+  }
+
+  void ParseMasks(const CNodePtr &kernel_node) {
+    auto begin_mask_int = GetAttr<int>(kernel_node, "begin_mask");
+    auto begin_mask = Dec2Bin(begin_mask_int);
+    for (size_t i = 0; i < begin_mask.size(); i++) {
+      if (begin_mask[i]) {
+        begin_[i] = 0;
+      }
+    }
+
+    auto end_mask_int = GetAttr<int>(kernel_node, "end_mask");
+    auto end_mask = Dec2Bin(end_mask_int);
+    for (size_t j = 0; j < end_mask.size(); j++) {
+      if (end_mask[j]) {
+        end_[j] = input_shape_[j];
+      }
+    }
+
+    auto ellipsis_mask_int = GetAttr<int>(kernel_node, "ellipsis_mask");
+    auto ellipsis_mask = Dec2Bin(ellipsis_mask_int);
+    for (size_t k = 0; k < ellipsis_mask.size(); k++) {
+      if (ellipsis_mask[k]) {
+        begin_[k] = 0;
+        end_[k] = input_shape_[k];
+        strides_[k] = 1;
+      }
+    }
+
+    auto shrink_axis_mask_str = GetAttr<int>(kernel_node, "shrink_axis_mask");
+    auto shrink_axis_mask = Dec2Bin(shrink_axis_mask_str);
+    for (size_t l = 0; l < shrink_axis_mask.size(); l++) {
+      if (shrink_axis_mask[l]) {
+        end_[l] = end_[l] > begin_[l] ? begin_[l] + 1 : begin_[l] - 1;
+        strides_[l] = end_[l] > begin_[l] ? 1 : -1;
+      }
+    }
+  }
+
+  std::vector<bool> Dec2Bin(const int &mask) {
+    auto mask_str = std::bitset<MAX_DIMS>(mask).to_string();
+    int dim_idx = 0;
+    std::vector<bool> result = {false, false, false, false};
+    for (int i = mask_str.size() - 1; i >= 0; i--) {
+      if (mask_str[i] == '1') {
+        result[dim_idx] = true;
+      }
+      dim_idx++;
+    }
+    return result;
+  }
+
+  void FillOutputDim() {
+    for (int i = 0; i < MAX_DIMS; i++) {
+      if (begin_[i] <= end_[i] && strides_[i] > 0) {
+        output_shape_.push_back((end_[i] - 1 - begin_[i]) / strides_[i] + 1);
+      } else if (begin_[i] > end_[i] && strides_[i] < 0) {
+        output_shape_.push_back((end_[i] - begin_[i] + 1) / strides_[i] + 1);
+      } else {
+        output_shape_.push_back(0);
+      }
+    }
+  }
+
+  bool IsNullOutput() {
+    for (int i = 0; i < MAX_DIMS; i++) {
+      if (begin_[i] >= end_[i] && strides_[i] > 0) {
+        return true;
+      }
+      if (begin_[i] < end_[i] && strides_[i] < 0) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  std::vector<int> begin_;
+  std::vector<int> end_;
+  std::vector<int> strides_;
+  std::vector<int> input_shape_;
+  std::vector<int> output_shape_;
+  bool null_output_;
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_STRIDED_SLICE_GRAD_GPU_KERNEL_H
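StridedSliceGrad is the mirror image of the forward op: dx is zero-filled, then every element of dy is scattered to the strided position it was gathered from. A hedged host-side reference of the scatter (hypothetical helper, using the same index map as the new StridedSliceGradKernel below):

#include <vector>

// dy has the forward output's shape; dx has the input's shape.
template <typename T>
void StridedSliceGradReference(const std::vector<T> &dy, const std::vector<int> &dy_shape,
                               const std::vector<int> &begin, const std::vector<int> &strides,
                               const std::vector<int> &dx_shape, std::vector<T> *dx) {
  dx->assign(static_cast<size_t>(dx_shape[0]) * dx_shape[1] * dx_shape[2] * dx_shape[3], T(0));
  int pos = 0;
  for (int i = 0; i < dy_shape[0]; ++i)
    for (int j = 0; j < dy_shape[1]; ++j)
      for (int k = 0; k < dy_shape[2]; ++k)
        for (int l = 0; l < dy_shape[3]; ++l) {
          // Output coordinate * stride + begin, re-linearized against dx's dims.
          int input_idx = (i * strides[0] + begin[0]) * dx_shape[1] * dx_shape[2] * dx_shape[3] +
                          (j * strides[1] + begin[1]) * dx_shape[2] * dx_shape[3] +
                          (k * strides[2] + begin[2]) * dx_shape[3] + (l * strides[3] + begin[3]);
          (*dx)[input_idx] = dy[pos++];
        }
}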
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/slice_impl.cu
Executable file → Normal file
@@ -21,48 +21,29 @@
 #include "backend/kernel_compiler/gpu/cuda_impl/slice_impl.cuh"
 
 template <typename T>
-__global__ void Slice4D(const int s1, const int s2, const int s3, const int s4,
-                        const int l1, const int l2, const int l3, const int l4,
-                        const int d1, const int d2, const int d3, const int d4,
-                        const T *input, T *output) {
+__global__ void Slice4D(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2,
+                        const int l3, const int l4, const int d1, const int d2, const int d3, const int d4,
+                        const T *input, T *output) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (l1 * l2 * l3 * l4); pos += blockDim.x * gridDim.x) {
     int i = pos / (l2 * l3 * l4) % l1;
     int j = pos / (l3 * l4) % l2;
     int k = pos / l4 % l3;
     int o = pos % l4;
-    int offset = (i + s1) * (d2 * d3 * d4) +
-                 (j + s2) * (d3 * d4) +
-                 (k + s3) * d4 +
-                 (o + s4);
+
+    int offset = (i + s1) * (d2 * d3 * d4) + (j + s2) * (d3 * d4) + (k + s3) * d4 + (o + s4);
     output[pos] = input[offset];
   }
 }
 template <typename T>
-__global__ void SliceGrad(const T* dy, int p, int start, int length, T* output) {
+__global__ void SliceGrad(const T *dy, int p, int start, int length, T *output) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (length); pos += blockDim.x * gridDim.x) {
     output[start + pos] = dy[p + pos];
   }
   return;
 }
 
-template <typename T>
-__global__ void StridedSlice(const T* input, int p, int start, int begin, int stride, int ended, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < std::ceil(static_cast<float>(ended - begin) / stride);
-       pos += blockDim.x * gridDim.x) {
-    output[p + pos] = input[start + pos * stride];
-  }
-  return;
-}
-template <typename T>
-__global__ void StridedSliceGrad(const T* dy, int p, int start, int begin, int stride, int ended, T* dx) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < std::ceil(static_cast<float>(ended - begin) / stride);
-       pos += blockDim.x * gridDim.x) {
-    dx[start + pos * stride] = dy[p + pos];
-  }
-  return;
-}
 template <typename T>
-__global__ void FillArray(T* addr, const size_t len, const float value) {
+__global__ void FillArray(T *addr, const size_t len, const float value) {
   T value_ = static_cast<T>(value);
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < len; pos += blockDim.x * gridDim.x) {
     addr[pos] = value_;
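Every kernel in this file uses the grid-stride loop idiom: each thread starts at its global index and hops by the total thread count, so any grid size covers any problem size. This is also the core of the performance fix — the removed per-row StridedSlice kernels above were launched once per innermost row from a triple host loop, while the new single-launch kernels below cover the whole output in one go. A minimal standalone sketch of the idiom (hypothetical example kernel, not from this PR):

#include <cuda_runtime.h>

// Grid-stride loop: correctness never depends on the launch configuration.
__global__ void ScaleKernel(const float *in, float *out, int n, float alpha) {
  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < n; pos += blockDim.x * gridDim.x) {
    out[pos] = alpha * in[pos];
  }
}

// Any reasonable configuration works, e.g.:
// ScaleKernel<<<(n + 255) / 256, 256, 0, stream>>>(in, out, n, 2.0f);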
@@ -70,21 +51,20 @@ __global__ void FillArray(T* addr, const size_t len, const float value) {
   return;
 }
 template <typename T>
-void FillDeviceArray(const size_t input_size, T* addr, const float value, cudaStream_t cuda_stream) {
+void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream) {
   FillArray<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(addr, input_size, value);
   return;
 }
 template <typename T>
-void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
-                   const int l1, const int l2, const int l3, const int l4,
-                   const int d1, const int d2, const int d3, const int d4,
-                   const T *input, T *output, cudaStream_t stream) {
-  Slice4D<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4,
-                                                                     d1, d2, d3, d4, input, output);
+void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, const int l3,
+                   const int l4, const int d1, const int d2, const int d3, const int d4, const T *input, T *output,
+                   cudaStream_t stream) {
+  Slice4D<<<GET_BLOCKS(l1 * l2 * l3 * l4), GET_THREADS, 0, stream>>>(s1, s2, s3, s4, l1, l2, l3, l4, d1, d2, d3, d4,
+                                                                     input, output);
 }
 template <typename T>
-void CalSliceGrad(const size_t input_size, const T* dy, const std::vector<int> in_shape, const std::vector<int> begin,
-                  const std::vector<int> size, T* output, cudaStream_t cuda_stream) {
+void CalSliceGrad(const size_t input_size, const T *dy, const std::vector<int> in_shape, const std::vector<int> begin,
+                  const std::vector<int> size, T *output, cudaStream_t cuda_stream) {
   int block = in_shape[1] * in_shape[2] * in_shape[3];
   int map = in_shape[2] * in_shape[3];
   int w = in_shape[3];
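CalSliceGrad flattens the 4-D input shape into three row strides before looping. A short sketch of what those locals encode (illustrative helper, not part of the diff):

#include <vector>

// For a shape {N, C, H, W}: the linear distance between neighbors along each axis.
struct RowStrides {
  int block;  // step for axis 0 = C * H * W
  int map;    // step for axis 1 = H * W
  int w;      // step for axis 2 = W
};

RowStrides MakeRowStrides(const std::vector<int> &in_shape) {
  return {in_shape[1] * in_shape[2] * in_shape[3], in_shape[2] * in_shape[3], in_shape[3]};
}
// With these, element (i, j, k, l) lives at i * block + j * map + k * w + l.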
@@ -100,92 +80,100 @@ void CalSliceGrad(const size_t input_size, const T* dy, const std::vector<int> i
       }
     }
   }
 }
 
 template <typename T>
-void CalStridedSlice(const size_t input_size, const T* input, const std::vector<int> in_shape,
-                     const std::vector<int> begin, const std::vector<int> end, const std::vector<int> strides,
-                     T* output, cudaStream_t cuda_stream) {
-  int block = in_shape[1] * in_shape[2] * in_shape[3];
-  int map = in_shape[2] * in_shape[3];
-  int w = in_shape[3];
-  int ended = end[3];
-  int p = 0;
-  int start = 0;
-  for (int i = begin[0]; i < ((end[0] > begin[0]) ? end[0] : (2 * begin[0] - end[0])); i += std::abs(strides[0])) {
-    for (int j = begin[1]; j < ((end[1] > begin[1]) ? end[1] : (2 * begin[1] - end[1])); j += std::abs(strides[1])) {
-      for (int k = begin[2]; k < ((end[2] > begin[2]) ? end[2] : (2 * begin[2] - end[2])); k += std::abs(strides[2])) {
-        start = (strides[0] > 0 ? i : 2 * begin[0] - i) * block + (strides[1] > 0 ? j : 2 * begin[1] - j) * map +
-                (strides[2] > 0 ? k : 2 * begin[2] - k) * w + begin[3];
-        StridedSlice<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input, p, start, begin[3], strides[3],
-                                                                              ended, output);
-        p = p + std::ceil(static_cast<float>(end[3] - begin[3]) / strides[3]);
-      }
-    }
-  }
-}
-template <typename T>
-void CalStridedSliceGrad(const size_t input_size, const T* dy, const std::vector<int> in_shape,
-                         const std::vector<int> begin, const std::vector<int> end, const std::vector<int> strides,
-                         T* dx, cudaStream_t cuda_stream) {
-  int block = in_shape[1] * in_shape[2] * in_shape[3];
-  int map = in_shape[2] * in_shape[3];
-  int w = in_shape[3];
-  int ended = end[3];
-  int p = 0;
-  int start = 0;
-  for (int i = begin[0]; i < ((end[0] > begin[0]) ? end[0] : (2 * begin[0] - end[0] + 1)); i += std::abs(strides[0])) {
-    for (int j = begin[1]; j < ((end[1] > begin[1]) ? end[1] : (2 * begin[1] - end[1] + 1));
-         j += std::abs(strides[1])) {
-      for (int k = begin[2]; k < ((end[2] > begin[2]) ? end[2] : (2 * begin[2] - end[2] + 1));
-           k += std::abs(strides[2])) {
-        start = (strides[0] > 0 ? i : 2 * begin[0] - i) * block + (strides[1] > 0 ? j : 2 * begin[1] - j) * map +
-                (strides[2] > 0 ? k : 2 * begin[2] - k) * w + begin[3];
-        StridedSliceGrad<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(dy, p, start, begin[3], strides[3],
-                                                                                  ended, dx);
-        p = p + std::ceil(static_cast<float>(end[3] - begin[3]) / strides[3]);
-      }
-    }
-  }
-}
+__global__ void StridedSliceKernel(const int b0, const int b1, const int b2, const int b3, const int s0, const int s1,
+                                   const int s2, const int s3, const int i0, const int i1, const int i2, const int i3,
+                                   const int o0, const int o1, const int o2, const int o3, const T *input_addr,
+                                   T *output_addr) {
+  int output_num = o0 * o1 * o2 * o3;
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
+    int i = pos / (o1 * o2 * o3) % o0;
+    int j = pos / (o2 * o3) % o1;
+    int k = pos / o3 % o2;
+    int l = pos % o3;
+
+    int input_idx = (i * s0 + b0) * i1 * i2 * i3 + (j * s1 + b1) * i2 * i3 + (k * s2 + b2) * i3 + (l * s3 + b3);
+    output_addr[pos] = input_addr[input_idx];
+  }
+}
+
+template <typename T>
+void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
+                  const std::vector<int> &strides, const std::vector<int> &output_shape, const T *input, T *output,
+                  cudaStream_t cuda_stream) {
+  int size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3];
+  StridedSliceKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    begin[0], begin[1], begin[2], begin[3], strides[0], strides[1], strides[2], strides[3], input_shape[0],
+    input_shape[1], input_shape[2], input_shape[3], output_shape[0], output_shape[1], output_shape[2], output_shape[3],
+    input, output);
+}
+
+template <typename T>
+__global__ void StridedSliceGradKernel(const int b0, const int b1, const int b2, const int b3, const int s0,
+                                       const int s1, const int s2, const int s3, const int i0, const int i1,
+                                       const int i2, const int i3, const int o0, const int o1, const int o2,
+                                       const int o3, const T *dy, T *dx) {
+  int output_num = o0 * o1 * o2 * o3;
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
+    int i = pos / (o1 * o2 * o3) % o0;
+    int j = pos / (o2 * o3) % o1;
+    int k = pos / o3 % o2;
+    int l = pos % o3;
+
+    int input_idx = (i * s0 + b0) * i1 * i2 * i3 + (j * s1 + b1) * i2 * i3 + (k * s2 + b2) * i3 + (l * s3 + b3);
+    dx[input_idx] = dy[pos];
+  }
+  return;
+}
+
+template <typename T>
+void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, const std::vector<int> &strides,
+                      const std::vector<int> &dx_shape, const T *dy, T *dx, cudaStream_t cuda_stream) {
+  int size = dy_shape[0] * dy_shape[1] * dy_shape[2] * dy_shape[3];
+  StridedSliceGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    begin[0], begin[1], begin[2], begin[3], strides[0], strides[1], strides[2], strides[3], dx_shape[0], dx_shape[1],
+    dx_shape[2], dx_shape[3], dy_shape[0], dy_shape[1], dy_shape[2], dy_shape[3], dy, dx);
+}
 
-template void FillDeviceArray<float>(const size_t input_size, float* addr, const float value, cudaStream_t cuda_stream);
-template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
-                            const int l1, const int l2, const int l3, const int l4,
-                            const int d1, const int d2, const int d3, const int d4,
+template void FillDeviceArray<float>(const size_t input_size, float *addr, const float value, cudaStream_t cuda_stream);
+template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2,
+                            const int l3, const int l4, const int d1, const int d2, const int d3, const int d4,
                             const float *input, float *output, cudaStream_t stream);
-template void CalSliceGrad<float>(const size_t input_size, const float* dy, const std::vector<int> in_shape,
-                                  const std::vector<int> begin, const std::vector<int> size, float* output,
+template void CalSliceGrad<float>(const size_t input_size, const float *dy, const std::vector<int> in_shape,
+                                  const std::vector<int> begin, const std::vector<int> size, float *output,
                                   cudaStream_t cuda_stream);
-template void CalStridedSlice<float>(const size_t input_size, const float* input, const std::vector<int> in_shape,
-                                     const std::vector<int> begin, const std::vector<int> end,
-                                     const std::vector<int> strides, float* output, cudaStream_t cuda_stream);
-template void CalStridedSliceGrad<float>(const size_t input_size, const float* dy, const std::vector<int> in_shape,
-                                         const std::vector<int> begin, const std::vector<int> end,
-                                         const std::vector<int> strides, float* dx, cudaStream_t cuda_stream);
-template void FillDeviceArray<half>(const size_t input_size, half* addr, const float value, cudaStream_t cuda_stream);
-template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
-                            const int l1, const int l2, const int l3, const int l4,
-                            const int d1, const int d2, const int d3, const int d4,
+template void FillDeviceArray<half>(const size_t input_size, half *addr, const float value, cudaStream_t cuda_stream);
+template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2,
+                            const int l3, const int l4, const int d1, const int d2, const int d3, const int d4,
                             const half *input, half *output, cudaStream_t stream);
-template void CalSliceGrad<half>(const size_t input_size, const half* dy, const std::vector<int> in_shape,
-                                 const std::vector<int> begin, const std::vector<int> size, half* output,
+template void CalSliceGrad<half>(const size_t input_size, const half *dy, const std::vector<int> in_shape,
+                                 const std::vector<int> begin, const std::vector<int> size, half *output,
                                  cudaStream_t cuda_stream);
-template void CalStridedSlice<half>(const size_t input_size, const half* input, const std::vector<int> in_shape,
-                                    const std::vector<int> begin, const std::vector<int> end,
-                                    const std::vector<int> strides, half* output, cudaStream_t cuda_stream);
-template void CalStridedSliceGrad<half>(const size_t input_size, const half* dy, const std::vector<int> in_shape,
-                                        const std::vector<int> begin, const std::vector<int> end,
-                                        const std::vector<int> strides, half* dx, cudaStream_t cuda_stream);
-template void FillDeviceArray<int>(const size_t input_size, int* addr, const float value, cudaStream_t cuda_stream);
-template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
-                            const int l1, const int l2, const int l3, const int l4,
-                            const int d1, const int d2, const int d3, const int d4,
+template void FillDeviceArray<int>(const size_t input_size, int *addr, const float value, cudaStream_t cuda_stream);
+template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2,
+                            const int l3, const int l4, const int d1, const int d2, const int d3, const int d4,
                             const int *input, int *output, cudaStream_t stream);
-template void CalSliceGrad<int>(const size_t input_size, const int* dy, const std::vector<int> in_shape,
-                                const std::vector<int> begin, const std::vector<int> size, int* output,
+template void CalSliceGrad<int>(const size_t input_size, const int *dy, const std::vector<int> in_shape,
+                                const std::vector<int> begin, const std::vector<int> size, int *output,
                                 cudaStream_t cuda_stream);
-template void CalStridedSlice<int>(const size_t input_size, const int* input, const std::vector<int> in_shape,
-                                   const std::vector<int> begin, const std::vector<int> end,
-                                   const std::vector<int> strides, int* output, cudaStream_t cuda_stream);
-template void CalStridedSliceGrad<int>(const size_t input_size, const int* dy, const std::vector<int> in_shape,
-                                       const std::vector<int> begin, const std::vector<int> end,
-                                       const std::vector<int> strides, int* dx, cudaStream_t cuda_stream);
+
+template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
+                           const std::vector<int> &strides, const std::vector<int> &output_shape, const float *input,
+                           float *output, cudaStream_t cuda_stream);
+template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
+                           const std::vector<int> &strides, const std::vector<int> &output_shape, const half *input,
+                           half *output, cudaStream_t cuda_stream);
+template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
+                           const std::vector<int> &strides, const std::vector<int> &output_shape, const int *input,
+                           int *output, cudaStream_t cuda_stream);
+
+template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin,
+                               const std::vector<int> &strides, const std::vector<int> &dx_shape, const float *dy,
+                               float *dx, cudaStream_t cuda_stream);
+template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin,
+                               const std::vector<int> &strides, const std::vector<int> &dx_shape, const half *dy,
+                               half *dx, cudaStream_t cuda_stream);
+template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin,
+                               const std::vector<int> &strides, const std::vector<int> &dx_shape, const int *dy,
+                               int *dx, cudaStream_t cuda_stream);
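The <<<GET_BLOCKS(size), GET_THREADS, 0, stream>>> launch configuration comes from the included runtime/device/gpu/cuda_common.h, which this diff does not show. Helpers of this kind conventionally look like the following sketch — an assumption about their shape, not the actual MindSpore definitions:

// Assumed shape of GET_BLOCKS/GET_THREADS-style helpers (not quoted from
// cuda_common.h): a fixed block size and a grid rounded up, then capped.
inline int GetThreads() { return 256; }
inline int GetBlocks(int total) {
  const int threads = GetThreads();
  const int max_blocks = 65535;                  // conservative 1-D grid cap
  int blocks = (total + threads - 1) / threads;  // ceil division
  return blocks < max_blocks ? blocks : max_blocks;
}
// The grid-stride loops above make the cap safe: leftover elements are
// picked up on later iterations of each thread's loop.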
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/slice_impl.cuh
Executable file → Normal file
@@ -21,23 +21,20 @@
 #include <vector>
 #include "runtime/device/gpu/cuda_common.h"
 
-
 template <typename T>
-void Slice4DKernel(const int s1, const int s2, const int s3, const int s4,
-                   const int l1, const int l2, const int l3, const int l4,
-                   const int d1, const int d2, const int d3, const int d4,
-                   const T *input, T *output, cudaStream_t stream);
+void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, const int l3,
+                   const int l4, const int d1, const int d2, const int d3, const int d4, const T *input, T *output,
+                   cudaStream_t stream);
 template <typename T>
-void CalSliceGrad(const size_t input_size, const T* input, const std::vector<int> in_shape,
-                  const std::vector<int> begin, const std::vector<int> size, T* output, cudaStream_t cuda_stream);
+void CalSliceGrad(const size_t input_size, const T *input, const std::vector<int> in_shape,
+                  const std::vector<int> begin, const std::vector<int> size, T *output, cudaStream_t cuda_stream);
 template <typename T>
-void CalStridedSlice(const size_t input_size, const T* input, const std::vector<int> in_shape,
-                     const std::vector<int> begin, const std::vector<int> end, const std::vector<int> strides,
-                     T* output, cudaStream_t cuda_stream);
+void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
+                  const std::vector<int> &strides, const std::vector<int> &output_shape, const T *input, T *output,
+                  cudaStream_t cuda_stream);
 template <typename T>
-void CalStridedSliceGrad(const size_t input_size, const T* dy, const std::vector<int> in_shape,
-                         const std::vector<int> begin, const std::vector<int> end, const std::vector<int> strides,
-                         T* dx, cudaStream_t cuda_stream);
+void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, const std::vector<int> &strides,
+                      const std::vector<int> &dx_shape, const T *dy, T *dx, cudaStream_t cuda_stream);
 template <typename T>
-void FillDeviceArray(const size_t input_size, T* addr, const float value, cudaStream_t cuda_stream);
+void FillDeviceArray(const size_t input_size, T *addr, const float value, cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SLICEIMPL_H_
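These declarations are templates whose definitions live in the .cu file, so callers that include only this header would get link errors without the explicit instantiations at the bottom of slice_impl.cu. A minimal reproduction of the pattern, with illustrative names:

// lib.h - declaration only; the definition lives in a .cu/.cc file.
template <typename T>
void Scale(const T *in, T *out, int n, T alpha);

// lib.cu - definition plus one explicit instantiation per supported type,
// exactly the role of the "template void StridedSlice(...)" lines above.
template <typename T>
void Scale(const T *in, T *out, int n, T alpha) {
  for (int i = 0; i < n; ++i) out[i] = alpha * in[i];
}
template void Scale<float>(const float *in, float *out, int n, float alpha);
template void Scale<int>(const int *in, int *out, int n, int alpha);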
@@ -19,31 +19,258 @@ import pytest
 import mindspore.context as context
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.common.api import ms_function
 from mindspore.ops import operations as P
 from mindspore.ops.operations import _grad_ops as G
+from mindspore.ops import composite as C
 
 context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
 
 
-class StridedSliceGrad(nn.Cell):
-    def __init__(self):
-        super(StridedSliceGrad, self).__init__()
-        self.ssg = G.StridedSliceGrad()
-        self.shape = P.Shape()
+class StridedSliceNet(nn.Cell):
+    def __init__(self, begin, end, stride, begin_mask=0, end_mask=0, ellipsis_mask=0):
+        super(StridedSliceNet, self).__init__()
+        self.begin = begin
+        self.end = end
+        self.strides = stride
+        self.slice = P.StridedSlice(begin_mask, end_mask, ellipsis_mask)
 
     @ms_function
-    def construct(self, dy, x):
-        return self.ssg(dy, self.shape(x), (2, 0, 0), (3, 2, 3), (1, 1, 1))
+    def construct(self, x):
+        return self.slice(x, self.begin, self.end, self.strides)
+
+
+class GradData(nn.Cell):
+    def __init__(self, network):
+        super(GradData, self).__init__()
+        self.grad = C.GradOperation(name="get_all", get_all=True, sens_param=False)
+        self.network = network
+
+    def construct(self, x):
+        return self.grad(self.network)(x)
 
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
-def test_slice():
-    x = Tensor(np.array([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]], [[5, 5, 5], [6, 7, 8]]]).astype(np.float32))
-    dy = Tensor(np.array([[[5., 1., 5.], [6., 1., 8.]]]).astype(np.float32))
-    ssg = StridedSliceGrad()
-    output = ssg(dy, x)
-    expect = [[[0, 0, 0], [0, 0, 0]], [[0, 0, 0], [0, 0, 0]], [[5, 1, 5], [6, 1, 8]]]
-    assert (output.asnumpy() == expect).all()
+def test_strided_slice_grad():
+    x = Tensor(np.arange(0, 2*3*4*5).reshape(2, 3, 4, 5).astype(np.float32))
+    net = StridedSliceNet((1, 0, 0, 2), (2, 2, 2, 4), (1, 1, 1, 1))
+    dx = GradData(net)(x)
+    expect = np.array([[[[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]]],
+
+
+                       [[[0., 0., 1., 1., 0.],
+                         [0., 0., 1., 1., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 1., 1., 0.],
+                         [0., 0., 1., 1., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]]]])
+    assert np.allclose(dx[0].asnumpy(), expect)
+
+    net = StridedSliceNet((1, 0, 0, 5), (2, 2, 2, 1), (1, 1, 1, -2))
+    dx = GradData(net)(x)
+    expect = np.array([[[[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]]],
+
+
+                       [[[0., 0., 1., 0., 1.],
+                         [0., 0., 1., 0., 1.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 1., 0., 1.],
+                         [0., 0., 1., 0., 1.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]]]])
+    assert np.allclose(dx[0].asnumpy(), expect)
+
+
+    net = StridedSliceNet((1, 0, 0, -1), (2, 2, 2, 1), (1, 1, 1, -1))
+    dx = GradData(net)(x)
+    expect = np.array([[[[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]]],
+
+
+                       [[[0., 0., 1., 1., 1.],
+                         [0., 0., 1., 1., 1.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 1., 1., 1.],
+                         [0., 0., 1., 1., 1.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]]]])
+    assert np.allclose(dx[0].asnumpy(), expect)
+
+    # ME infer fault
+    # y = GradData()(x, (1, 0, -1, -2), (2, 2, 0, -5), (1, 1, -1, -2))
+    # expect = np.array([[[[0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]],
+
+    #                     [[0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]],
+
+    #                     [[0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]]],
+
+
+    #                    [[[0., 0., 0., 0., 0.],
+    #                      [0., 1., 0., 1., 0.],
+    #                      [0., 1., 0., 1., 0.],
+    #                      [0., 1., 0., 1., 0.]],
+
+    #                     [[0., 0., 0., 0., 0.],
+    #                      [0., 1., 0., 1., 0.],
+    #                      [0., 1., 0., 1., 0.],
+    #                      [0., 1., 0., 1., 0.]],
+
+    #                     [[0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]]]])
+    # assert np.allclose(y.asnumpy(), expect)
+
+    # y = Grad(begin_mask=0b1000, end_mask=0b0010)(x, (1, 0, 0, 2), (2, 2, 2, 4), (1, 1, 1, 1))
+    # expect = np.array([[[[0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]],
+
+    #                     [[0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]],
+
+    #                     [[0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]]],
+
+
+    #                    [[[0., 0., 1., 1., 0.],
+    #                      [0., 0., 1., 1., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]],
+
+    #                     [[0., 0., 1., 1., 0.],
+    #                      [0., 0., 1., 1., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]],
+
+    #                     [[0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.],
+    #                      [0., 0., 0., 0., 0.]]]])
+    # assert np.allclose(y.asnumpy(), expect)
+
+
+    net = StridedSliceNet((1, 0, 0, 2), (2, 2, 2, 4), (1, 1, 1, 1),
+                          begin_mask=0b1000, end_mask=0b0010, ellipsis_mask=0b0100)
+    dx = GradData(net)(x)
+    expect = np.array([[[[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]],
+
+                        [[0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.],
+                         [0., 0., 0., 0., 0.]]],
+
+
+                       [[[1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.]],
+
+                        [[1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.]],
+
+                        [[1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.],
+                         [1., 1., 1., 1., 0.]]]])
+    assert np.allclose(dx[0].asnumpy(), expect)
+
+    x = Tensor(np.arange(0, 3*4*5).reshape(3, 4, 5).astype(np.float32))
+    net = StridedSliceNet((1, 0, 0), (2, -3, 3), (1, 1, 3))
+    dx = GradData(net)(x)
+    expect = np.array([[[0., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.]],
+
+                       [[1., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.]],
+
+                       [[0., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.],
+                        [0., 0., 0., 0., 0.]]])
+    assert np.allclose(dx[0].asnumpy(), expect)
@@ -17,29 +17,79 @@ import numpy as np
 import pytest
 
 import mindspore.context as context
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.ops import operations as P
 
 context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
 
 
-class StridedSlice(nn.Cell):
-    def __init__(self):
-        super(StridedSlice, self).__init__()
-        self.stridedslice = P.StridedSlice()
-
-    def construct(self, x):
-        return self.stridedslice(x, (2, 0, 0), (3, 2, 3), (1, 1, 1))
-
-
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
-def test_slice():
-    x = Tensor(np.array([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]], [[5, 5, 5], [6, 7, 8]]]).astype(np.int32))
-    stridedslice = StridedSlice()
-    output = stridedslice(x)
-    expect = [[[5., 5., 5.],
-               [6., 7., 8.]]]
-    assert (output.asnumpy() == expect).all()
+def test_stridedslice():
+    x = Tensor(np.arange(0, 2*3*4*5).reshape(2, 3, 4, 5).astype(np.float32))
+    y = P.StridedSlice()(x, (1, 0, 0, 2), (2, 2, 2, 4), (1, 1, 1, 1))
+    expect = np.array([[[[62, 63],
+                         [67, 68]],
+                        [[82, 83],
+                         [87, 88]]]])
+    assert np.allclose(y.asnumpy(), expect)
+
+    y = P.StridedSlice()(x, (1, 0, 0, 5), (2, 2, 2, 1), (1, 1, 1, -2))
+    expect = np.array([[[[64, 62],
+                         [69, 67]],
+                        [[84, 82],
+                         [89, 87]]]])
+    assert np.allclose(y.asnumpy(), expect)
+
+    y = P.StridedSlice()(x, (1, 0, 0, -1), (2, 2, 2, 1), (1, 1, 1, -1))
+    expect = np.array([[[[64, 63, 62],
+                         [69, 68, 67]],
+                        [[84, 83, 82],
+                         [89, 88, 87]]]])
+    assert np.allclose(y.asnumpy(), expect)
+
+    # ME infer fault
+    # y = P.StridedSlice()(x, (1, 0, -1, -2), (2, 2, 0, -5), (1, 1, -1, -2))
+    # expect = np.array([[[[78, 76],
+    #                      [73, 71],
+    #                      [68, 66]],
+    #                     [[98, 96],
+    #                      [93, 91],
+    #                      [88, 86]]]])
+    # assert np.allclose(y.asnumpy(), expect)
+
+    # y = P.StridedSlice(begin_mask=0b1000, end_mask=0b0010)(x, (1, 0, 0, 2), (2, 2, 2, 4), (1, 1, 1, 1))
+    # expect = np.array([[[[ 62,  63],
+    #                      [ 67,  68]],
+    #                     [[ 82,  83],
+    #                      [ 87,  88]],
+    #                     [[102, 103],
+    #                      [107, 108]]]])
+    # assert np.allclose(y.asnumpy(), expect)
+
+    op = P.StridedSlice(begin_mask=0b1000, end_mask=0b0010, ellipsis_mask=0b0100)
+    y = op(x, (1, 0, 0, 2), (2, 2, 2, 4), (1, 1, 1, 1))
+    expect = np.array([[[[60, 61, 62, 63],
+                         [65, 66, 67, 68],
+                         [70, 71, 72, 73],
+                         [75, 76, 77, 78]],
+                        [[80, 81, 82, 83],
+                         [85, 86, 87, 88],
+                         [90, 91, 92, 93],
+                         [95, 96, 97, 98]],
+                        [[100, 101, 102, 103],
+                         [105, 106, 107, 108],
+                         [110, 111, 112, 113],
+                         [115, 116, 117, 118]]]])
+    assert np.allclose(y.asnumpy(), expect)
+
+    x = Tensor(np.arange(0, 3*4*5).reshape(3, 4, 5).astype(np.float32))
+    y = P.StridedSlice()(x, (1, 0, 0), (2, -3, 3), (1, 1, 3))
+    expect = np.array([[[20]]])
+    assert np.allclose(y.asnumpy(), expect)
+
+    x_np = np.arange(0, 4*5).reshape(4, 5).astype(np.float32)
+    y = Tensor(x_np)[:, ::-1]
+    expect = x_np[:, ::-1]
+    assert np.allclose(y.asnumpy(), expect)