From 23e366e712850be41407ebbf3419255da4b4b357 Mon Sep 17 00:00:00 2001
From: nomindcarry
Date: Mon, 19 Feb 2024 22:24:49 -0800
Subject: [PATCH] Support non-contiguous input for Identity

---
 .../kernel/pyboost/customize/identity.cc      | 100 ++++++++++++++++
 .../ccsrc/kernel/pyboost/customize/identity.h |  34 ++++++
 .../op_function/template/pyboost_function.tpl |   1 +
 .../ccsrc/pipeline/pynative/pynative_utils.cc |  12 ++
 .../ccsrc/pipeline/pynative/pynative_utils.h  |   1 +
 .../kernel/pyboost/customize/identity.cc      | 111 ++++++++++++++----
 .../cpu/kernel/pyboost/customize/identity.cc  |  33 ++++++
 .../cpu/kernel/pyboost/customize/identity.h   |  33 ++++++
 .../gpu/kernel/pyboost/customize/identity.cc  |  39 ++++++
 .../gpu/kernel/pyboost/customize/identity.h   |  33 ++++++
 .../runtime/device/device_address_utils.cc    |  18 +++
 .../runtime/device/device_address_utils.h     |   2 +
 mindspore/core/ops/ops_def/identity_op.yaml   |   6 +-
 tests/st/numpy_native/test_array_creations.py |   2 +-
 14 files changed, 396 insertions(+), 29 deletions(-)
 create mode 100644 mindspore/ccsrc/kernel/pyboost/customize/identity.cc
 create mode 100644 mindspore/ccsrc/kernel/pyboost/customize/identity.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/pyboost/customize/identity.cc
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/pyboost/customize/identity.h
 create mode 100644 mindspore/ccsrc/plugin/device/gpu/kernel/pyboost/customize/identity.cc
 create mode 100644 mindspore/ccsrc/plugin/device/gpu/kernel/pyboost/customize/identity.h

diff --git a/mindspore/ccsrc/kernel/pyboost/customize/identity.cc b/mindspore/ccsrc/kernel/pyboost/customize/identity.cc
new file mode 100644
index 00000000000..a2e0793fda2
--- /dev/null
+++ b/mindspore/ccsrc/kernel/pyboost/customize/identity.cc
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mindspore/ccsrc/kernel/pyboost/customize/identity.h"
+#include <memory>
+#include
+
+namespace mindspore {
+namespace kernel {
+namespace pyboost {
+
+void IdentityCustomizeCallWithoutContiguous(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor,
+                                            void *stream) {
+  // Async
+  PyBoostUtils::DispatchRun(std::make_shared<runtime::PyBoostDeviceTask>([op, x_tensor, stream]() {
+    MS_LOG(DEBUG) << "Run device task Identity start";
+    auto device_context = op->device_context();
+    const auto &outputs = op->outputs();
+    auto input_x_address = std::dynamic_pointer_cast<device::DeviceAddress>(x_tensor->device_address());
+
+    // Malloc for input tensors
+    PyBoostUtils::MallocOpInputs(device_context, x_tensor);
+
+    // Malloc for output tensors
+    auto launch_device_address = runtime::DeviceAddressUtils::CreateDeviceAddress(
+      op->device_context(), outputs[0], x_tensor->storage_info()->ori_shape, op->stream_id());
+    if (!device_context->device_res_manager_->AllocateMemory(launch_device_address.get())) {
+      MS_LOG(EXCEPTION) << "Allocate memory failed";
+    }
+
+    // Get input kernel tensors; non-tensor values are malloced here
+    const auto &input_address_info = PyBoostUtils::GetAddressInfo(device_context, op->input_abs(), x_tensor);
+
+    // Get output kernel tensors
+    std::vector<KernelTensor *> output_kernel_tensor_list{launch_device_address->kernel_tensor().get()};
+    device::DeviceAddressPtrList output_device_address_list{launch_device_address};
+    const auto &output_address_info = std::make_pair(output_kernel_tensor_list, output_device_address_list);
+
+    PyBoostUtils::LaunchKernel(op->primitive(), op->device_context(), input_address_info, output_address_info, stream);
+    auto output_address = std::dynamic_pointer_cast<device::DeviceAddress>(outputs[0]->device_address());
+    output_address->SetStorageInfo(input_x_address->GetStorageInfo());
+    output_address->set_ptr(launch_device_address->GetMutablePtr());
+    MS_LOG(DEBUG) << "Run device task Identity end";
+  }));
+}
+
+void IdentityCustomizeCall(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor, void *stream) {
+  // Async
+  PyBoostUtils::DispatchRun(std::make_shared<runtime::PyBoostDeviceTask>([op, x_tensor, stream]() {
+    MS_LOG(DEBUG) << "Run device task Identity start";
+    auto device_context = op->device_context();
+    const auto &outputs = op->outputs();
+
+    // Malloc for input tensors
+    PyBoostUtils::MallocOpInputs(device_context, x_tensor);
+    // Malloc for output tensors
+    PyBoostUtils::MallocOpOutputs(device_context, outputs);
+
+    // Get input kernel tensors; non-tensor values are malloced here
+    const auto &input_address_info = PyBoostUtils::GetAddressInfo(device_context, op->input_abs(), x_tensor);
+
+    // Get output kernel tensors
+    const auto &output_address_info = PyBoostUtils::GetAddressInfo(device_context, {op->output_abs()}, outputs);
+
+    PyBoostUtils::LaunchKernel(op->primitive(), op->device_context(), input_address_info, output_address_info, stream);
+    MS_LOG(DEBUG) << "Run device task Identity end";
+  }));
+}
+
+tensor::TensorPtr IdentityCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor, void *stream) {
+  OpRunner::InferOpOutput(op, x_tensor);
+
+  PyBoostUtils::PrepareOpInputs(op->device_context(), x_tensor);
+  PyBoostUtils::PrepareOpOutputs(op->device_context(), op->outputs());
+
+  if (x_tensor->is_contiguous()) {
+    MS_LOG(DEBUG) << "Run Identity input contiguous";
+    IdentityCustomizeCall(op, x_tensor, stream);
+  } else {
+    MS_LOG(DEBUG) << "Run Identity input without contiguous";
+    IdentityCustomizeCallWithoutContiguous(op, x_tensor, stream);
+  }
+  return op->output(0);
+}
+}  // namespace pyboost
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/pyboost/customize/identity.h b/mindspore/ccsrc/kernel/pyboost/customize/identity.h
new file mode 100644
index 00000000000..b9705d35a20
--- /dev/null
+++ b/mindspore/ccsrc/kernel/pyboost/customize/identity.h
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
+#define MINDSPORE_MINDSPORE_CCSRC_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
+#include <memory>
+#include
+#include "ir/tensor.h"
+#include "ir/value.h"
+#include "runtime/hardware/device_context_manager.h"
+#include "kernel/pyboost/op_runner.h"
+
+namespace mindspore {
+namespace kernel {
+namespace pyboost {
+tensor::TensorPtr BACKEND_EXPORT IdentityCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor,
+                                                   void *stream = nullptr);
+}  // namespace pyboost
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
diff --git a/mindspore/ccsrc/pipeline/pynative/op_function/template/pyboost_function.tpl b/mindspore/ccsrc/pipeline/pynative/op_function/template/pyboost_function.tpl
index ac5a03ccf00..810373f0f62 100644
--- a/mindspore/ccsrc/pipeline/pynative/op_function/template/pyboost_function.tpl
+++ b/mindspore/ccsrc/pipeline/pynative/op_function/template/pyboost_function.tpl
@@ -34,6 +34,7 @@ py::object ${func_name}_Base(const PrimitivePtr &prim, const py::list &args) {
   // Run op
   (void)op->Call(${cast_args});
   ${optional_to_value}
+  PyNativeAlgo::PyBoost::DataSyncForGraph(op);
   // Update op and op_run_info by op outputs
   PyNativeAlgo::PyBoost::UpdateOpRunInfo(op, {${grad_args}}, op_run_info);
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_utils.cc b/mindspore/ccsrc/pipeline/pynative/pynative_utils.cc
index 73cc32fddcb..5ec4fe4fb1c 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_utils.cc
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_utils.cc
@@ -1386,6 +1386,18 @@ void PyBoost::UpdateOpRunInfo(const kernel::pyboost::OpPtr &op, const vector
 
+void PyBoost::DataSyncForGraph(const kernel::pyboost::OpPtr &op) {
+  if (MsContext::GetInstance()->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) {
+    // If the execution mode in MsContext is Graph Mode, the tensor will be an input of a graph that executes in
+    // Graph Mode; if that graph contains no CNode after optimization, the tensor needs to be synced to host.
+    for (const auto &output : op->outputs()) {
+      output->data_sync(true);
+    }
+  }
+}
+
 PrimitivePtr PyBoost::ConvertPrimitive(const py::object &obj) {
   const auto &adapter = obj.cast<PrimitivePyAdapterPtr>();
   MS_EXCEPTION_IF_NULL(adapter);
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_utils.h b/mindspore/ccsrc/pipeline/pynative/pynative_utils.h
index 6f5b449a999..50d3d0a4add 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_utils.h
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_utils.h
@@ -211,6 +211,7 @@ struct PyBoost {
     }
     return ret;
   }
+  static void DataSyncForGraph(const kernel::pyboost::OpPtr &op);
 };
 
 // Some common functions used in both jit and PackFunc grad
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/pyboost/customize/identity.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/pyboost/customize/identity.cc
index 49c2567c686..e63b129e42d 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/pyboost/customize/identity.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/pyboost/customize/identity.cc
@@ -26,17 +26,76 @@
 namespace mindspore {
 namespace kernel {
 namespace pyboost {
-tensor::TensorPtr IdentityAscendCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
-  OpRunner::InferOpOutput(op, x_tensor);
-
-  PyBoostUtils::PrepareOpInputs(op->device_context(), x_tensor);
-  PyBoostUtils::PrepareOpOutputs(op->device_context(), op->outputs());
+void IdentityCustomizeCallWithoutContiguous(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
   // Async
   PyBoostUtils::DispatchRun(std::make_shared<runtime::PyBoostDeviceTask>([op, x_tensor]() {
     MS_LOG(DEBUG) << "Run device task Identity start";
     auto device_context = op->device_context();
     const auto &outputs = op->outputs();
+    auto input_shape = x_tensor->storage_info()->ori_shape;
+    const auto &output_shape = x_tensor->storage_info()->ori_shape;
     // Malloc for input tensors
     PyBoostUtils::MallocOpInputs(device_context, x_tensor);
     // Malloc for output tensors
+    auto launch_device_address = runtime::DeviceAddressUtils::CreateDeviceAddress(
+      op->device_context(), outputs[0], x_tensor->storage_info()->ori_shape, op->stream_id());
+    if (!device_context->device_res_manager_->AllocateMemory(launch_device_address.get())) {
+      MS_LOG(EXCEPTION) << "Allocate memory failed";
+    }
+
+    auto identity_kernel = std::make_shared<AclKernelMod>();
+    auto input_x_address = std::dynamic_pointer_cast<device::DeviceAddress>(x_tensor->device_address());
+
+    if (!input_x_address->kernel_tensor()->host_info_exist()) {
+      input_x_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(x_tensor->shape()),
+                                                    std::make_shared<TensorType>(x_tensor->Dtype()), nullptr);
+    }
+    if (!launch_device_address->kernel_tensor()->host_info_exist()) {
+      launch_device_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(output_shape),
+                                                          std::make_shared<TensorType>(outputs[0]->Dtype()), nullptr);
+    }
+    auto input_kernel_tensors = {input_x_address->kernel_tensor().get()};
+    auto output_kernel_tensors = {launch_device_address->kernel_tensor().get()};
+
+    if (!std::static_pointer_cast<KernelMod>(identity_kernel)
+           ->Init(prim::kPrimIdentity, input_kernel_tensors, output_kernel_tensors)) {
+      MS_LOG(EXCEPTION) << "#dmsg#Kernel build failed:#dmsg#Initialize acl kernel op[Identity] failed.";
+    }
+    identity_kernel->CreateAclConverter();
+    identity_kernel->SetDeviceInfo({input_x_address->format()}, {launch_device_address->format()},
+                                   {input_x_address->type_id()}, {launch_device_address->type_id()});
+
+    identity_kernel->PackageInput(kIndex0, input_x_address->format(), &input_shape);
+    identity_kernel->PackageOutput(kIndex0, output_shape);
+    identity_kernel->SetNeedConvertHostTensor(true);
+
+    if (identity_kernel->Resize(input_kernel_tensors, output_kernel_tensors) != KRET_OK) {
+      MS_LOG(EXCEPTION) << "Kernel identity resize failed";
+    }
+    auto stream_ptr = device_context->device_res_manager_->GetStream(op->stream_id());
+
+    auto workspace_address = PyBoostUtils::CreateWorkSpaceDeviceAddress(identity_kernel, device_context, "Identity");
+    auto workspaces = PyBoostUtils::GetKernelTensorFromAddress(workspace_address);
+
+    if (!identity_kernel->Launch(input_kernel_tensors, workspaces, output_kernel_tensors, stream_ptr)) {
+      MS_LOG(EXCEPTION) << "Launch kernel identity failed";
+    }
+    auto output_address = std::dynamic_pointer_cast<device::DeviceAddress>(outputs[0]->device_address());
+    output_address->SetStorageInfo(input_x_address->GetStorageInfo());
+    output_address->set_ptr(launch_device_address->GetMutablePtr());
+    MS_LOG(DEBUG) << "Run device task Identity end";
+  }));
+}
+
+void IdentityCustomizeCall(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
+  // Async
+  PyBoostUtils::DispatchRun(std::make_shared<runtime::PyBoostDeviceTask>([op, x_tensor]() {
+    MS_LOG(DEBUG) << "Run device task Identity start";
+    auto device_context = op->device_context();
+    const auto &outputs = op->outputs();
+    auto input_shape = x_tensor->shape();
+    auto output_shape = outputs[0]->shape();
     // Malloc for input tensors
     PyBoostUtils::MallocOpInputs(device_context, x_tensor);
     // Malloc for output tensors
@@ -45,12 +104,13 @@ tensor::TensorPtr IdentityAscendCustomize(const std::shared_ptr<OpRunner> &op, c
     auto identity_kernel = std::make_shared<AclKernelMod>();
     auto input_x_address = std::dynamic_pointer_cast<device::DeviceAddress>(x_tensor->device_address());
     auto output_address = std::dynamic_pointer_cast<device::DeviceAddress>(outputs[0]->device_address());
+
     if (!input_x_address->kernel_tensor()->host_info_exist()) {
       input_x_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(x_tensor->shape()),
                                                     std::make_shared<TensorType>(x_tensor->Dtype()), nullptr);
     }
     if (!output_address->kernel_tensor()->host_info_exist()) {
-      output_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(outputs[0]->shape()),
+      output_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(output_shape),
                                                    std::make_shared<TensorType>(outputs[0]->Dtype()), nullptr);
     }
     auto input_kernel_tensors = {input_x_address->kernel_tensor().get()};
@@ -63,40 +123,39 @@ tensor::TensorPtr IdentityAscendCustomize(const std::shared_ptr<OpRunner> &op, c
     identity_kernel->CreateAclConverter();
     identity_kernel->SetDeviceInfo({input_x_address->format()}, {output_address->format()},
                                    {input_x_address->type_id()}, {output_address->type_id()});
-    auto input_shape = x_tensor->shape();
     identity_kernel->PackageInput(kIndex0, input_x_address->format(), &input_shape);
-    identity_kernel->PackageOutput(kIndex0, outputs[0]->shape());
+    identity_kernel->PackageOutput(kIndex0, output_shape);
     identity_kernel->SetNeedConvertHostTensor(true);
 
     if (identity_kernel->Resize(input_kernel_tensors, output_kernel_tensors) != KRET_OK) {
       MS_LOG(EXCEPTION) << "Kernel identity resize failed";
     }
-    auto stream_ptr = device_context->device_res_manager_->GetStream(kDefaultStreamIndex);
+    auto stream_ptr = device_context->device_res_manager_->GetStream(op->stream_id());
 
-    auto workspace_sizes = identity_kernel->GetWorkspaceSizeList();
-    std::vector<KernelTensor *> workspaces;
-    workspaces.reserve(workspace_sizes.size());
-    for (size_t i = 0; i < workspace_sizes.size(); ++i) {
-      auto kernel_tensor = std::make_shared<KernelTensor>(
-        nullptr, workspace_sizes[i], Format::DEFAULT_FORMAT, kTypeUnknown, ShapeVector(),
-        device_context->device_context_key().device_name_,
-        device_context->device_context_key().device_id_);
-      auto device_address = device_context->device_res_manager_->CreateDeviceAddress(kernel_tensor);
-      MS_EXCEPTION_IF_NULL(device_address);
-      if (device_address->GetPtr() == nullptr &&
-          !device_context->device_res_manager_->AllocateMemory(device_address.get())) {
-        MS_LOG(EXCEPTION) << "Allocate dynamic workspace memory failed";
-      }
-      (void)workspaces.emplace_back(device_address->kernel_tensor().get());
-      MS_LOG(DEBUG) << "workspace[" << i << "]:" << workspaces.back()->device_ptr()
-                    << " size:" << workspaces.back()->size();
-    }
+    auto workspace_address = PyBoostUtils::CreateWorkSpaceDeviceAddress(identity_kernel, device_context, "Identity");
+    auto workspaces = PyBoostUtils::GetKernelTensorFromAddress(workspace_address);
 
     if (!identity_kernel->Launch(input_kernel_tensors, workspaces, output_kernel_tensors, stream_ptr)) {
       MS_LOG(EXCEPTION) << "Launch kernel identity failed";
     }
     MS_LOG(DEBUG) << "Run device task Identity end";
   }));
+}
+
+tensor::TensorPtr IdentityAscendCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
+  OpRunner::InferOpOutput(op, x_tensor);
+
+  PyBoostUtils::PrepareOpInputs(op->device_context(), x_tensor);
+  PyBoostUtils::PrepareOpOutputs(op->device_context(), op->outputs());
+
+  if (x_tensor->is_contiguous()) {
+    MS_LOG(DEBUG) << "Run Identity input contiguous";
+    IdentityCustomizeCall(op, x_tensor);
+  } else {
+    MS_LOG(DEBUG) << "Run Identity input without contiguous";
+    IdentityCustomizeCallWithoutContiguous(op, x_tensor);
+  }
   return op->output(0);
 }
 }  // namespace pyboost
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/pyboost/customize/identity.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/pyboost/customize/identity.cc
new file mode 100644
index 00000000000..476b3b8602c
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/pyboost/customize/identity.cc
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/cpu/kernel/pyboost/customize/identity.h"
+#include <memory>
+#include
+#include "mindspore/ccsrc/kernel/pyboost/customize/identity.h"
+
+namespace mindspore {
+namespace kernel {
+namespace pyboost {
+tensor::TensorPtr IdentityCPUCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
+  MS_LOG(DEBUG) << "Identity call start";
+  IdentityCustomize(op, x_tensor);
+  MS_LOG(DEBUG) << "Identity call end";
+  return op->output(0);
+}
+}  // namespace pyboost
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/pyboost/customize/identity.h b/mindspore/ccsrc/plugin/device/cpu/kernel/pyboost/customize/identity.h
new file mode 100644
index 00000000000..24dec4a8ee0
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/pyboost/customize/identity.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
+#define MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
+#include <memory>
+#include
+#include "ir/tensor.h"
+#include "ir/value.h"
+#include "runtime/hardware/device_context_manager.h"
+#include "kernel/pyboost/op_runner.h"
+
+namespace mindspore {
+namespace kernel {
+namespace pyboost {
+tensor::TensorPtr IdentityCPUCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor);
+}  // namespace pyboost
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/pyboost/customize/identity.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/pyboost/customize/identity.cc
new file mode 100644
index 00000000000..431d248c0c1
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/pyboost/customize/identity.cc
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "plugin/device/gpu/kernel/pyboost/customize/identity.h"
+#include <memory>
+#include
+#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
+#include "mindspore/ccsrc/kernel/pyboost/customize/identity.h"
+
+namespace mindspore {
+namespace kernel {
+namespace pyboost {
+tensor::TensorPtr IdentityGPUCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
+  MS_LOG(DEBUG) << "Identity call start";
+  auto stream = device::gpu::GPUDeviceManager::GetInstance().GetStream(op->stream_id());
+  IdentityCustomize(op, x_tensor, stream);
+  static auto sync = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE);
+  if (sync && !op->device_context()->device_res_manager_->SyncAllStreams()) {
+    MS_LOG(EXCEPTION) << "SyncStream failed for op Identity.";
+  }
+  MS_LOG(DEBUG) << "Identity call end";
+  return op->output(0);
+}
+}  // namespace pyboost
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/pyboost/customize/identity.h b/mindspore/ccsrc/plugin/device/gpu/kernel/pyboost/customize/identity.h
new file mode 100644
index 00000000000..5eb8527a3e3
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/pyboost/customize/identity.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
+#define MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
+#include <memory>
+#include
+#include "ir/tensor.h"
+#include "ir/value.h"
+#include "runtime/hardware/device_context_manager.h"
+#include "kernel/pyboost/op_runner.h"
+
+namespace mindspore {
+namespace kernel {
+namespace pyboost {
+tensor::TensorPtr IdentityGPUCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor);
+}  // namespace pyboost
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
diff --git a/mindspore/ccsrc/runtime/device/device_address_utils.cc b/mindspore/ccsrc/runtime/device/device_address_utils.cc
index 8c7855cd81b..71496cd794f 100644
--- a/mindspore/ccsrc/runtime/device/device_address_utils.cc
+++ b/mindspore/ccsrc/runtime/device/device_address_utils.cc
@@ -1077,6 +1077,24 @@ void DeviceAddressUtils::CreateOutputTensorAddress(DeviceContext *device_context
   }
 }
 
+device::DeviceAddressPtr DeviceAddressUtils::CreateDeviceAddress(DeviceContext *device_context,
+                                                                 const tensor::TensorPtr &tensor,
+                                                                 const ShapeVector &real_shape,
+                                                                 const size_t &stream_id) {
+  MS_EXCEPTION_IF_NULL(device_context);
+  MS_EXCEPTION_IF_NULL(tensor);
+  auto tensor_size = GetTypeByte(TypeIdToType(tensor->data_type())) * SizeOf(real_shape);
+  const auto &device_format = GetFormatByTensorShape(device_context, tensor->shape());
+  auto kernel_tensor = std::make_shared<kernel::KernelTensor>(
+    nullptr, tensor_size, device_format, tensor->data_type(), real_shape,
+    device_context->device_context_key().device_name_, device_context->device_context_key().device_id_);
+  kernel_tensor->set_stream_id(stream_id);
+  device::DeviceAddressPtr device_address = device_context->device_res_manager_->CreateDeviceAddress(kernel_tensor);
+  MS_LOG(DEBUG) << "Create tensor device address " << device_address << ", Shape: " << tensor->shape()
+                << ", Type: " << tensor->data_type();
+  return device_address;
+}
+
 void DeviceAddressUtils::MallocForOutputs(DeviceContext *device_context,
                                           const std::vector<tensor::TensorPtr> &outputs) {
   for (const auto &output : outputs) {
diff --git a/mindspore/ccsrc/runtime/device/device_address_utils.h b/mindspore/ccsrc/runtime/device/device_address_utils.h
index c046eb7531d..9bdff1c2f0d 100644
--- a/mindspore/ccsrc/runtime/device/device_address_utils.h
+++ b/mindspore/ccsrc/runtime/device/device_address_utils.h
@@ -117,6 +117,8 @@ class BACKEND_EXPORT DeviceAddressUtils {
   static void UpdateDeviceAddressHostInfoByNode(const device::DeviceAddressPtr &addr, const AnfNodePtr &node,
                                                 size_t output_idx);
+  static device::DeviceAddressPtr CreateDeviceAddress(DeviceContext *device_context, const tensor::TensorPtr &tensor,
+                                                      const ShapeVector &real_shape, const size_t &stream_id);
 };
 }  // namespace runtime
 }  // namespace mindspore
diff --git a/mindspore/core/ops/ops_def/identity_op.yaml b/mindspore/core/ops/ops_def/identity_op.yaml
index 1fe28bd7d29..3694eb82af8 100644
--- a/mindspore/core/ops/ops_def/identity_op.yaml
+++ b/mindspore/core/ops/ops_def/identity_op.yaml
@@ -9,5 +9,7 @@ identity:
   function:
     name: deepcopy
   dispatch:
-    enable: False
-    Ascend: IdentityAscend
\ No newline at end of file
+    enable: True
+    Ascend: IdentityAscend
+    CPU: IdentityCPU
+    GPU: IdentityGPU
\ No newline at end of file
diff --git a/tests/st/numpy_native/test_array_creations.py b/tests/st/numpy_native/test_array_creations.py
index c001c27a174..1363b7d6cfb 100644
--- a/tests/st/numpy_native/test_array_creations.py
+++ b/tests/st/numpy_native/test_array_creations.py
@@ -1111,7 +1111,7 @@ def test_empty_like_exception():
         _pynative_executor.sync()
 
 
-@pytest.mark.level1
+@pytest.mark.level0
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.platform_x86_gpu_training
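
Editor's note (not part of the patch): a minimal sketch of what this change enables from the Python side. It assumes `ops.deepcopy` is the functional entry point generated from `identity_op.yaml` (`function: name: deepcopy`) and that `transpose()` returns a non-contiguous view in PyNative mode; both are assumptions about the surrounding framework, not something this patch states.

```python
# Hedged usage sketch, not part of the patch. Assumes ops.deepcopy is the
# functional API generated from identity_op.yaml and that transpose() yields
# a non-contiguous view under PyNative mode.
import numpy as np
import mindspore as ms
from mindspore import Tensor, ops

ms.set_context(mode=ms.PYNATIVE_MODE)

x = Tensor(np.arange(6).reshape(2, 3).astype(np.float32))
x_t = x.transpose()        # potentially a non-contiguous view of x
y = ops.deepcopy(x_t)      # dispatches to the pyboost Identity kernel
assert (y.asnumpy() == x_t.asnumpy()).all()
```

On the non-contiguous path, the patch launches the kernel against the input's original (`ori_shape`) layout and then attaches the input's storage info and the freshly allocated pointer to the output device address, so the copy preserves the view semantics instead of requiring a contiguous input.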