identity: support non-contiguous input
This commit is contained in:
parent
3f818e10e8
commit
23e366e712
|
@ -0,0 +1,100 @@
|
|||
/**
|
||||
* Copyright 2024 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "mindspore/ccsrc/kernel/pyboost/customize/identity.h"
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace pyboost {
|
||||
|
||||
void IdentityCustomizeCallWithoutContigous(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor,
|
||||
void *stream) {
|
||||
// Async
|
||||
PyBoostUtils::DispatchRun(std::make_shared<runtime::PyBoostDeviceTask>([op, x_tensor, stream]() {
|
||||
MS_LOG(DEBUG) << "Run device task Identity start";
|
||||
auto device_context = op->device_context();
|
||||
const auto &outputs = op->outputs();
|
||||
auto input_x_address = std::dynamic_pointer_cast<device::DeviceAddress>(x_tensor->device_address());
|
||||
|
||||
// Malloc for input tensors
|
||||
PyBoostUtils::MallocOpInputs(device_context, x_tensor);
|
||||
|
||||
// Malloc for output tensors
|
||||
auto launch_device_address = runtime::DeviceAddressUtils::CreateDeviceAddress(
|
||||
op->device_context(), outputs[0], x_tensor->storage_info()->ori_shape, op->stream_id());
|
||||
if (!device_context->device_res_manager_->AllocateMemory(launch_device_address.get())) {
|
||||
MS_LOG(EXCEPTION) << "Allocate memory failed";
|
||||
}
|
||||
|
||||
// Get inputs kernel tensors, the not-tensor value will malloc here
|
||||
const auto &input_address_info = PyBoostUtils::GetAddressInfo(device_context, op->input_abs(), x_tensor);
|
||||
|
||||
// Get outputs kernel tensors
|
||||
std::vector<kernel::KernelTensor *> output_kernel_tensor_list{launch_device_address->kernel_tensor().get()};
|
||||
device::DeviceAddressPtrList output_device_address_list{launch_device_address};
|
||||
const auto &output_address_info = std::make_pair(output_kernel_tensor_list, output_device_address_list);
|
||||
|
||||
PyBoostUtils::LaunchKernel(op->primitive(), op->device_context(), input_address_info, output_address_info, stream);
|
||||
auto output_address = std::dynamic_pointer_cast<device::DeviceAddress>(outputs[0]->device_address());
|
||||
output_address->SetStorageInfo(input_x_address->GetStorageInfo());
|
||||
output_address->set_ptr(launch_device_address->GetMutablePtr());
|
||||
MS_LOG(DEBUG) << "Run device task Identity end";
|
||||
}));
|
||||
}
|
||||
|
||||
void IdentityCustomizeCall(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor, void *stream) {
|
||||
// Async
|
||||
PyBoostUtils::DispatchRun(std::make_shared<runtime::PyBoostDeviceTask>([op, x_tensor, stream]() {
|
||||
MS_LOG(DEBUG) << "Run device task Identity start";
|
||||
auto device_context = op->device_context();
|
||||
const auto &outputs = op->outputs();
|
||||
|
||||
// Malloc for input tensors
|
||||
PyBoostUtils::MallocOpInputs(device_context, x_tensor);
|
||||
// Malloc for output tensors
|
||||
PyBoostUtils::MallocOpOutputs(device_context, outputs);
|
||||
|
||||
// Get inputs kernel tensors, the not-tensor value will malloc here
|
||||
const auto &input_address_info = PyBoostUtils::GetAddressInfo(device_context, op->input_abs(), x_tensor);
|
||||
|
||||
// Get outputs kernel tensors
|
||||
const auto &output_address_info = PyBoostUtils::GetAddressInfo(device_context, {op->output_abs()}, outputs);
|
||||
|
||||
PyBoostUtils::LaunchKernel(op->primitive(), op->device_context(), input_address_info, output_address_info, stream);
|
||||
MS_LOG(DEBUG) << "Run device task Identity end";
|
||||
}));
|
||||
}
|
||||
|
||||
// Common Identity entry point: infer the output, prepare device inputs/outputs,
// then pick the launch path based on input contiguity. Returns op->output(0).
tensor::TensorPtr IdentityCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor, void *stream) {
  OpRunner::InferOpOutput(op, x_tensor);

  PyBoostUtils::PrepareOpInputs(op->device_context(), x_tensor);
  PyBoostUtils::PrepareOpOutputs(op->device_context(), op->outputs());

  // Non-contiguous inputs need the storage-info-aware launch path.
  if (!x_tensor->is_contiguous()) {
    MS_LOG(DEBUG) << "Run Identity input without contiguous";
    IdentityCustomizeCallWithoutContigous(op, x_tensor, stream);
  } else {
    MS_LOG(DEBUG) << "Run Identity input contiguous";
    IdentityCustomizeCall(op, x_tensor, stream);
  }
  return op->output(0);
}
|
||||
} // namespace pyboost
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,34 @@
|
|||
/**
|
||||
* Copyright 2024 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_MINDSPORE_CCSRC_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
#define MINDSPORE_MINDSPORE_CCSRC_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
#include <vector>
#include <memory>
#include "ir/tensor.h"
#include "ir/value.h"
#include "runtime/hardware/device_context_manager.h"
#include "kernel/pyboost/op_runner.h"

namespace mindspore {
namespace kernel {
namespace pyboost {
// Common pyboost customize entry for the Identity op: infers the output,
// prepares device inputs/outputs, launches the kernel (with a dedicated path
// for non-contiguous inputs), and returns op->output(0).
// `stream` defaults to nullptr — presumably the backend's default stream is
// used in that case; confirm against the device-specific callers.
tensor::TensorPtr BACKEND_EXPORT IdentityCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor,
                                                   void *stream = nullptr);
}  // namespace pyboost
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_MINDSPORE_CCSRC_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
|
|
@ -34,6 +34,7 @@ py::object ${func_name}_Base(const PrimitivePtr &prim, const py::list &args) {
|
|||
// Run op
|
||||
(void)op->Call(${cast_args});
|
||||
${optional_to_value}
|
||||
PyNativeAlgo::PyBoost::DataSyncForGraph(op);
|
||||
// Update op and op_run_info by op outputs
|
||||
PyNativeAlgo::PyBoost::UpdateOpRunInfo(op, {${grad_args}}, op_run_info);
|
||||
|
||||
|
|
|
@ -1386,6 +1386,18 @@ void PyBoost::UpdateOpRunInfo(const kernel::pyboost::OpPtr &op, const vector<Val
|
|||
}
|
||||
}
|
||||
|
||||
// Sync op outputs to host when running under graph mode.
void PyBoost::DataSyncForGraph(const kernel::pyboost::OpPtr &op) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
    return;
  }
  // If execution mode is Graph Mode in MsContext, the tensor will be the input
  // of a graph executed in Graph Mode; if the graph contains no CNode after
  // optimization, the tensor needs to be synced to host.
  for (const auto &out : op->outputs()) {
    out->data_sync(true);
  }
}
|
||||
|
||||
PrimitivePtr PyBoost::ConvertPrimitive(const py::object &obj) {
|
||||
const auto &adapter = obj.cast<PrimitivePyAdapterPtr>();
|
||||
MS_EXCEPTION_IF_NULL(adapter);
|
||||
|
|
|
@ -211,6 +211,7 @@ struct PyBoost {
|
|||
}
|
||||
return ret;
|
||||
}
|
||||
static void DataSyncForGraph(const kernel::pyboost::OpPtr &op);
|
||||
};
|
||||
|
||||
// Some common functions used in both jit and PackFunc grad
|
||||
|
|
|
@ -26,17 +26,76 @@
|
|||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace pyboost {
|
||||
tensor::TensorPtr IdentityAscendCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
|
||||
OpRunner::InferOpOutput(op, x_tensor);
|
||||
|
||||
PyBoostUtils::PrepareOpInputs(op->device_context(), x_tensor);
|
||||
PyBoostUtils::PrepareOpOutputs(op->device_context(), op->outputs());
|
||||
|
||||
// Ascend launch path for a non-contiguous input. Builds an AclKernelMod for
// Identity by hand, launches it over the input's underlying (ori_shape)
// buffer, then re-points the output tensor at that buffer with the input's
// storage info. NOTE(review): "Contigous" is a typo for "Contiguous".
void IdentityCustomizeCallWithoutContigous(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
  // Async
  PyBoostUtils::DispatchRun(std::make_shared<runtime::PyBoostDeviceTask>([op, x_tensor]() {
    MS_LOG(DEBUG) << "Run device task Identity start";
    auto device_context = op->device_context();
    const auto &outputs = op->outputs();
    // Both shapes are the input's original (pre-view) shape: the kernel copies
    // the whole underlying storage, not the contiguous view.
    auto input_shape = x_tensor->storage_info()->ori_shape;
    const auto &output_shape = x_tensor->storage_info()->ori_shape;
    // Malloc for input tensors
    PyBoostUtils::MallocOpInputs(device_context, x_tensor);
    // Malloc for output tensors: a dedicated launch buffer sized by ori_shape.
    auto launch_device_address = runtime::DeviceAddressUtils::CreateDeviceAddress(
      op->device_context(), outputs[0], x_tensor->storage_info()->ori_shape, op->stream_id());
    if (!device_context->device_res_manager_->AllocateMemory(launch_device_address.get())) {
      MS_LOG(EXCEPTION) << "Allocate memory failed";
    }

    auto identity_kernel = std::make_shared<kernel::AclKernelMod>();
    auto input_x_address = std::dynamic_pointer_cast<device::DeviceAddress>(x_tensor->device_address());

    // Attach host-side shape/type info to the kernel tensors if not present;
    // the ACL converter needs it to build the op description.
    if (!input_x_address->kernel_tensor()->host_info_exist()) {
      input_x_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(x_tensor->shape()),
                                                    std::make_shared<TensorType>(x_tensor->Dtype()), nullptr);
    }
    if (!launch_device_address->kernel_tensor()->host_info_exist()) {
      launch_device_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(output_shape),
                                                          std::make_shared<TensorType>(outputs[0]->Dtype()), nullptr);
    }
    auto input_kernel_tensors = {input_x_address->kernel_tensor().get()};
    auto output_kernel_tensors = {launch_device_address->kernel_tensor().get()};

    // Initialize the ACL kernel; the order below (Init -> CreateAclConverter ->
    // SetDeviceInfo -> Package* -> Resize -> Launch) is required.
    if (!std::static_pointer_cast<KernelMod>(identity_kernel)
           ->Init(prim::kPrimIdentity, input_kernel_tensors, output_kernel_tensors)) {
      MS_LOG(EXCEPTION) << "#dmsg#Kernel build failed:#dmsg#Initialize acl kernel op[Identity] failed.";
    }
    identity_kernel->CreateAclConverter();
    identity_kernel->SetDeviceInfo({input_x_address->format()}, {launch_device_address->format()},
                                   {input_x_address->type_id()}, {launch_device_address->type_id()});

    identity_kernel->PackageInput(kIndex0, input_x_address->format(), &input_shape);
    identity_kernel->PackageOutput(kIndex0, output_shape);
    identity_kernel->SetNeedConvertHostTensor(true);

    if (identity_kernel->Resize(input_kernel_tensors, output_kernel_tensors) != KRET_OK) {
      MS_LOG(EXCEPTION) << "Kernel identity resize failed";
    }
    auto stream_ptr = device_context->device_res_manager_->GetStream(op->stream_id());

    // Workspace memory is sized by the resized kernel and allocated here.
    auto workspace_address = PyBoostUtils::CreateWorkSpaceDeviceAddress(identity_kernel, device_context, "Identity");
    auto workspaces = PyBoostUtils::GetKernelTensorFromAddress(workspace_address);

    if (!identity_kernel->Launch(input_kernel_tensors, workspaces, output_kernel_tensors, stream_ptr)) {
      MS_LOG(EXCEPTION) << "Launch kernel identity failed";
    }
    // Re-point the output tensor at the launch buffer, keeping the input's
    // (non-contiguous) storage info so the output is the same view of it.
    auto output_address = std::dynamic_pointer_cast<device::DeviceAddress>(outputs[0]->device_address());
    output_address->SetStorageInfo(input_x_address->GetStorageInfo());
    output_address->set_ptr(launch_device_address->GetMutablePtr());
    MS_LOG(DEBUG) << "Run device task Identity end";
  }));
}
|
||||
|
||||
void IdentityCustomizeCall(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
|
||||
// Async
|
||||
PyBoostUtils::DispatchRun(std::make_shared<runtime::PyBoostDeviceTask>([op, x_tensor]() {
|
||||
MS_LOG(DEBUG) << "Run device task Identity start";
|
||||
auto device_context = op->device_context();
|
||||
const auto &outputs = op->outputs();
|
||||
auto input_shape = x_tensor->shape();
|
||||
auto output_shape = outputs[0]->shape();
|
||||
// Malloc for input tensors
|
||||
PyBoostUtils::MallocOpInputs(device_context, x_tensor);
|
||||
// Malloc for output tensors
|
||||
|
@ -45,12 +104,13 @@ tensor::TensorPtr IdentityAscendCustomize(const std::shared_ptr<OpRunner> &op, c
|
|||
auto identity_kernel = std::make_shared<kernel::AclKernelMod>();
|
||||
auto input_x_address = std::dynamic_pointer_cast<device::DeviceAddress>(x_tensor->device_address());
|
||||
auto output_address = std::dynamic_pointer_cast<device::DeviceAddress>(outputs[0]->device_address());
|
||||
|
||||
if (!input_x_address->kernel_tensor()->host_info_exist()) {
|
||||
input_x_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(x_tensor->shape()),
|
||||
std::make_shared<TensorType>(x_tensor->Dtype()), nullptr);
|
||||
}
|
||||
if (!output_address->kernel_tensor()->host_info_exist()) {
|
||||
output_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(outputs[0]->shape()),
|
||||
output_address->kernel_tensor()->SetHostInfo(std::make_shared<abstract::TensorShape>(output_shape),
|
||||
std::make_shared<TensorType>(outputs[0]->Dtype()), nullptr);
|
||||
}
|
||||
auto input_kernel_tensors = {input_x_address->kernel_tensor().get()};
|
||||
|
@ -63,40 +123,39 @@ tensor::TensorPtr IdentityAscendCustomize(const std::shared_ptr<OpRunner> &op, c
|
|||
identity_kernel->CreateAclConverter();
|
||||
identity_kernel->SetDeviceInfo({input_x_address->format()}, {output_address->format()},
|
||||
{input_x_address->type_id()}, {output_address->type_id()});
|
||||
auto input_shape = x_tensor->shape();
|
||||
|
||||
identity_kernel->PackageInput(kIndex0, input_x_address->format(), &input_shape);
|
||||
identity_kernel->PackageOutput(kIndex0, outputs[0]->shape());
|
||||
identity_kernel->PackageOutput(kIndex0, output_shape);
|
||||
identity_kernel->SetNeedConvertHostTensor(true);
|
||||
|
||||
if (identity_kernel->Resize(input_kernel_tensors, output_kernel_tensors) != KRET_OK) {
|
||||
MS_LOG(EXCEPTION) << "Kernel identity resize failed";
|
||||
}
|
||||
auto stream_ptr = device_context->device_res_manager_->GetStream(kDefaultStreamIndex);
|
||||
auto stream_ptr = device_context->device_res_manager_->GetStream(op->stream_id());
|
||||
|
||||
auto workspace_sizes = identity_kernel->GetWorkspaceSizeList();
|
||||
std::vector<kernel::KernelTensor *> workspaces;
|
||||
workspaces.reserve(workspace_sizes.size());
|
||||
for (size_t i = 0; i < workspace_sizes.size(); ++i) {
|
||||
auto kernel_tensor = std::make_shared<KernelTensor>(
|
||||
nullptr, workspace_sizes[i], Format::DEFAULT_FORMAT, kTypeUnknown, ShapeVector(),
|
||||
device_context->device_context_key().device_name_, device_context->device_context_key().device_id_);
|
||||
auto device_address = device_context->device_res_manager_->CreateDeviceAddress(kernel_tensor);
|
||||
MS_EXCEPTION_IF_NULL(device_address);
|
||||
if (device_address->GetPtr() == nullptr &&
|
||||
!device_context->device_res_manager_->AllocateMemory(device_address.get())) {
|
||||
MS_LOG(EXCEPTION) << "Allocate dynamic workspace memory failed";
|
||||
}
|
||||
(void)workspaces.emplace_back(device_address->kernel_tensor().get());
|
||||
MS_LOG(DEBUG) << "workspace[" << i << "]:" << workspaces.back()->device_ptr()
|
||||
<< " size:" << workspaces.back()->size();
|
||||
}
|
||||
auto workspace_address = PyBoostUtils::CreateWorkSpaceDeviceAddress(identity_kernel, device_context, "Identity");
|
||||
auto workspaces = PyBoostUtils::GetKernelTensorFromAddress(workspace_address);
|
||||
|
||||
if (!identity_kernel->Launch(input_kernel_tensors, workspaces, output_kernel_tensors, stream_ptr)) {
|
||||
MS_LOG(EXCEPTION) << "Launch kernel identity failed";
|
||||
}
|
||||
MS_LOG(DEBUG) << "Run device task Identity end";
|
||||
}));
|
||||
}
|
||||
|
||||
// Ascend Identity entry point: infer the output, prepare device inputs and
// outputs, then dispatch by input contiguity. Returns op->output(0).
tensor::TensorPtr IdentityAscendCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
  OpRunner::InferOpOutput(op, x_tensor);

  PyBoostUtils::PrepareOpInputs(op->device_context(), x_tensor);
  PyBoostUtils::PrepareOpOutputs(op->device_context(), op->outputs());

  // Non-contiguous inputs go through the hand-built ACL kernel path.
  if (!x_tensor->is_contiguous()) {
    MS_LOG(DEBUG) << "Run Identity input without contiguous";
    IdentityCustomizeCallWithoutContigous(op, x_tensor);
  } else {
    MS_LOG(DEBUG) << "Run Identity input contiguous";
    IdentityCustomizeCall(op, x_tensor);
  }
  return op->output(0);
}
|
||||
} // namespace pyboost
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
/**
|
||||
* Copyright 2024 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/cpu/kernel/pyboost/customize/identity.h"
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include "mindspore/ccsrc/kernel/pyboost/customize/identity.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace pyboost {
|
||||
// CPU backend entry for Identity: delegates to the common pyboost
// IdentityCustomize path (default stream) and returns the op's first output.
tensor::TensorPtr IdentityCPUCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
  MS_LOG(DEBUG) << "Identity call start";
  IdentityCustomize(op, x_tensor);
  MS_LOG(DEBUG) << "Identity call end";
  return op->output(0);
}
|
||||
} // namespace pyboost
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,33 @@
|
|||
/**
|
||||
* Copyright 2024 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
|
||||
#define MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "ir/tensor.h"
|
||||
#include "ir/value.h"
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
#include "kernel/pyboost/op_runner.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace pyboost {
|
||||
tensor::TensorPtr IdentityCPUCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor);
|
||||
} // namespace pyboost
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2024 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/gpu/kernel/pyboost/customize/identity.h"
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
|
||||
#include "mindspore/ccsrc/kernel/pyboost/customize/identity.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace pyboost {
|
||||
// GPU backend entry for Identity: resolves the CUDA stream for the op's
// stream id, delegates to the common IdentityCustomize path, and optionally
// syncs all streams for eager-debug mode.
tensor::TensorPtr IdentityGPUCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor) {
  MS_LOG(DEBUG) << "Identity call start";
  auto stream = device::gpu::GPUDeviceManager::GetInstance().GetStream(op->stream_id());
  IdentityCustomize(op, x_tensor, stream);
  // NOTE(review): `static` caches the synchronize flag at first call — a later
  // runtime change of MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE won't be seen here;
  // confirm this is the intended (perf-motivated) behavior.
  static auto sync = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE);
  if (sync && !op->device_context()->device_res_manager_->SyncAllStreams()) {
    MS_LOG(EXCEPTION) << "SyncStream failed for op Identity.";
  }
  MS_LOG(DEBUG) << "Identity call end";
  return op->output(0);
}
|
||||
} // namespace pyboost
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,33 @@
|
|||
/**
|
||||
* Copyright 2024 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
|
||||
#define MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "ir/tensor.h"
|
||||
#include "ir/value.h"
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
#include "kernel/pyboost/op_runner.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace pyboost {
|
||||
tensor::TensorPtr IdentityGPUCustomize(const std::shared_ptr<OpRunner> &op, const TensorPtr &x_tensor);
|
||||
} // namespace pyboost
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_PYBOOST_CUSTOMIZE_IDENTITY_H_
|
|
@ -1077,6 +1077,24 @@ void DeviceAddressUtils::CreateOutputTensorAddress(DeviceContext *device_context
|
|||
}
|
||||
}
|
||||
|
||||
// Creates a device address for `tensor` sized/shaped by `real_shape` (which may
// differ from tensor->shape(), e.g. the ori_shape of a non-contiguous view).
// The backing memory is NOT allocated here; callers allocate via the device
// resource manager. `stream_id` is recorded on the kernel tensor.
device::DeviceAddressPtr DeviceAddressUtils::CreateDeviceAddress(DeviceContext *device_context,
                                                                 const tensor::TensorPtr &tensor,
                                                                 const ShapeVector &real_shape,
                                                                 const size_t &stream_id) {
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(tensor);
  // Byte size follows real_shape, not the tensor's own (view) shape.
  auto tensor_size = GetTypeByte(TypeIdToType(tensor->data_type())) * SizeOf(real_shape);
  const auto &device_format = GetFormatByTensorShape(device_context, tensor->shape());
  auto kernel_tensor = std::make_shared<kernel::KernelTensor>(
    nullptr, tensor_size, device_format, tensor->data_type(), real_shape,
    device_context->device_context_key().device_name_, device_context->device_context_key().device_id_);
  kernel_tensor->set_stream_id(stream_id);
  device::DeviceAddressPtr device_address = device_context->device_res_manager_->CreateDeviceAddress(kernel_tensor);
  // Guard against a null result before logging/returning it.
  MS_EXCEPTION_IF_NULL(device_address);
  // Fixed log message: the original ran the address straight into "Shape:".
  MS_LOG(DEBUG) << "Create tensor device address " << device_address << ", Shape: " << tensor->shape()
                << ", Type: " << tensor->data_type();
  return device_address;
}
|
||||
|
||||
void DeviceAddressUtils::MallocForOutputs(DeviceContext *device_context,
|
||||
const std::vector<tensor::TensorPtr> &outputs) {
|
||||
for (const auto &output : outputs) {
|
||||
|
|
|
@ -117,6 +117,8 @@ class BACKEND_EXPORT DeviceAddressUtils {
|
|||
|
||||
static void UpdateDeviceAddressHostInfoByNode(const device::DeviceAddressPtr &addr, const AnfNodePtr &node,
|
||||
size_t output_idx);
|
||||
static device::DeviceAddressPtr CreateDeviceAddress(DeviceContext *device_context, const tensor::TensorPtr &tensor,
|
||||
const ShapeVector &real_shape, const size_t &stream_id);
|
||||
};
|
||||
} // namespace runtime
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -9,5 +9,7 @@ identity:
|
|||
function:
|
||||
name: deepcopy
|
||||
dispatch:
|
||||
enable: False
|
||||
Ascend: IdentityAscend
|
||||
enable: True
|
||||
Ascend: IdentityAscend
|
||||
CPU: IdentityCPU
|
||||
GPU: IdentityGPU
|
|
@ -1111,7 +1111,7 @@ def test_empty_like_exception():
|
|||
_pynative_executor.sync()
|
||||
|
||||
|
||||
@pytest.mark.level1
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_arm_ascend_training
|
||||
@pytest.mark.platform_x86_ascend_training
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
|
|
Loading…
Reference in New Issue