forked from mindspore-Ecosystem/mindspore
!37274 support ascend cloud infer
Merge pull request !37274 from zhengyuanhua/br3
This commit is contained in:
commit
69aa258eb2
|
@ -84,7 +84,7 @@ std::string OpAdapterImpl::GetCustomOpType(const PrimitivePtr &prim) const {
|
|||
MS_EXCEPTION_IF_NULL(prim);
|
||||
auto value = prim->GetAttr("reg_op_name");
|
||||
if (value == nullptr) {
|
||||
MS_LOG(ERROR) << "Custom op has no func_type attr.";
|
||||
MS_LOG(ERROR) << "Custom op has no reg_op_name attr.";
|
||||
return "";
|
||||
}
|
||||
auto op_type = GetValue<std::string>(value);
|
||||
|
|
|
@ -1441,6 +1441,10 @@ bool MSANFModelParser::BuildAttrForFuncGraph(const FuncGraphPtr &outputFuncGraph
|
|||
outputFuncGraph->set_attr(attr_proto.name(), ParseAttrInSingleScalar_int32_t_bool(attr_proto));
|
||||
break;
|
||||
}
|
||||
case mind_ir::AttributeProto_AttributeType_INT32: {
|
||||
outputFuncGraph->set_attr(attr_proto.name(), ParseAttrInSingleScalar_int32_t_int32_t(attr_proto));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
MS_LOG(ERROR) << "Obtain attr for graph has not support input type: " << attr_type << "!";
|
||||
return false;
|
||||
|
|
|
@ -202,6 +202,13 @@ if(DEFINED ENV{MSLITE_ENABLE_CLOUD_FUSION_INFERENCE})
|
|||
set(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE $ENV{MSLITE_ENABLE_CLOUD_FUSION_INFERENCE})
|
||||
endif()
|
||||
|
||||
if(MSLITE_ENABLE_ACL AND MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
|
||||
set(PLATFORM_ARM64 off)
|
||||
set(PLATFORM_ARM32 off)
|
||||
set(MSLITE_ENABLE_FP16 off)
|
||||
set(ENABLE_NEON off)
|
||||
endif()
|
||||
|
||||
if(MACHINE_LINUX_ARM64)
|
||||
add_compile_definitions(MACHINE_LINUX_ARM64)
|
||||
add_compile_definitions(LINUX_RUNTIME)
|
||||
|
|
|
@ -31,7 +31,13 @@ if(MSLITE_DEPS_OPENCV)
|
|||
endif()
|
||||
|
||||
if(MSLITE_DEPS_MKLDNN)
|
||||
include(${TOP_DIR}/cmake/external_libs/mkl_dnn.cmake)
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
set(USE_MS_THREADPOOL_FOR_DNNL ON)
|
||||
endif()
|
||||
if(USE_MS_THREADPOOL_FOR_DNNL)
|
||||
add_compile_definitions(USE_MS_THREADPOOL_FOR_DNNL)
|
||||
endif()
|
||||
include(${TOP_DIR}/cmake/external_libs/mkl_dnn.cmake)
|
||||
endif()
|
||||
|
||||
if(MSLITE_DEPS_LIBEVENT)
|
||||
|
@ -47,6 +53,7 @@ if(MSLITE_DEPS_PYBIND11)
|
|||
include_directories(${Python3_NumPy_INCLUDE_DIRS})
|
||||
include_directories(${TOP_DIR})
|
||||
include_directories(${CORE_DIR})
|
||||
set(PYBIND11_CPP_STANDARD -std=c++17)
|
||||
include(${TOP_DIR}/cmake/external_libs/pybind11.cmake)
|
||||
endif()
|
||||
endif()
|
||||
|
|
|
@ -421,9 +421,13 @@ add_subdirectory(runtime/kernel/cpu)
|
|||
add_library(lite_src_mid OBJECT ${LITE_SRC})
|
||||
add_dependencies(lite_src_mid fbs_src)
|
||||
|
||||
if(MSLITE_ENABLE_ACL AND NOT MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
|
||||
if(MSLITE_ENABLE_ACL)
|
||||
include_directories(${TOP_DIR}/graphengine/inc/external)
|
||||
add_subdirectory(runtime/kernel/ascend)
|
||||
if(NOT MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
|
||||
add_subdirectory(runtime/kernel/ascend)
|
||||
else()
|
||||
add_compile_definitions(ENABLE_CLOUD_FUSION_INFERENCE)
|
||||
endif()
|
||||
link_directories(${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
endif()
|
||||
|
||||
|
|
|
@ -13,6 +13,8 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
|
|||
add_compile_definitions(USE_GLOG)
|
||||
string(REPLACE "-fno-rtti" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||
string(REPLACE "-fno-rtti" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
|
||||
string(REPLACE "-fno-exceptions" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||
string(REPLACE "-fno-exceptions" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
|
||||
add_compile_definitions(ENABLE_CLOUD_FUSION_INFERENCE)
|
||||
remove_definitions(-DBUILD_LITE_INFERENCE)
|
||||
set(MINDIR_MODEL_SRC
|
||||
|
@ -165,7 +167,9 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
|
|||
|
||||
if(MSLITE_ENABLE_ACL)
|
||||
include_directories(${TOP_DIR}/graphengine/inc/external)
|
||||
add_subdirectory(kernel/ascend)
|
||||
link_directories(${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
target_link_libraries(mindspore-extendrt ascend_kernel_mid)
|
||||
endif()
|
||||
|
||||
if(SUPPORT_CUDA)
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
include_directories(${TOP_DIR}/graphengine/inc/external)
|
||||
include_directories(${TOP_DIR}/mindspore)
|
||||
include_directories(${TOP_DIR}/mindspore/lite/src)
|
||||
|
||||
find_library(ge_graph libgraph.so ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
|
||||
file(GLOB_RECURSE ASCEND_SRC ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
"custom_ascend_kernel.cc"
|
||||
"model/*.cc"
|
||||
)
|
||||
|
||||
add_library(ascend_kernel_mid OBJECT ${ASCEND_SRC})
|
||||
|
||||
add_dependencies(ascend_kernel_mid fbs_inner_src)
|
||||
if("${MSLITE_REGISTRY_DEVICE}" STREQUAL "SD3403" AND PLATFORM_ARM64)
|
||||
find_library(ge_graph libgraph.so ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(acl libascendcl.so ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(acl_retr libacl_retr.so ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(acl_cblas libacl_cblas.so ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
find_library(acl_runtime libruntime.so ${ASCEND_CANN_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH})
|
||||
target_link_libraries(ascend_kernel_mid ${ge_graph} ${acl} ${acl_retr} ${acl_cblas} ${acl_runtime})
|
||||
else()
|
||||
target_link_libraries(ascend_kernel_mid ${ge_graph} ${ge_compiler}
|
||||
${acl_retr} ${acl_cblas} ${acl_dvpp} ${acl_runtime} ${libplatform}
|
||||
${libcompress} ${libopskernel} ${libaicore_utils} ${libaicpu_engine_common} ${acl})
|
||||
endif()
|
|
@ -0,0 +1,242 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "extendrt/kernel/ascend/custom_ascend_kernel.h"
|
||||
#include <utility>
|
||||
#include "include/registry/register_kernel.h"
|
||||
#include "include/api/types.h"
|
||||
#include "include/api/data_type.h"
|
||||
#include "extendrt/kernel/ascend/model/model_infer.h"
|
||||
#include "extendrt/kernel/ascend/options/acl_options_parser.h"
|
||||
#include "core/ops/custom.h"
|
||||
#include "plugin/factory/ms_factory.h"
|
||||
#include "src/common/log_util.h"
|
||||
#include "common/log_adapter.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
CustomAscendKernelMod::CustomAscendKernelMod()
|
||||
: load_model_(false), acl_options_(nullptr), dyn_shape_proc_(nullptr), model_infer_(nullptr), input_data_idx_(0) {}
|
||||
|
||||
CustomAscendKernelMod::~CustomAscendKernelMod() {
|
||||
if (load_model_) {
|
||||
int ret = model_infer_->Finalize();
|
||||
if (ret != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Model finalize failed.";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CustomAscendKernelMod::RecordInputDataIndex(const std::vector<KernelTensorPtr> &inputs) {
|
||||
for (size_t idx = 0; idx < inputs.size(); ++idx) {
|
||||
if (inputs[idx] == nullptr) {
|
||||
MS_LOG(ERROR) << "Input " << idx << " is invalid.";
|
||||
return;
|
||||
}
|
||||
if (inputs[idx]->GetData() == nullptr) {
|
||||
input_data_idx_ = idx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool CustomAscendKernelMod::InitParam(const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs) {
|
||||
if (inputs.empty() || outputs.empty()) {
|
||||
MS_LOG(ERROR) << "Custom kernel has empty inputs or outputs, which is invalid.";
|
||||
return false;
|
||||
}
|
||||
inputs_.assign(inputs.begin(), inputs.end() - 1);
|
||||
outputs_.assign(outputs.begin(), outputs.end());
|
||||
acl_options_ = std::make_shared<AclModelOptions>();
|
||||
if (acl_options_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Create AclModelOptions failed.";
|
||||
return false;
|
||||
}
|
||||
// AclOptionsParser parser;
|
||||
// if (parser.ParseAclOptions(context_, &acl_options_) != lite::RET_OK) {
|
||||
// MS_LOG(ERROR) << "Parse model options failed.";
|
||||
// return false;
|
||||
// }
|
||||
// last input is om data tensor
|
||||
int idx = inputs.size() - 1;
|
||||
if (inputs[idx] == nullptr || inputs[idx]->GetData() == nullptr) {
|
||||
MS_LOG(ERROR) << "Input " << idx << " is invalid.";
|
||||
return false;
|
||||
}
|
||||
Buffer om_data(inputs[idx]->GetData()->addr, inputs[idx]->GetData()->size);
|
||||
model_infer_ = std::make_shared<ModelInfer>(om_data, acl_options_);
|
||||
if (model_infer_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Create ModelInfer failed.";
|
||||
return false;
|
||||
}
|
||||
RecordInputDataIndex(inputs);
|
||||
dyn_shape_proc_ = std::make_shared<DynShapeProcess>(acl_options_, input_data_idx_);
|
||||
if (dyn_shape_proc_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Create DynShapeProcess failed.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CustomAscendKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs) {
|
||||
if (load_model_) {
|
||||
MS_LOG(INFO) << "Om has been loaded in custom kernel.";
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
auto kernel_ptr = std::dynamic_pointer_cast<ops::Custom>(base_operator);
|
||||
if (!kernel_ptr) {
|
||||
MS_LOG(ERROR) << "Cast Custom ops failed!";
|
||||
return false;
|
||||
}
|
||||
if (!InitParam(inputs, outputs)) {
|
||||
MS_LOG(ERROR) << "Init param failed.";
|
||||
return false;
|
||||
}
|
||||
if (LoadModel() != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Load model failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
load_model_ = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
int CustomAscendKernelMod::LoadModel() {
|
||||
int ret = model_infer_->Init();
|
||||
if (ret != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Model infer init failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
ret = model_infer_->Load();
|
||||
if (ret != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Load om data failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
acl_options_->batch_size = model_infer_->GetDynamicBatch();
|
||||
acl_options_->image_size = model_infer_->GetDynamicImage();
|
||||
|
||||
MS_LOG(INFO) << "Load om data success.";
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
int CustomAscendKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs,
|
||||
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) {
|
||||
if (!load_model_) {
|
||||
MS_LOG(WARNING) << "Model has not been loaded, start to load when resize.";
|
||||
if (!Init(base_operator, inputs, outputs)) {
|
||||
MS_LOG(ERROR) << "Load model failed when resize.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
}
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
int CustomAscendKernelMod::SetInputAndOutputAddr(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
if ((inputs_.size() + 1) != inputs.size()) {
|
||||
MS_LOG(ERROR) << "Size of inputs in init [" << (inputs_.size() + 1) << "] and "
|
||||
<< "size of inputs in launch [" << inputs.size() << "] are not equal.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
if (outputs_.size() != outputs.size()) {
|
||||
MS_LOG(ERROR) << "Size of outputs in init (" << outputs_.size() << ") and "
|
||||
<< "size of outputs in launch (" << outputs.size() << ") are not equal.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
for (size_t i = 0; i < inputs_.size(); ++i) {
|
||||
if (inputs[i]->addr == nullptr || inputs[i]->size == 0) {
|
||||
MS_LOG(ERROR) << "Input " << i << " addr is invalid.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
inputs_[i]->SetData(inputs[i]);
|
||||
}
|
||||
for (size_t j = 0; j < outputs_.size(); ++j) {
|
||||
if (outputs[j]->addr == nullptr || inputs[j]->size == 0) {
|
||||
MS_LOG(ERROR) << "Output " << j << " addr is invalid.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
outputs_[j]->SetData(outputs[j]);
|
||||
}
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
bool CustomAscendKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
|
||||
if (!load_model_) {
|
||||
MS_LOG(ERROR) << "Init custom ascend kernel has been not ready.";
|
||||
return false;
|
||||
}
|
||||
if (SetInputAndOutputAddr(inputs, outputs) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Check input and output param failed.";
|
||||
return false;
|
||||
}
|
||||
if (dyn_shape_proc_->ProcDynamicInput(&inputs_) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Proc dynamic batch size input failed.";
|
||||
return false;
|
||||
}
|
||||
if (model_infer_->Inference(inputs_, outputs_) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Custom kernel execute failed.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// std::shared_ptr<kernel::Kernel> CustomCreateKernel(const std::vector<mindspore::MSTensor> &inputs,
|
||||
// const std::vector<mindspore::MSTensor> &outputs,
|
||||
// const schema::Primitive *primitive, const mindspore::Context *ctx)
|
||||
// {
|
||||
// if (primitive == nullptr) {
|
||||
// MS_LOG(ERROR) << "Primitive is nullptr.";
|
||||
// return nullptr;
|
||||
// }
|
||||
// if (primitive->value_type() != schema::PrimitiveType_Custom) {
|
||||
// MS_LOG(ERROR) << "Primitive type is not PrimitiveType_Custom";
|
||||
// return nullptr;
|
||||
// }
|
||||
//
|
||||
// auto kernel = std::make_shared<CustomAscendKernel>(inputs, outputs, primitive, ctx);
|
||||
// if (kernel == nullptr) {
|
||||
// MS_LOG(ERROR) << "New custom kernel is nullptr";
|
||||
// return nullptr;
|
||||
// }
|
||||
// return kernel;
|
||||
// }
|
||||
|
||||
MS_KERNEL_FACTORY_REG(KernelMod, CustomAscend, CustomAscendKernelMod);
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
||||
namespace mindspore {
|
||||
namespace registry {
|
||||
namespace {
|
||||
const auto kFloat32 = DataType::kNumberTypeFloat32;
|
||||
const auto kFloat16 = DataType::kNumberTypeFloat16;
|
||||
const auto kInt32 = DataType::kNumberTypeInt32;
|
||||
const auto kInt8 = DataType::kNumberTypeInt8;
|
||||
const auto kUInt8 = DataType::kNumberTypeUInt8;
|
||||
const auto kBool = DataType::kNumberTypeBool;
|
||||
} // namespace
|
||||
// REGISTER_CUSTOM_KERNEL(ASCEND, ACL, kFloat32, ACL, kernel::acl::CustomCreateKernel)
|
||||
// REGISTER_CUSTOM_KERNEL(ASCEND, ACL, kFloat16, ACL, kernel::acl::CustomCreateKernel)
|
||||
// REGISTER_CUSTOM_KERNEL(ASCEND, ACL, kInt32, ACL, kernel::acl::CustomCreateKernel)
|
||||
// REGISTER_CUSTOM_KERNEL(ASCEND, ACL, kInt8, ACL, kernel::acl::CustomCreateKernel)
|
||||
// REGISTER_CUSTOM_KERNEL(ASCEND, ACL, kUInt8, ACL, kernel::acl::CustomCreateKernel)
|
||||
// REGISTER_CUSTOM_KERNEL(ASCEND, ACL, kBool, ACL, kernel::acl::CustomCreateKernel)
|
||||
} // namespace registry
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,68 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_CUSTOM_ASCEND_KERNEL_H_
|
||||
#define MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_CUSTOM_ASCEND_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <map>
|
||||
#include "extendrt/kernel/ascend/options/acl_model_options.h"
|
||||
#include "extendrt/kernel/ascend/model/model_infer.h"
|
||||
#include "extendrt/kernel/ascend/model/dyn_shape_process.h"
|
||||
#include "include/api/types.h"
|
||||
#include "include/api/context.h"
|
||||
#include "kernel/kernel.h"
|
||||
#include "kernel/common_utils.h"
|
||||
#include "include/errorcode.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
class CustomAscendKernelMod : public kernel::KernelMod {
|
||||
public:
|
||||
CustomAscendKernelMod();
|
||||
~CustomAscendKernelMod() override;
|
||||
|
||||
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs) override;
|
||||
|
||||
int Resize(
|
||||
const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs,
|
||||
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost = std::map<uint32_t, tensor::TensorPtr>()) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
|
||||
|
||||
private:
|
||||
void RecordInputDataIndex(const std::vector<KernelTensorPtr> &inputs);
|
||||
bool InitParam(const std::vector<KernelTensorPtr> &inputs, const std::vector<KernelTensorPtr> &outputs);
|
||||
int SetInputAndOutputAddr(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
int LoadModel();
|
||||
|
||||
bool load_model_;
|
||||
std::vector<KernelTensorPtr> inputs_;
|
||||
std::vector<KernelTensorPtr> outputs_;
|
||||
AclModelOptionsPtr acl_options_;
|
||||
DynShapeProcPtr dyn_shape_proc_;
|
||||
ModelInferPtr model_infer_;
|
||||
size_t input_data_idx_;
|
||||
};
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_CUSTOM_ASCEND_KERNEL_H_
|
|
@ -0,0 +1,60 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "extendrt/kernel/ascend/model/acl_env_guard.h"
|
||||
#include "common/log_adapter.h"
|
||||
#include "acl/acl.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
std::shared_ptr<AclEnvGuard> AclEnvGuard::global_acl_env_ = nullptr;
|
||||
std::mutex AclEnvGuard::global_acl_env_mutex_;
|
||||
|
||||
AclEnvGuard::AclEnvGuard(std::string_view cfg_file) {
|
||||
errno_ = aclInit(cfg_file.data());
|
||||
if (errno_ != ACL_ERROR_NONE && errno_ != ACL_ERROR_REPEAT_INITIALIZE) {
|
||||
MS_LOG(ERROR) << "Execute aclInit Failed";
|
||||
return;
|
||||
}
|
||||
MS_LOG(INFO) << "Acl init success";
|
||||
}
|
||||
|
||||
AclEnvGuard::~AclEnvGuard() { (void)aclFinalize(); }
|
||||
|
||||
std::shared_ptr<AclEnvGuard> AclEnvGuard::GetAclEnv(std::string_view cfg_file) {
|
||||
std::shared_ptr<AclEnvGuard> acl_env;
|
||||
|
||||
std::lock_guard<std::mutex> lock(global_acl_env_mutex_);
|
||||
acl_env = global_acl_env_;
|
||||
if (acl_env != nullptr) {
|
||||
MS_LOG(INFO) << "Acl has been initialized, skip.";
|
||||
if (!cfg_file.empty()) {
|
||||
MS_LOG(WARNING) << "Dump config file option " << cfg_file << " is ignored.";
|
||||
}
|
||||
} else {
|
||||
acl_env = std::make_shared<AclEnvGuard>(cfg_file);
|
||||
aclError ret = acl_env->GetErrno();
|
||||
if (ret != ACL_ERROR_NONE && ret != ACL_ERROR_REPEAT_INITIALIZE) {
|
||||
MS_LOG(ERROR) << "Execute aclInit Failed";
|
||||
return nullptr;
|
||||
}
|
||||
global_acl_env_ = acl_env;
|
||||
MS_LOG(INFO) << "Acl init success";
|
||||
}
|
||||
return acl_env;
|
||||
}
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_ACL_ENV_GUARD_H_
|
||||
#define MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_ACL_ENV_GUARD_H_
|
||||
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include "acl/acl_base.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
class AclEnvGuard {
|
||||
public:
|
||||
explicit AclEnvGuard(std::string_view cfg_file);
|
||||
~AclEnvGuard();
|
||||
aclError GetErrno() const { return errno_; }
|
||||
static std::shared_ptr<AclEnvGuard> GetAclEnv(std::string_view cfg_file);
|
||||
|
||||
private:
|
||||
static std::shared_ptr<AclEnvGuard> global_acl_env_;
|
||||
static std::mutex global_acl_env_mutex_;
|
||||
|
||||
aclError errno_;
|
||||
};
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_ACL_ENV_GUARD_H_
|
|
@ -0,0 +1,179 @@
|
|||
/**
|
||||
* Copyright 2021-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "extendrt/kernel/ascend/model/dyn_shape_process.h"
|
||||
#include <utility>
|
||||
#include "mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h"
|
||||
#include "include/errorcode.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
namespace {
|
||||
constexpr auto kInputDimNum = 4;
|
||||
constexpr auto kNHWCHeightIdx = 1;
|
||||
constexpr auto kNHWCWidthIdx = 2;
|
||||
constexpr auto kNCHWHeightIdx = 2;
|
||||
constexpr auto kNCHWWidthIdx = 3;
|
||||
constexpr auto kImageSizeHwNum = 2;
|
||||
} // namespace
|
||||
|
||||
int DynShapeProcess::ProcDynamicInput(std::vector<KernelTensorPtr> *const inputs) {
|
||||
MS_CHECK_TRUE_MSG(acl_options_ != nullptr, lite::RET_ERROR, "Acl options ptr is nullptr.");
|
||||
if (acl_options_->batch_size.empty() && acl_options_->image_size.empty()) {
|
||||
MS_LOG(INFO) << "Inputs are not dynamic mode.";
|
||||
return lite::RET_OK;
|
||||
}
|
||||
if (!acl_options_->batch_size.empty() && !acl_options_->image_size.empty()) {
|
||||
MS_LOG(ERROR) << "Batch size and image size can't be set at the same time.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
MS_CHECK_TRUE_MSG(inputs != nullptr, lite::RET_ERROR, "Inputs is nullptr.");
|
||||
if (!acl_options_->batch_size.empty()) {
|
||||
if (AddBatchSizeInput(inputs) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Add batch size input failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
}
|
||||
if (!acl_options_->image_size.empty()) {
|
||||
if (AddImageSizeInput(inputs) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Add Image size input failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
}
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
int DynShapeProcess::AddBatchSizeInput(std::vector<KernelTensorPtr> *const inputs) {
|
||||
int32_t *batch_size_addr = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t)));
|
||||
if (batch_size_addr == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc batch size failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
if (GetRealBatchSize(inputs, batch_size_addr) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Get real batch size failed.";
|
||||
free(batch_size_addr);
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
auto batch_size_ptr = std::make_shared<Address>(batch_size_addr, sizeof(int32_t));
|
||||
if (batch_size_ptr == nullptr) {
|
||||
MS_LOG(ERROR) << "Create Address failed.";
|
||||
free(batch_size_addr);
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
auto tensor_ptr = std::make_shared<KernelTensor>();
|
||||
if (tensor_ptr == nullptr) {
|
||||
MS_LOG(ERROR) << "Create KernelTensor failed.";
|
||||
free(batch_size_addr);
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
|
||||
tensor_ptr->SetData(batch_size_ptr);
|
||||
inputs->emplace_back(tensor_ptr);
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
int DynShapeProcess::AddImageSizeInput(std::vector<KernelTensorPtr> *const inputs) {
|
||||
int32_t *image_size_addr = reinterpret_cast<int32_t *>(malloc(kImageSizeHwNum * sizeof(int32_t)));
|
||||
if (image_size_addr == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc image size failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
if (GetRealImageSize(inputs, image_size_addr, kImageSizeHwNum) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Get real image size failed.";
|
||||
free(image_size_addr);
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
auto image_size_ptr = std::make_shared<Address>(image_size_addr, kImageSizeHwNum * sizeof(int32_t));
|
||||
if (image_size_ptr == nullptr) {
|
||||
MS_LOG(ERROR) << "Create Address failed.";
|
||||
free(image_size_addr);
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
auto tensor_ptr = std::make_shared<KernelTensor>();
|
||||
if (tensor_ptr == nullptr) {
|
||||
MS_LOG(ERROR) << "Create KernelTensor failed.";
|
||||
free(image_size_addr);
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
|
||||
tensor_ptr->SetData(image_size_ptr);
|
||||
inputs->emplace_back(tensor_ptr);
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
int DynShapeProcess::GetRealBatchSize(std::vector<KernelTensorPtr> *const inputs, int32_t *batch_size) {
|
||||
MS_CHECK_TRUE_MSG(batch_size != nullptr, lite::RET_ERROR, "Batch size ptr is nullptr.");
|
||||
if (input_data_idx_ >= inputs->size()) {
|
||||
MS_LOG(ERROR) << " Input data index " << input_data_idx_ << " is larger than input size " << inputs->size();
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
auto tensor = (*inputs)[input_data_idx_];
|
||||
std::vector<int64_t> shape = tensor->GetShapeVector();
|
||||
if (shape.empty()) {
|
||||
MS_LOG(ERROR) << "Shape is empty, input index = " << input_data_idx_;
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
int32_t cur_batch_size = static_cast<uint64_t>(shape[0]);
|
||||
auto iter = acl_options_->batch_size.find(cur_batch_size);
|
||||
if (iter == acl_options_->batch_size.end()) {
|
||||
MS_LOG(ERROR) << "Current batch size " << cur_batch_size << " is invalid, please check device info of context";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
*batch_size = cur_batch_size;
|
||||
MS_LOG(DEBUG) << "Current batch size " << cur_batch_size;
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
int DynShapeProcess::GetRealImageSize(std::vector<KernelTensorPtr> *const inputs, int32_t *image_size, int32_t num) {
|
||||
MS_CHECK_TRUE_MSG(image_size != nullptr, lite::RET_ERROR, "Image size ptr is nullptr.");
|
||||
if (input_data_idx_ >= inputs->size()) {
|
||||
MS_LOG(ERROR) << "Input data index " << input_data_idx_ << " is larger than input size " << inputs->size();
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
auto tensor = (*inputs)[input_data_idx_];
|
||||
std::vector<int64_t> shape = tensor->GetShapeVector();
|
||||
if (shape.size() != kInputDimNum) {
|
||||
MS_LOG(ERROR) << "Shape size " << shape.size() << " is invalid, input index = " << input_data_idx_;
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
auto format = tensor->GetFormat();
|
||||
uint64_t height;
|
||||
uint64_t width;
|
||||
if (format == mindspore::Format::NHWC) {
|
||||
height = shape[kNHWCHeightIdx];
|
||||
width = shape[kNHWCWidthIdx];
|
||||
} else {
|
||||
height = shape[kNCHWHeightIdx];
|
||||
width = shape[kNCHWWidthIdx];
|
||||
}
|
||||
auto cur_image_size = std::pair<int32_t, int32_t>(static_cast<uint64_t>(height), static_cast<uint64_t>(width));
|
||||
auto iter = acl_options_->image_size.find(cur_image_size);
|
||||
if (iter == acl_options_->image_size.end()) {
|
||||
MS_LOG(ERROR) << "Image size height " << height << ",weight " << width
|
||||
<< " is invalid, please check device info of context.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
if (num != kImageSizeHwNum) {
|
||||
MS_LOG(ERROR) << "The hw num should be " << kImageSizeHwNum << ",real num " << num;
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
image_size[0] = height;
|
||||
image_size[1] = width;
|
||||
MS_LOG(DEBUG) << "Current height " << height << " width " << width;
|
||||
return lite::RET_OK;
|
||||
}
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright 2021-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_DYN_SHAPE_PROCESS_H
|
||||
#define MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_DYN_SHAPE_PROCESS_H
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "extendrt/kernel/ascend/options/acl_model_options.h"
|
||||
#include "kernel/kernel.h"
|
||||
#include "include/api/types.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
// Appends the extra "dynamic dimension" input (real batch size or image H/W)
// that an ACL offline model converted with dynamic shape support expects at
// execute time.
class DynShapeProcess {
 public:
  // options: shared ACL model options.
  // input_data_idx: index of the data input whose real batch / image size is
  // to be reported to the model.
  explicit DynShapeProcess(const AclModelOptionsPtr &options, size_t input_data_idx)
      : acl_options_(options), input_data_idx_(input_data_idx) {}

  // Entry point: augments *inputs with the dynamic-dimension tensor when the
  // model requires one. Returns a lite error code.
  int ProcDynamicInput(std::vector<KernelTensorPtr> *const inputs);

 private:
  // Appends the dynamic batch-size input to *inputs.
  int AddBatchSizeInput(std::vector<KernelTensorPtr> *const inputs);
  // Appends the dynamic image-size (H/W) input to *inputs.
  int AddImageSizeInput(std::vector<KernelTensorPtr> *const inputs);
  // Queries the real batch size from *inputs into *batch_size.
  int GetRealBatchSize(std::vector<KernelTensorPtr> *const inputs, int32_t *batch_size);
  // Queries num real image-size values from *inputs into image_size.
  int GetRealImageSize(std::vector<KernelTensorPtr> *const inputs, int32_t *image_size, int32_t num);

  AclModelOptionsPtr acl_options_;
  size_t input_data_idx_;  // index of the data input used to derive sizes
};

using DynShapeProcPtr = std::shared_ptr<DynShapeProcess>;
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
||||
#endif // MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_DYN_SHAPE_PROCESS_H
|
|
@ -0,0 +1,170 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "extendrt/kernel/ascend/model/model_infer.h"
|
||||
#include "common/log_adapter.h"
|
||||
#include "acl/acl.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
// Stores the serialized OM model and its options; actual ACL resource
// acquisition is deferred to Init() / Load().
ModelInfer::ModelInfer(const Buffer &om_data, const AclModelOptionsPtr &options)
    : init_flag_(false),
      load_flag_(false),
      device_type_("AscendCL"),
      context_(nullptr),
      om_data_(om_data),
      options_(options),
      model_process_(options),
      acl_env_(nullptr) {}
|
||||
|
||||
// Acquires the ACL runtime resources for this model: the process-wide ACL
// env, the device, and a context; also detects whether we run directly on
// device. Idempotent: a second call is a no-op.
// Fix: failure paths now roll back partially acquired resources (device /
// context). Previously they leaked, because Finalize() refuses to run while
// init_flag_ is still false.
STATUS ModelInfer::Init() {
  if (init_flag_) {
    MS_LOG(INFO) << "Acl has been initialized, skip.";
    return lite::RET_OK;
  }
  if (options_ == nullptr) {
    MS_LOG(ERROR) << "Acl options is nullptr.";
    return lite::RET_ERROR;
  }
  acl_env_ = AclEnvGuard::GetAclEnv(options_->dump_cfg_path);
  if (acl_env_ == nullptr) {
    MS_LOG(ERROR) << "Acl init failed.";
    return lite::RET_ERROR;
  }
  int32_t device_id = options_->device_id;
  aclError ret = aclrtSetDevice(device_id);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Acl open device " << device_id << " failed.";
    return lite::RET_ERROR;
  }
  MS_LOG(INFO) << "Open device " << device_id << " success.";

  ret = aclrtCreateContext(&context_, device_id);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Acl create context failed.";
    // Release the device opened above; Finalize() will not run for us.
    (void)aclrtResetDevice(device_id);
    return lite::RET_ERROR;
  }
  MS_LOG(INFO) << "Create context success.";

  aclrtRunMode run_mode;
  ret = aclrtGetRunMode(&run_mode);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Acl get run mode failed.";
    // Roll back the partially acquired context and device.
    (void)aclrtDestroyContext(context_);
    context_ = nullptr;
    (void)aclrtResetDevice(device_id);
    return lite::RET_ERROR;
  }
  // ACL_DEVICE means host buffers are directly visible to the model, so
  // ModelProcess can skip host<->device copies.
  bool is_device = (run_mode == ACL_DEVICE);
  model_process_.SetIsDevice(is_device);
  MS_LOG(INFO) << "Get run mode success is device input/output " << is_device;

  MS_LOG(INFO) << "Init model success, device id " << device_id;
  init_flag_ = true;
  return lite::RET_OK;
}
|
||||
|
||||
// Releases everything acquired by Init()/Load(): unloads the model, destroys
// the context and resets the device. Safe to call when Init() never
// succeeded (logs a warning and returns OK).
STATUS ModelInfer::Finalize() {
  if (!init_flag_) {
    MS_LOG(WARNING) << "Init is not ok, no need to finalize.";
    return lite::RET_OK;
  }

  // Bind our context to this thread before touching model/device resources.
  aclError rt_ret = aclrtSetCurrentContext(context_);
  if (rt_ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Set the ascend device context failed.";
    return lite::RET_ERROR;
  }
  if (load_flag_) {
    auto ret = model_process_.UnLoad();
    if (ret != lite::RET_OK) {
      MS_LOG(ERROR) << "Unload model inner failed.";
      return ret;
    }
  }
  if (context_ != nullptr) {
    rt_ret = aclrtDestroyContext(context_);
    if (rt_ret != ACL_ERROR_NONE) {
      // Best effort: keep going so the device below is still reset.
      MS_LOG(ERROR) << "Destroy context failed.";
    }
    context_ = nullptr;
  }
  MS_LOG(INFO) << "End to destroy context.";

  // Device reset failure is logged but does not abort finalization.
  rt_ret = aclrtResetDevice(options_->device_id);
  if (rt_ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Reset device " << options_->device_id << " failed.";
  }
  MS_LOG(INFO) << "End to reset device " << options_->device_id;
  init_flag_ = false;
  load_flag_ = false;
  return lite::RET_OK;
}
|
||||
|
||||
// Lazily loads the OM model on first use, then binds the ACL context to the
// calling thread so subsequent ACL calls target the right device.
STATUS ModelInfer::Load() {
  if (!load_flag_) {
    auto load_ret = LoadAclModel(om_data_);
    if (load_ret != lite::RET_OK) {
      MS_LOG(ERROR) << "Load model model failed.";
      return load_ret;
    }
    load_flag_ = true;
  }

  auto ctx_ret = aclrtSetCurrentContext(context_);
  if (ctx_ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Set the ascend device context failed, ret = " << ctx_ret;
    return lite::RET_ERROR;
  }

  return lite::RET_OK;
}
|
||||
|
||||
// Loads the serialized OM model into the runtime and pre-initializes its I/O
// resources. On resource-init failure the freshly loaded model is unloaded
// again so no model id leaks.
// Fix: de-duplicated the garbled "load model model" log messages.
STATUS ModelInfer::LoadAclModel(const Buffer &om_data) {
  MS_LOG(INFO) << "Start to load model.";
  // Load the model from the in-memory OM buffer.
  uint32_t acl_model_id;
  auto acl_ret = aclmdlLoadFromMem(om_data.Data(), om_data.DataSize(), &acl_model_id);
  if (acl_ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Call aclmdlLoadFromMem failed, ret = " << acl_ret;
    return lite::RET_ERROR;
  }

  // Init per-model resources (model desc, input/output buffers).
  model_process_.set_model_id(acl_model_id);
  int ret = model_process_.PreInitModelResource();
  if (ret != lite::RET_OK) {
    (void)aclmdlUnload(acl_model_id);
    MS_LOG(ERROR) << "Pre init model resource failed.";
    return ret;
  }

  MS_LOG(INFO) << "Load model success.";
  return lite::RET_OK;
}
|
||||
|
||||
// Ensures the model is loaded and the context is current, then delegates the
// actual execution to ModelProcess.
STATUS ModelInfer::Inference(const std::vector<KernelTensorPtr> &inputs, const std::vector<KernelTensorPtr> &outputs) {
  auto prepare_ret = Load();
  if (prepare_ret != lite::RET_OK) {
    MS_LOG(ERROR) << "Prepare model resource failed.";
    return lite::RET_ERROR;
  }
  return model_process_.PredictFromHost(inputs, outputs);
}
|
||||
|
||||
// Delegates to ModelProcess; valid only after the model has been loaded.
std::set<uint64_t> ModelInfer::GetDynamicBatch() { return model_process_.GetDynamicBatch(); }

// need to be called after model load;
std::set<std::pair<uint64_t, uint64_t>> ModelInfer::GetDynamicImage() { return model_process_.GetDynamicImage(); }
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,65 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_MODEL_INFER_H_
|
||||
#define MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_MODEL_INFER_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "extendrt/kernel/ascend/model/model_process.h"
|
||||
#include "extendrt/kernel/ascend/model/acl_env_guard.h"
|
||||
#include "extendrt/kernel/ascend/options/acl_model_options.h"
|
||||
#include "include/api/types.h"
|
||||
#include "include/errorcode.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
using mindspore::lite::STATUS;
|
||||
|
||||
// Wraps the full lifecycle of one ACL offline (OM) model: device/context
// initialization, lazy model load, execution and teardown.
class ModelInfer {
 public:
  ModelInfer(const Buffer &om_data, const AclModelOptionsPtr &options);
  ~ModelInfer() = default;

  STATUS Init();      // acquire device + context (skips if already done)
  STATUS Finalize();  // unload model, destroy context, reset device
  STATUS Load();      // load OM model on first call, bind context to thread
  STATUS Inference(const std::vector<KernelTensorPtr> &inputs, const std::vector<KernelTensorPtr> &outputs);
  // need to be called after model load
  std::set<uint64_t> GetDynamicBatch();
  // need to be called after model load
  std::set<std::pair<uint64_t, uint64_t>> GetDynamicImage();

 private:
  STATUS LoadAclModel(const Buffer &om_data);

  bool init_flag_;           // Init() completed successfully
  bool load_flag_;           // Load() completed successfully
  std::string device_type_;
  aclrtContext context_;
  Buffer om_data_;           // serialized OM model
  AclModelOptionsPtr options_;
  ModelProcess model_process_;
  std::shared_ptr<AclEnvGuard> acl_env_;  // holds the shared ACL env guard
};

using ModelInferPtr = std::shared_ptr<ModelInfer>;
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
||||
#endif // MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_MODEL_INFER_H_
|
|
@ -0,0 +1,642 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "extendrt/kernel/ascend/model/model_process.h"
|
||||
#include <sys/time.h>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include "common/log_adapter.h"
|
||||
#include "src/common/utils.h"
|
||||
#include "src/common/log_util.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
namespace {
// Expected element counts of the trailing dynamic-dimension tensor.
constexpr size_t kBatchSizeNum = 1;    // dynamic batch: a single int32 value
constexpr size_t kImageSizeHwNum = 2;  // dynamic image: height and width
}  // namespace
|
||||
// Maps an ACL element type onto the corresponding MindSpore TypeId; unknown
// ACL types map to kNumberTypeEnd.
static TypeId TransToDataType(aclDataType data_type) {
  static const std::map<aclDataType, enum TypeId> kAclToMsType = {
    {ACL_FLOAT16, TypeId::kNumberTypeFloat16}, {ACL_FLOAT, TypeId::kNumberTypeFloat32},
    {ACL_DOUBLE, TypeId::kNumberTypeFloat64}, {ACL_INT8, TypeId::kNumberTypeInt8},
    {ACL_INT16, TypeId::kNumberTypeInt16}, {ACL_INT32, TypeId::kNumberTypeInt32},
    {ACL_INT64, TypeId::kNumberTypeInt64}, {ACL_UINT8, TypeId::kNumberTypeUInt8},
    {ACL_UINT16, TypeId::kNumberTypeUInt16}, {ACL_UINT32, TypeId::kNumberTypeUInt32},
    {ACL_UINT64, TypeId::kNumberTypeUInt64}, {ACL_BOOL, TypeId::kNumberTypeBool},
  };
  const auto iter = kAclToMsType.find(data_type);
  return iter == kAclToMsType.end() ? TypeId::kNumberTypeEnd : iter->second;
}
|
||||
|
||||
// Clears *vec when vec is a valid pointer; a null pointer is a no-op.
template <class T>
inline static void ClearIfNotNull(T *vec) {
  if (vec == nullptr) {
    return;
  }
  vec->clear();
}
|
||||
|
||||
// Appends item to *vec when vec is a valid pointer; a null pointer is a
// no-op. U defaults to std::vector<T>.
template <class T, class U = std::vector<T>>
inline static void PushbackIfNotNull(U *vec, T &&item) {
  if (vec == nullptr) {
    return;
  }
  vec->emplace_back(item);
}
|
||||
|
||||
// Flattens the per-tensor metadata in acl_tensor_list into parallel vectors.
// Any of the output pointers may be null, in which case that field is
// skipped.
// Fix: the final consistency check used to dereference all four pointers
// unconditionally, crashing whenever a caller legitimately passed nullptr.
static STATUS ConstructTensorDesc(const std::vector<AclTensorInfo> &acl_tensor_list, std::vector<std::string> *names,
                                  std::vector<std::vector<int64_t>> *shapes, std::vector<enum TypeId> *data_types,
                                  std::vector<size_t> *mem_sizes) {
  ClearIfNotNull(names);
  ClearIfNotNull(shapes);
  ClearIfNotNull(data_types);
  ClearIfNotNull(mem_sizes);
  for (size_t i = 0; i < acl_tensor_list.size(); ++i) {
    const auto &info = acl_tensor_list[i];
    PushbackIfNotNull(names, info.name);
    PushbackIfNotNull(shapes, info.dims);
    PushbackIfNotNull(data_types, TransToDataType(info.data_type));
    PushbackIfNotNull(mem_sizes, info.buffer_size);
  }

  // Only verify the containers the caller actually asked for.
  const size_t expected = acl_tensor_list.size();
  if ((names != nullptr && names->size() != expected) || (shapes != nullptr && shapes->size() != expected) ||
      (data_types != nullptr && data_types->size() != expected) ||
      (mem_sizes != nullptr && mem_sizes->size() != expected)) {
    MS_LOG(ERROR) << "Inner error, size do not match: names size " << (names == nullptr ? 0 : names->size())
                  << " shapes size " << (shapes == nullptr ? 0 : shapes->size()) << " data types size "
                  << (data_types == nullptr ? 0 : data_types->size()) << " mem sizes size "
                  << (mem_sizes == nullptr ? 0 : mem_sizes->size()) << " acl_tensor_list size "
                  << acl_tensor_list.size();
    return lite::RET_ERROR;
  }

  return lite::RET_OK;
}
|
||||
|
||||
// Renders a shape as "[d0, d1, ...]" for log messages.
static std::string ShapeToString(const std::vector<int64_t> &shape) {
  std::string text("[");
  bool first = true;
  for (const auto dim : shape) {
    if (!first) {
      text += ", ";
    }
    text += std::to_string(dim);
    first = false;
  }
  text += "]";
  return text;
}
|
||||
|
||||
// Builds the model description for model_id_ and prepares the reusable
// input/output buffers. Must be called once right after the model is loaded.
// Fix: aclmdlCreateDesc() result is now null-checked before use.
STATUS ModelProcess::PreInitModelResource() {
  model_desc_ = aclmdlCreateDesc();
  if (model_desc_ == nullptr) {
    MS_LOG(ERROR) << "Create model desc failed.";
    return lite::RET_ERROR;
  }
  aclError acl_ret = aclmdlGetDesc(model_desc_, model_id_);
  if (acl_ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Read model desc failed, ret = " << acl_ret;
    return lite::RET_ERROR;
  }
  STATUS ret = InitInputsBuffer();
  if (ret != lite::RET_OK) {
    MS_LOG(ERROR) << "Create input buffer failed.";
    return ret;
  }
  ret = InitOutputsBuffer();
  if (ret != lite::RET_OK) {
    MS_LOG(ERROR) << "Create output buffer failed.";
    return ret;
  }
  return lite::RET_OK;
}
|
||||
|
||||
std::set<uint64_t> ModelProcess::GetDynamicBatch() {
|
||||
if (model_desc_ == nullptr) {
|
||||
MS_LOG(ERROR) << " Model desc is nullptr.";
|
||||
return std::set<uint64_t>();
|
||||
}
|
||||
aclmdlBatch dynamic_batch;
|
||||
if (aclmdlGetDynamicBatch(model_desc_, &dynamic_batch) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed to get dynamic batch.";
|
||||
return std::set<uint64_t>();
|
||||
}
|
||||
size_t batch_count = dynamic_batch.batchCount;
|
||||
if (batch_count > ACL_MAX_BATCH_NUM) {
|
||||
MS_LOG(ERROR) << "Real batch count " << batch_count << " is larger than max " << ACL_MAX_BATCH_NUM;
|
||||
return std::set<uint64_t>();
|
||||
}
|
||||
std::set<uint64_t> batch;
|
||||
for (size_t i = 0; i < dynamic_batch.batchCount; ++i) {
|
||||
batch.insert(dynamic_batch.batch[i]);
|
||||
}
|
||||
return batch;
|
||||
}
|
||||
|
||||
std::set<std::pair<uint64_t, uint64_t>> ModelProcess::GetDynamicImage() {
|
||||
if (model_desc_ == nullptr) {
|
||||
MS_LOG(ERROR) << " Model desc is nullptr.";
|
||||
return std::set<std::pair<uint64_t, uint64_t>>();
|
||||
}
|
||||
aclmdlHW dynamic_hw;
|
||||
if (aclmdlGetDynamicHW(model_desc_, 0, &dynamic_hw) != ACL_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Failed to get dynamic hw.";
|
||||
return std::set<std::pair<uint64_t, uint64_t>>();
|
||||
}
|
||||
size_t hw_count = dynamic_hw.hwCount;
|
||||
if (hw_count > ACL_MAX_HW_NUM) {
|
||||
MS_LOG(ERROR) << "Real hw count " << hw_count << " is larger than max " << ACL_MAX_HW_NUM;
|
||||
return std::set<std::pair<uint64_t, uint64_t>>();
|
||||
}
|
||||
std::set<std::pair<uint64_t, uint64_t>> image;
|
||||
for (size_t i = 0; i < dynamic_hw.hwCount; ++i) {
|
||||
image.insert(std::pair<uint64_t, uint64_t>(dynamic_hw.hw[i][0], dynamic_hw.hw[i][1]));
|
||||
}
|
||||
return image;
|
||||
}
|
||||
|
||||
// Queries every model input from the model description and records its
// metadata in input_infos_. When not running directly on device a staging
// device buffer of the required size is allocated up front and reused for
// every execution.
STATUS ModelProcess::InitInputsBuffer() {
  aclError ret;
  size_t input_size = aclmdlGetNumInputs(model_desc_);
  MS_LOG(INFO) << "input_size = " << input_size;
  for (size_t i = 0; i < input_size; ++i) {
    auto buffer_size = aclmdlGetInputSizeByIndex(model_desc_, i);
    void *data_mem_buffer = nullptr;
    if (!is_run_on_device_) {  // need to copy input/output to/from device
      ret = aclrtMalloc(&data_mem_buffer, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
      if (ret != ACL_ERROR_NONE) {
        MS_LOG(ERROR) << "Malloc device input buffer failed , input size " << buffer_size;
        return lite::RET_ERROR;
      }
    }

    aclmdlIODims dims;
    ret = aclmdlGetInputDims(model_desc_, i, &dims);
    if (ret != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "Get input shape failed, ret = " << ret;
      if (!is_run_on_device_) {
        // Free only this input's buffer; earlier ones stay tracked in
        // input_infos_ and are reclaimed by DestroyInputsBuffer().
        aclrtFree(data_mem_buffer);
      }
      return lite::RET_ERROR;
    }
    aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
    std::vector<int64_t> shape(dims.dims, dims.dims + dims.dimCount);
    // NOTE(review): assumes aclmdlGetInputNameByIndex never returns nullptr
    // (std::string construction from a null pointer is UB) — confirm against
    // the CANN API documentation.
    std::string input_name = aclmdlGetInputNameByIndex(model_desc_, i);
    if (input_name.empty()) {
      MS_LOG(WARNING) << "Get name of input " << i << " failed.";
    }
    MS_LOG(INFO) << "Name of input " << i << " is " << input_name;
    // device_data and cur_device_data both start at the freshly allocated
    // buffer (nullptr when running on device).
    input_infos_.emplace_back(
      AclTensorInfo{data_mem_buffer, data_mem_buffer, buffer_size, data_type, shape, input_name});
  }
  MS_LOG(INFO) << "Create model inputs success";
  return lite::RET_OK;
}
|
||||
|
||||
// Allocates a buffer of buffer_size bytes — device memory when host<->device
// copies are needed, host memory when running directly on device — and
// registers it with the given acl dataset. On any failure everything
// allocated inside this call is released.
STATUS ModelProcess::CreateDataBuffer(void **data_mem_buffer, size_t buffer_size, aclmdlDataset *dataset) {
  if (data_mem_buffer == nullptr) {
    MS_LOG(ERROR) << "Data mem buffer is nullptr.";
    return lite::RET_ERROR;
  }
  aclError ret;
  // Frees with the allocator that matches how the memory was obtained below.
  auto free_data_buffer = [this](void *dataMemBuffer) {
    if (!is_run_on_device_) {
      (void)aclrtFree(dataMemBuffer);
    } else {
      (void)aclrtFreeHost(dataMemBuffer);
    }
  };

  if (!is_run_on_device_) {
    ret = aclrtMalloc(data_mem_buffer, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
    if (ret != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "Malloc device buffer failed , buffer size " << buffer_size;
      return lite::RET_ERROR;
    }
  } else {
    ret = aclrtMallocHost(data_mem_buffer, buffer_size);
    if (ret != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "Malloc host buffer failed , buffer size " << buffer_size;
      return lite::RET_ERROR;
    }
  }

  auto data_buffer = aclCreateDataBuffer(*data_mem_buffer, buffer_size);
  if (data_buffer == nullptr) {
    MS_LOG(ERROR) << "Create Data Buffer failed";
    free_data_buffer(*data_mem_buffer);
    return lite::RET_ERROR;
  }
  ret = aclmdlAddDatasetBuffer(dataset, data_buffer);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "add data buffer failed";
    free_data_buffer(*data_mem_buffer);
    aclDestroyDataBuffer(data_buffer);
    return lite::RET_ERROR;
  }
  return lite::RET_OK;
}
|
||||
|
||||
// Creates the acl output dataset and, for every model output, allocates a
// backing buffer and records its metadata in output_infos_.
// Fix: the aclmdlGetOutputDims failure path used to return RET_OK, letting
// callers continue with an incomplete output_infos_; it now returns
// RET_ERROR like every other failure here.
STATUS ModelProcess::InitOutputsBuffer() {
  aclError ret;
  outputs_ = aclmdlCreateDataset();
  if (outputs_ == nullptr) {
    MS_LOG(ERROR) << "Create output dataset failed";
    return lite::RET_ERROR;
  }
  size_t output_size = aclmdlGetNumOutputs(model_desc_);
  MS_LOG(INFO) << "Output_size = " << output_size;
  for (size_t i = 0; i < output_size; ++i) {
    auto buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i);

    void *data_mem_buffer = nullptr;
    if (CreateDataBuffer(&data_mem_buffer, buffer_size, outputs_) != lite::RET_OK) {
      MS_LOG(ERROR) << "Add output data buffer failed, buffer size " << buffer_size;
      return lite::RET_ERROR;
    }
    aclmdlIODims dims;
    ret = aclmdlGetOutputDims(model_desc_, i, &dims);
    if (ret != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "Get output shape failed";
      // Release this output's buffer with the matching allocator.
      if (!is_run_on_device_) {
        aclrtFree(data_mem_buffer);
      } else {
        aclrtFreeHost(data_mem_buffer);
      }
      return lite::RET_ERROR;
    }
    aclFormat format = aclmdlGetOutputFormat(model_desc_, i);
    MS_LOG(DEBUG) << "The output format of om is " << format;
    aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
    std::vector<int64_t> shape(dims.dims, dims.dims + dims.dimCount);
    std::string output_name = aclmdlGetOutputNameByIndex(model_desc_, i);
    if (output_name.empty()) {
      MS_LOG(WARNING) << "Get name of output " << i << " failed.";
    }
    MS_LOG(INFO) << "Name of om output " << i << " is " << output_name << "Buffer size " << buffer_size;
    output_infos_.emplace_back(
      AclTensorInfo{data_mem_buffer, data_mem_buffer, buffer_size, data_type, shape, output_name});
  }
  MS_LOG(INFO) << "Create model output success.";
  return lite::RET_OK;
}
|
||||
|
||||
void ModelProcess::DestroyInputsDataset() {
|
||||
if (inputs_ == nullptr) {
|
||||
return;
|
||||
}
|
||||
for (size_t i = 0; i < aclmdlGetDatasetNumBuffers(inputs_); i++) {
|
||||
auto dataBuffer = aclmdlGetDatasetBuffer(inputs_, i);
|
||||
aclDestroyDataBuffer(dataBuffer);
|
||||
}
|
||||
aclmdlDestroyDataset(inputs_);
|
||||
inputs_ = nullptr;
|
||||
}
|
||||
|
||||
void ModelProcess::DestroyInputsDataMem() {
|
||||
if (!is_run_on_device_) {
|
||||
for (const auto &item : input_infos_) {
|
||||
aclrtFree(item.device_data);
|
||||
}
|
||||
}
|
||||
input_infos_.clear();
|
||||
}
|
||||
|
||||
// Frees the per-input device memory first, then the acl input dataset.
void ModelProcess::DestroyInputsBuffer() {
  DestroyInputsDataMem();
  DestroyInputsDataset();
}
|
||||
|
||||
void ModelProcess::DestroyOutputsBuffer() {
|
||||
for (const auto &item : output_infos_) {
|
||||
if (!is_run_on_device_) {
|
||||
aclrtFree(item.device_data);
|
||||
} else {
|
||||
aclrtFreeHost(item.device_data);
|
||||
}
|
||||
}
|
||||
output_infos_.clear();
|
||||
|
||||
if (outputs_ == nullptr) {
|
||||
return;
|
||||
}
|
||||
for (size_t i = 0; i < aclmdlGetDatasetNumBuffers(outputs_); i++) {
|
||||
auto dataBuffer = aclmdlGetDatasetBuffer(outputs_, i);
|
||||
aclDestroyDataBuffer(dataBuffer);
|
||||
}
|
||||
aclmdlDestroyDataset(outputs_);
|
||||
outputs_ = nullptr;
|
||||
}
|
||||
|
||||
// Unloads the model from the runtime, destroys the model description and
// releases all input/output buffers.
// Fix: the aclmdlDestroyDesc failure message duplicated the "Unload model
// failed" text, hiding which call actually failed.
STATUS ModelProcess::UnLoad() {
  auto ret = aclmdlUnload(model_id_);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Unload model failed, ret = " << ret;
    return lite::RET_ERROR;
  }
  if (model_desc_ != nullptr) {
    ret = aclmdlDestroyDesc(model_desc_);
    if (ret != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "Destroy model desc failed, ret = " << ret;
      return lite::RET_ERROR;
    }
    model_desc_ = nullptr;
  }
  DestroyInputsBuffer();
  DestroyOutputsBuffer();
  MS_LOG(INFO) << "End unload model " << model_id_;
  return lite::RET_OK;
}
|
||||
|
||||
// Applies the dynamic batch size for the coming execution. The real batch
// value is carried by the last tensor in `inputs` as a single int32
// (presumably appended by DynShapeProcess::AddBatchSizeInput — verify
// against callers).
STATUS ModelProcess::SetBatchSize(const std::vector<KernelTensorPtr> &inputs) {
  // Refresh the expected byte sizes from the caller-provided tensors.
  // NOTE(review): assumes input_infos_.size() >= inputs.size() — confirm.
  for (size_t i = 0; i < inputs.size(); i++) {
    input_infos_[i].buffer_size = inputs[i]->GetData()->size;
  }
  auto batch_size_tensor = inputs[inputs.size() - 1];
  size_t data_type_size = lite::DataTypeSize(batch_size_tensor->GetDtype());
  size_t num = 0;
  if (data_type_size != 0) {
    num = batch_size_tensor->GetData()->size / data_type_size;
  }
  if (num != kBatchSizeNum) {
    MS_LOG(ERROR) << "Batch size num should be " << kBatchSizeNum;
    return lite::RET_ERROR;
  }
  auto *ptr = reinterpret_cast<const int32_t *>(batch_size_tensor->GetData()->addr);
  CHECK_NULL_RETURN(ptr);
  auto batch_size = ptr[0];
  aclError ret;
  size_t index;
  // The dynamic dimension is configured through ACL's reserved input.
  ret = aclmdlGetInputIndexByName(model_desc_, ACL_DYNAMIC_TENSOR_NAME, &index);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Get index failed";
    return lite::RET_ERROR;
  }
  MS_LOG(INFO) << "Set Batch size(" << batch_size << ") of input " << index << ".";
  ret = aclmdlSetDynamicBatchSize(model_id_, inputs_, index, batch_size);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Set dynamic batch size failed, model_id is " << model_id_;
    return lite::RET_ERROR;
  }
  return lite::RET_OK;
}
|
||||
|
||||
// Applies the dynamic image size (height, width) for the coming execution.
// The real H/W pair is carried by the last tensor in `inputs` as two int32
// values.
// Fix: the failure log for aclmdlSetDynamicHWSize said "batch size" —
// copy-paste from SetBatchSize — and now names the right operation.
STATUS ModelProcess::SetImageSize(const std::vector<KernelTensorPtr> &inputs) {
  // Refresh the expected byte sizes from the caller-provided tensors.
  for (size_t i = 0; i < inputs.size(); i++) {
    input_infos_[i].buffer_size = inputs[i]->GetData()->size;
  }
  auto image_size_tensor = inputs[inputs.size() - 1];
  size_t data_type_size = lite::DataTypeSize(image_size_tensor->GetDtype());
  size_t num = 0;
  if (data_type_size != 0) {
    num = image_size_tensor->GetData()->size / data_type_size;
  }
  if (num != kImageSizeHwNum) {
    MS_LOG(ERROR) << "Image size hw num should be " << kImageSizeHwNum;
    return lite::RET_ERROR;
  }
  auto *hw = reinterpret_cast<const int32_t *>(image_size_tensor->GetData()->addr);
  CHECK_NULL_RETURN(hw);
  int32_t height = hw[0];
  int32_t width = hw[1];
  size_t index;
  aclError ret = ACL_ERROR_NONE;
  // The dynamic dimension is configured through ACL's reserved input.
  ret = aclmdlGetInputIndexByName(model_desc_, ACL_DYNAMIC_TENSOR_NAME, &index);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Get index failed";
    return lite::RET_ERROR;
  }
  MS_LOG(INFO) << "Set Image size(" << height << "," << width << ") of input " << index << ".";
  ret = aclmdlSetDynamicHWSize(model_id_, inputs_, index, height, width);
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Set dynamic image size failed, model_id is " << model_id_;
    return lite::RET_ERROR;
  }
  return lite::RET_OK;
}
|
||||
|
||||
// For static-shape models, verifies each user tensor against the model's
// description: shape mismatch is only a warning (DVPP may legally change it),
// data-type and byte-size mismatches are hard errors. Dynamic-shape models
// are validated later when the dynamic dims are applied.
// Fix: guard against fewer user tensors than model inputs before indexing
// tensor[i] (previously a potential out-of-range access).
STATUS ModelProcess::CheckTensorByTensorInfo(const std::vector<KernelTensorPtr> &tensor,
                                             const std::vector<AclTensorInfo> &tensor_info) {
  if (!IsDynamicShape()) {
    if (tensor.size() < tensor_info.size()) {
      MS_LOG(ERROR) << "Tensor count " << tensor.size() << " less than required count " << tensor_info.size();
      return lite::RET_ERROR;
    }
    for (size_t i = 0; i < tensor_info.size(); ++i) {
      if (tensor[i]->GetShapeVector() != tensor_info[i].dims) {
        MS_LOG(WARNING) << "Note: input " << i << " shape not match, required " << ShapeToString(tensor_info[i].dims)
                        << ", given " << ShapeToString(tensor[i]->GetShapeVector()) << "."
                        << "Please check input shape has been modified by DVPP method.";
      }
      if (tensor[i]->GetDtype() != TransToDataType(tensor_info[i].data_type)) {
        MS_LOG(ERROR) << "Note: input " << i << " data type not match, required "
                      << static_cast<int>(TransToDataType(tensor_info[i].data_type)) << ", given "
                      << static_cast<int>(tensor[i]->GetDtype());
        return lite::RET_ERROR;
      }
      if (tensor[i]->GetData()->size != tensor_info[i].buffer_size) {
        MS_LOG(ERROR) << "Input " << i << " data size not match, required size " << tensor_info[i].buffer_size
                      << ", given count " << tensor[i]->GetData()->size;
        return lite::RET_ERROR;
      }
    }
  }
  return lite::RET_OK;
}
|
||||
|
||||
// Applies the dynamic batch / dynamic image settings carried by `inputs`
// and then refreshes the expected output sizes. Models without dynamic
// shape return immediately.
STATUS ModelProcess::ProcDynamicShape(const std::vector<KernelTensorPtr> &inputs) {
  if (!IsDynamicShape()) {
    MS_LOG(DEBUG) << "Input is not dynamic shape";
    return lite::RET_OK;
  }
  if (IsDynamicBatchSize() && SetBatchSize(inputs) != lite::RET_OK) {
    MS_LOG(ERROR) << "Set dynamic batch size failed.";
    return lite::RET_ERROR;
  }
  if (IsDynamicImageSize() && SetImageSize(inputs) != lite::RET_OK) {
    MS_LOG(ERROR) << "Set dynamic image size failed.";
    return lite::RET_ERROR;
  }
  // Output shapes depend on the dynamic dims just set.
  if (ResetOutputSize() != lite::RET_OK) {
    MS_LOG(ERROR) << "Reset output size failed";
    return lite::RET_ERROR;
  }
  return lite::RET_OK;
}
|
||||
|
||||
// A model is dynamic if it supports either dynamic batch or dynamic image size.
bool ModelProcess::IsDynamicShape() { return IsDynamicBatchSize() || IsDynamicImageSize(); }

// True when the model description advertises at least one dynamic batch value.
bool ModelProcess::IsDynamicBatchSize() { return !GetDynamicBatch().empty(); }

// True when the model description advertises at least one dynamic H/W pair.
bool ModelProcess::IsDynamicImageSize() { return !GetDynamicImage().empty(); }
|
||||
|
||||
// Builds the acl input dataset for one execution: validates the user tensors
// against the model description, stages host data into the preallocated
// device buffers (skipped when running directly on device), and applies
// dynamic-shape settings. The dataset is torn down by DestroyInputsDataset()
// after the execute call.
STATUS ModelProcess::CheckAndInitInput(const std::vector<KernelTensorPtr> &inputs) {
  aclError ret;
  inputs_ = aclmdlCreateDataset();
  // check inputs
  if (CheckTensorByTensorInfo(inputs, input_infos_) != lite::RET_OK) {
    MS_LOG(ERROR) << "Check input tensor failed.";
    return lite::RET_ERROR;
  }
  // copy inputs
  for (size_t i = 0; i < input_infos_.size(); ++i) {
    auto &info = input_infos_[i];
    auto input = inputs[i];
    void *data = input->GetData()->addr;
    void *input_buffer = nullptr;
    if (!is_run_on_device_) {
      // Stage the host data into this input's reusable device buffer.
      info.cur_device_data = info.device_data;
      ret =
        aclrtMemcpy(info.cur_device_data, info.buffer_size, data, input->GetData()->size, ACL_MEMCPY_HOST_TO_DEVICE);
      if (ret != ACL_ERROR_NONE) {
        MS_LOG(ERROR) << "Acl memcpy input " << i
                      << " data to device failed, src input size: " << input->GetData()->size
                      << ", dst device buffer size: " << info.buffer_size;
        return lite::RET_ERROR;
      }
      input_buffer = info.cur_device_data;
    } else {
      // On-device run: the model can read the caller's buffer directly.
      input_buffer = data;
    }
    auto data_buffer = aclCreateDataBuffer(input_buffer, info.buffer_size);
    if (data_buffer == nullptr) {
      MS_LOG(ERROR) << "Create Data Buffer failed";
      return lite::RET_ERROR;
    }
    ret = aclmdlAddDatasetBuffer(inputs_, data_buffer);
    if (ret != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "Add data buffer failed";
      aclDestroyDataBuffer(data_buffer);
      return lite::RET_ERROR;
    }
  }
  if (ProcDynamicShape(inputs) != lite::RET_OK) {
    MS_LOG(ERROR) << "Proc input dynamic shape failed.";
    return lite::RET_ERROR;
  }
  return lite::RET_OK;
}
|
||||
|
||||
// Re-reads the now-resolved output shapes (after the dynamic dims have been
// fixed for this run) and updates dims / buffer_size in output_infos_.
STATUS ModelProcess::ResetOutputSize() {
  aclDataType output_type;
  aclError ret;
  size_t output_size = aclmdlGetNumOutputs(model_desc_);
  for (size_t index = 0; index < output_size; index++) {
    struct aclmdlIODims output_dims;
    ret = aclmdlGetCurOutputDims(model_desc_, index, &output_dims);
    if (ret != ACL_ERROR_NONE) {
      MS_LOG(ERROR) << "get output dim error.";
      return lite::RET_ERROR;
    }
    std::vector<int64_t> shape(output_dims.dims, output_dims.dims + output_dims.dimCount);
    size_t element_count = 1;
    for (size_t i = 0; i < output_dims.dimCount; i++) {
      element_count *= output_dims.dims[i];
    }
    output_type = aclmdlGetOutputDataType(model_desc_, index);
    output_infos_[index].dims = shape;
    output_infos_[index].buffer_size = element_count * aclDataTypeSize(output_type);
  }
  return lite::RET_OK;
}
|
||||
|
||||
// Runs one inference: stages the inputs, executes the model (wall-clock
// timed when GLOG_v==1), then builds the caller-visible outputs. The per-run
// input dataset is destroyed on every path.
STATUS ModelProcess::PredictFromHost(const std::vector<KernelTensorPtr> &inputs,
                                     const std::vector<KernelTensorPtr> &outputs) {
  STATUS ret = CheckAndInitInput(inputs);
  if (ret != lite::RET_OK) {
    MS_LOG(ERROR) << "Check or init input failed";
    DestroyInputsDataset();
    return ret;  // forward status error
  }

  aclError acl_ret;
  // With verbose logging enabled, wrap the execute call with timing.
  auto env = std::getenv("GLOG_v");
  if (env != nullptr && env[0] == '1') {
    struct timeval start_time;
    struct timeval end_time;
    (void)gettimeofday(&start_time, nullptr);
    acl_ret = aclmdlExecute(model_id_, inputs_, outputs_);
    (void)gettimeofday(&end_time, nullptr);
    constexpr uint64_t kUSecondInSecond = 1000000;
    uint64_t cost =
      (kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec) + static_cast<uint64_t>(end_time.tv_usec)) -
      (kUSecondInSecond * static_cast<uint64_t>(start_time.tv_sec) + static_cast<uint64_t>(start_time.tv_usec));
    MS_LOG(INFO) << "Model execute in " << cost << " us";
  } else {
    acl_ret = aclmdlExecute(model_id_, inputs_, outputs_);
  }

  // The input dataset is rebuilt per call, so release it regardless of result.
  DestroyInputsDataset();
  if (acl_ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Execute Model Failed, ret = " << acl_ret;
    return lite::RET_ERROR;
  }
  ret = GetOutputs(outputs);
  if (ret != lite::RET_OK) {
    MS_LOG(ERROR) << "Build outputs failed";
    return ret;
  }
  MS_LOG(INFO) << "Execute model success";
  return lite::RET_OK;
}
|
||||
|
||||
STATUS ModelProcess::GetOutputs(const std::vector<KernelTensorPtr> &outputs) {
|
||||
if (outputs.empty()) {
|
||||
MS_LOG(ERROR) << "Ms tensor outputs is empty.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
|
||||
if (ConstructTensor(outputs) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Construct ms tensor failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
STATUS ModelProcess::ConstructTensor(const std::vector<KernelTensorPtr> &outputs) {
|
||||
if (outputs.size() != output_infos_.size()) {
|
||||
MS_LOG(ERROR) << "Actual tensor count not match, required count " << output_infos_.size() << ", given count "
|
||||
<< outputs.size();
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
std::vector<std::string> names;
|
||||
std::vector<std::vector<int64_t>> shapes;
|
||||
std::vector<enum TypeId> data_types;
|
||||
std::vector<size_t> mem_sizes;
|
||||
if (ConstructTensorDesc(output_infos_, &names, &shapes, &data_types, &mem_sizes) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Construct tensor desc failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
// set output info and malloc data size
|
||||
for (size_t i = 0; i < output_infos_.size(); ++i) {
|
||||
if (outputs[i]->GetData()->size != mem_sizes[i]) {
|
||||
MS_LOG(ERROR) << "Ms tensor size " << outputs[i]->GetData()->size << " not match model tensor size "
|
||||
<< mem_sizes[i];
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
}
|
||||
aclrtMemcpyKind kind = is_run_on_device_ ? ACL_MEMCPY_HOST_TO_HOST : ACL_MEMCPY_DEVICE_TO_HOST;
|
||||
for (size_t i = 0; i < output_infos_.size(); ++i) {
|
||||
if (output_infos_[i].cur_device_data == nullptr) {
|
||||
// when run on device, cur_device_data is nullptr before first execute
|
||||
continue;
|
||||
}
|
||||
auto ret = aclrtMemcpy(outputs[i]->GetData()->addr, outputs[i]->GetData()->size, output_infos_[i].cur_device_data,
|
||||
output_infos_[i].buffer_size, kind);
|
||||
if (ret != ACL_ERROR_NONE) {
|
||||
MS_LOG(ERROR) << "Memcpy input " << i << " from " << (is_run_on_device_ ? "host" : "device")
|
||||
<< " to host failed, memory size " << output_infos_[i].buffer_size;
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
}
|
||||
return lite::RET_OK;
|
||||
}
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,104 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_MODEL_PROCESS_H_
|
||||
#define MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_MODEL_PROCESS_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include "acl/acl.h"
|
||||
#include "acl/acl_mdl.h"
|
||||
#include "acl/acl_rt.h"
|
||||
#include "include/api/types.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "kernel/kernel.h"
|
||||
#include "extendrt/kernel/ascend/options/acl_model_options.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
using mindspore::lite::STATUS;
|
||||
struct AclTensorInfo {
|
||||
void *cur_device_data;
|
||||
void *device_data;
|
||||
size_t buffer_size;
|
||||
aclDataType data_type;
|
||||
std::vector<int64_t> dims;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
class ModelProcess {
|
||||
public:
|
||||
explicit ModelProcess(const AclModelOptionsPtr &options)
|
||||
: options_(options),
|
||||
model_id_(0xffffffff),
|
||||
is_run_on_device_(false),
|
||||
model_desc_(nullptr),
|
||||
inputs_(nullptr),
|
||||
outputs_(nullptr),
|
||||
input_infos_(),
|
||||
output_infos_() {}
|
||||
~ModelProcess() {}
|
||||
|
||||
STATUS UnLoad();
|
||||
STATUS PredictFromHost(const std::vector<KernelTensorPtr> &inputs, const std::vector<KernelTensorPtr> &outputs);
|
||||
STATUS PreInitModelResource();
|
||||
|
||||
// override this method to avoid request/reply data copy
|
||||
void SetIsDevice(bool is_device) { is_run_on_device_ = is_device; }
|
||||
|
||||
void set_model_id(uint32_t model_id) { model_id_ = model_id; }
|
||||
uint32_t model_id() const { return model_id_; }
|
||||
std::set<uint64_t> GetDynamicBatch();
|
||||
std::set<std::pair<uint64_t, uint64_t>> GetDynamicImage();
|
||||
|
||||
private:
|
||||
STATUS CreateDataBuffer(void **data_mem_buffer, size_t buffer_size, aclmdlDataset *dataset);
|
||||
STATUS CheckAndInitInput(const std::vector<KernelTensorPtr> &inputs);
|
||||
STATUS CheckTensorByTensorInfo(const std::vector<KernelTensorPtr> &tensor,
|
||||
const std::vector<AclTensorInfo> &tensor_info);
|
||||
STATUS GetOutputs(const std::vector<KernelTensorPtr> &outputs);
|
||||
STATUS ConstructTensor(const std::vector<KernelTensorPtr> &outputs);
|
||||
STATUS SetBatchSize(const std::vector<KernelTensorPtr> &inputs);
|
||||
STATUS SetImageSize(const std::vector<KernelTensorPtr> &inputs);
|
||||
STATUS InitInputsBuffer();
|
||||
STATUS InitOutputsBuffer();
|
||||
STATUS ResetOutputSize();
|
||||
STATUS ProcDynamicShape(const std::vector<KernelTensorPtr> &inputs);
|
||||
std::string VectorToString(const std::vector<int64_t> &);
|
||||
bool IsDynamicShape();
|
||||
bool IsDynamicBatchSize();
|
||||
bool IsDynamicImageSize();
|
||||
void DestroyInputsDataset();
|
||||
void DestroyInputsDataMem();
|
||||
void DestroyInputsBuffer();
|
||||
void DestroyOutputsBuffer();
|
||||
|
||||
AclModelOptionsPtr options_;
|
||||
uint32_t model_id_;
|
||||
// if run one device(AICPU), there is no need to alloc device memory and copy inputs to(/outputs from) device
|
||||
bool is_run_on_device_;
|
||||
aclmdlDesc *model_desc_;
|
||||
aclmdlDataset *inputs_;
|
||||
aclmdlDataset *outputs_;
|
||||
std::vector<AclTensorInfo> input_infos_;
|
||||
std::vector<AclTensorInfo> output_infos_;
|
||||
};
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
||||
#endif // MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_MODEL_MODEL_PROCESS_H_
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_SRC_ACL_MODEL_OPTIONS_H_
|
||||
#define MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_SRC_ACL_MODEL_OPTIONS_H_
|
||||
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
struct AclModelOptions {
|
||||
int32_t device_id;
|
||||
std::string dump_cfg_path;
|
||||
std::set<uint64_t> batch_size;
|
||||
std::set<std::pair<uint64_t, uint64_t>> image_size;
|
||||
|
||||
AclModelOptions() : device_id(0) {}
|
||||
};
|
||||
|
||||
using AclModelOptionsPtr = std::shared_ptr<AclModelOptions>;
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
||||
#endif // MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_SRC_ACL_MODEL_OPTIONS_H_
|
|
@ -0,0 +1,80 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "extendrt/kernel/ascend/options/acl_options_parser.h"
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "common/log_adapter.h"
|
||||
#include "src/common/log_util.h"
|
||||
#include "src/common/utils.h"
|
||||
#include "acl/acl_base.h"
|
||||
#include "acl/acl_rt.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
constexpr auto kImageHwNum = 2;
|
||||
|
||||
STATUS AclOptionsParser::ParseAclOptions(const mindspore::Context *ctx, AclModelOptionsPtr *const acl_options) {
|
||||
CHECK_NULL_RETURN(ctx);
|
||||
CHECK_NULL_RETURN(acl_options);
|
||||
|
||||
auto context = const_cast<mindspore::Context *>(ctx);
|
||||
CHECK_NULL_RETURN(context);
|
||||
auto device_infos = context->MutableDeviceInfo();
|
||||
if (device_infos.size() < 1) {
|
||||
MS_LOG(WARNING) << "Context is not set device info, please check.";
|
||||
return lite::RET_OK;
|
||||
}
|
||||
CHECK_NULL_RETURN(device_infos[0]);
|
||||
if (ParseOptions(device_infos[0], acl_options) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Parse model options failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
STATUS AclOptionsParser::ParseOptions(const std::shared_ptr<DeviceInfoContext> &device_info,
|
||||
AclModelOptions *acl_options) {
|
||||
auto ascend_info = device_info->Cast<mindspore::AscendDeviceInfo>();
|
||||
if (ascend_info == nullptr) {
|
||||
MS_LOG(ERROR) << "There is no ascend info.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
int32_t device_id = static_cast<int32_t>(ascend_info->GetDeviceID());
|
||||
if (CheckDeviceId(&device_id) != lite::RET_OK) {
|
||||
MS_LOG(ERROR) << "Check device id failed, device id = " << device_id;
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
acl_options->device_id = device_id;
|
||||
return lite::RET_OK;
|
||||
}
|
||||
|
||||
STATUS AclOptionsParser::CheckDeviceId(int32_t *device_id) {
|
||||
CHECK_NULL_RETURN(device_id);
|
||||
uint32_t device_count;
|
||||
if (aclrtGetDeviceCount(&device_count) != ACL_ERROR_NONE) {
|
||||
MS_LOG(WARNING) << "Get device count failed.";
|
||||
return lite::RET_OK;
|
||||
}
|
||||
if (*device_id >= static_cast<int32_t>(device_count)) {
|
||||
MS_LOG(ERROR) << "Current device id " << *device_id << " is larger than max count " << device_count
|
||||
<< ",please check the device info of context.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
return lite::RET_OK;
|
||||
}
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_SRC_ACL_OPTIONS_PARSER_H_
|
||||
#define MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_SRC_ACL_OPTIONS_PARSER_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "include/api/context.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "extendrt/kernel/ascend/options/acl_model_options.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace acl {
|
||||
using mindspore::lite::STATUS;
|
||||
|
||||
class AclOptionsParser {
|
||||
public:
|
||||
STATUS ParseAclOptions(const mindspore::Context *ctx, AclModelOptionsPtr *const acl_options);
|
||||
|
||||
private:
|
||||
STATUS ParseOptions(const std::shared_ptr<DeviceInfoContext> &device_info, AclModelOptions *acl_options);
|
||||
STATUS CheckDeviceId(int32_t *device_id);
|
||||
};
|
||||
} // namespace acl
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_EXTENDRT_KERNEL_ASCEND_SRC_ACL_OPTIONS_PARSER_H_
|
|
@ -52,16 +52,19 @@ Status SingleOpInferSession::CompileGraph(FuncGraphPtr graph) {
|
|||
for (const auto &kernel_node : kernel_nodes) {
|
||||
mindspore::infer::SetKernelInfo(kernel_node);
|
||||
std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
std::shared_ptr<kernel::CpuKernelMod> cpu_kernel_mod =
|
||||
kernel::Factory<kernel::CpuKernelMod>::Instance().Create(kernel_name);
|
||||
std::shared_ptr<kernel::KernelMod> kernel_mod = kernel::Factory<kernel::KernelMod>::Instance().Create(kernel_name);
|
||||
MS_LOG(INFO) << "SingleOpInferSession::Kernels " << kernel_name;
|
||||
auto args = kernel::AbstractArgsFromCNode(kernel_node);
|
||||
auto ret = cpu_kernel_mod->Init(args.op, args.inputs, args.outputs);
|
||||
if (kernel_mod == nullptr) {
|
||||
MS_LOG(EXCEPTION) << "Kernel mod is nullptr, kernel name: " << kernel_name;
|
||||
}
|
||||
mindspore::infer::CopyInputWeights(kernel_node, args.inputs);
|
||||
auto ret = kernel_mod->Init(args.op, args.inputs, args.outputs);
|
||||
MS_LOG(INFO) << "SingleOpInferSession::Kernels ret " << ret;
|
||||
if (!ret) {
|
||||
MS_LOG(EXCEPTION) << "kernel init failed " << kernel_name;
|
||||
}
|
||||
if (cpu_kernel_mod->Resize(args.op, args.inputs, args.outputs, kernel::GetKernelDepends(kernel_node)) ==
|
||||
if (kernel_mod->Resize(args.op, args.inputs, args.outputs, kernel::GetKernelDepends(kernel_node)) ==
|
||||
kernel::KRET_RESIZE_FAILED) {
|
||||
MS_LOG(EXCEPTION) << "CPU kernel op [" << kernel_node->fullname_with_scope() << "] Resize failed.";
|
||||
}
|
||||
|
@ -90,10 +93,10 @@ Status SingleOpInferSession::CompileGraph(FuncGraphPtr graph) {
|
|||
tensor_size = std::max(tensor_size, type_size);
|
||||
(void)output_size_list.emplace_back(tensor_size);
|
||||
}
|
||||
cpu_kernel_mod->SetInputSizeList(input_size_list);
|
||||
cpu_kernel_mod->SetOutputSizeList(output_size_list);
|
||||
kernel_mod->SetInputSizeList(input_size_list);
|
||||
kernel_mod->SetOutputSizeList(output_size_list);
|
||||
|
||||
AnfAlgo::SetKernelMod(cpu_kernel_mod, kernel_node.get());
|
||||
AnfAlgo::SetKernelMod(kernel_mod, kernel_node.get());
|
||||
}
|
||||
|
||||
this->AssignKernelGraphAddress(kernel_graph_);
|
||||
|
@ -284,9 +287,29 @@ device::DeviceAddressPtr SingleOpInferSession::CreateDeviceAddress(void *device_
|
|||
return std::make_shared<InferDeviceAddress>(device_ptr, device_size, format, type_id);
|
||||
}
|
||||
|
||||
std::vector<AnfNodePtr> SingleOpInferSession::GetGraphDataInputs() const {
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph_);
|
||||
std::vector<AnfNodePtr> data_inputs;
|
||||
auto inputs = kernel_graph_->inputs();
|
||||
for (auto input : inputs) {
|
||||
if (input->isa<Parameter>()) {
|
||||
auto parameter = input->cast<ParameterPtr>();
|
||||
if (parameter != nullptr && !parameter->has_default()) {
|
||||
data_inputs.push_back(input);
|
||||
}
|
||||
}
|
||||
}
|
||||
return data_inputs;
|
||||
}
|
||||
|
||||
void SingleOpInferSession::CopyInputs(const std::vector<tensor::TensorPtr> inputs) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph_);
|
||||
auto graph_inputs = kernel_graph_->inputs();
|
||||
auto graph_inputs = GetGraphDataInputs();
|
||||
if (graph_inputs.size() != inputs.size()) {
|
||||
MS_LOG(ERROR) << "Graph inputs size[" << graph_inputs.size() << "] is not equal to User input size[ "
|
||||
<< inputs.size() << "].";
|
||||
return;
|
||||
}
|
||||
for (size_t i = 0; i < graph_inputs.size(); i++) {
|
||||
auto input = inputs[i];
|
||||
auto graph_input = graph_inputs[i];
|
||||
|
|
|
@ -49,6 +49,7 @@ class SingleOpInferSession : public InferSession {
|
|||
device::DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
|
||||
TypeId type_id) const;
|
||||
void CopyInputs(const std::vector<tensor::TensorPtr> inputs);
|
||||
std::vector<AnfNodePtr> GetGraphDataInputs() const;
|
||||
void CopyOutputs(std::vector<tensor::TensorPtr> *outputs);
|
||||
|
||||
private:
|
||||
|
|
|
@ -35,6 +35,9 @@ using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
|
|||
using mindspore::kernel::KernelBuildInfo;
|
||||
namespace {
|
||||
constexpr auto kParamDynamic = "dynamic";
|
||||
constexpr auto kCustomAscendInputNum = 3;
|
||||
constexpr auto kNameCustomAscend = "CustomAscend";
|
||||
constexpr auto kCustomTypeAscend = "acl_build";
|
||||
|
||||
bool IsInputNotCNode(const CNodePtr &kernel_node, size_t input_index) {
|
||||
auto input_node = common::AnfAlgo::VisitKernel(kernel_node->input(input_index + 1), 0).first;
|
||||
|
@ -326,8 +329,9 @@ void UpdateCustomKernelBuildInfo(const CNodePtr &kernel_node, bool is_akg_op) {
|
|||
GetOutputFormat(kernel_node, &output_formats);
|
||||
builder->SetOutputsDeviceType(output_types);
|
||||
builder->SetOutputsFormat(output_formats);
|
||||
// AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get());
|
||||
|
||||
if (op_name == kNameCustomAscend) {
|
||||
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get());
|
||||
}
|
||||
// check reg info if kernel_attr is not null
|
||||
if (kernel_attr != nullptr) {
|
||||
std::vector<std::shared_ptr<KernelBuildInfo>> kernel_info_list;
|
||||
|
@ -465,6 +469,10 @@ std::pair<std::string, ExceptionType> SetKernelInfoWithMsg(const CNodePtr &kerne
|
|||
UpdateCustomKernelBuildInfo(kernel_node, true);
|
||||
return {};
|
||||
}
|
||||
if (tp == kCustomTypeAscend) {
|
||||
UpdateCustomKernelBuildInfo(kernel_node, false);
|
||||
return {};
|
||||
}
|
||||
// If Custom op has not set reg info, then infer info from inputs
|
||||
if (mindspore::kernel::OpLib::FindOp(op_name, kernel::OpImplyType::kCPU) == nullptr) {
|
||||
MS_LOG(WARNING) << "Not find operator information for Custom operator[" << op_name << "]. "
|
||||
|
@ -535,5 +543,49 @@ void SetKernelInfo(const CNodePtr &kernel_node) {
|
|||
if (msg.empty()) return;
|
||||
MS_EXCEPTION(etype) << msg;
|
||||
}
|
||||
|
||||
void CopyInputWeights(const CNodePtr &kernel_node, const std::vector<kernel::KernelTensorPtr> &inputs) {
|
||||
std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
if (kernel_name == kNameCustomAscend) {
|
||||
auto node_input_size = kernel_node->inputs().size();
|
||||
if (node_input_size < kCustomAscendInputNum) {
|
||||
MS_LOG(ERROR) << "Input num of custom ascend kernel should larger than " << (kCustomAscendInputNum - 1)
|
||||
<< ", real num is " << node_input_size;
|
||||
return;
|
||||
}
|
||||
if (node_input_size != inputs.size() + 1) {
|
||||
MS_LOG(ERROR) << "Input num of custom ascend kernel [" << node_input_size << "]"
|
||||
<< " is not equal to kernel tensor size[" << (inputs.size() + 1) << "].";
|
||||
return;
|
||||
}
|
||||
auto om_input = kernel_node->input(node_input_size - 1);
|
||||
if (!om_input->isa<Parameter>()) {
|
||||
MS_LOG(ERROR) << "Om input is not parameter.";
|
||||
return;
|
||||
}
|
||||
ParameterPtr om_param = om_input->cast<ParameterPtr>();
|
||||
if (om_param == nullptr || !om_param->has_default()) {
|
||||
MS_LOG(ERROR) << "Om param is invalid, val= " << om_param;
|
||||
return;
|
||||
}
|
||||
auto tensor = std::static_pointer_cast<tensor::Tensor>(om_param->default_param());
|
||||
if (tensor == nullptr) {
|
||||
MS_LOG(ERROR) << "Tensor is nullptr.";
|
||||
return;
|
||||
}
|
||||
if (tensor->data_c() == nullptr || tensor->Size() == 0) {
|
||||
MS_LOG(ERROR) << "Tensor data is invalid.";
|
||||
return;
|
||||
}
|
||||
auto new_addr = malloc(tensor->Size());
|
||||
if (new_addr == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc failed, size= " << tensor->Size();
|
||||
return;
|
||||
}
|
||||
memcpy(new_addr, tensor->data_c(), tensor->Size());
|
||||
kernel::AddressPtr addr_ptr = std::make_shared<kernel::Address>(new_addr, tensor->Size());
|
||||
inputs[inputs.size() - 1]->SetData(addr_ptr);
|
||||
}
|
||||
}
|
||||
} // namespace infer
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -26,11 +26,13 @@
|
|||
#include "ir/anf.h"
|
||||
#include "ir/dtype/type.h"
|
||||
#include "include/common/utils/utils.h"
|
||||
#include "mindspore/ccsrc/kernel/kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace infer {
|
||||
using DataType = std::pair<TypeId, std::string>;
|
||||
void SetKernelInfo(const CNodePtr &apply_kernel_ptr);
|
||||
void CopyInputWeights(const CNodePtr &kernel_node, const std::vector<kernel::KernelTensorPtr> &inputs);
|
||||
} // namespace infer
|
||||
} // namespace mindspore
|
||||
|
||||
|
|
|
@ -15,7 +15,11 @@
|
|||
*/
|
||||
|
||||
#include "tools/converter/adapter/acl/src/acl_model_process.h"
|
||||
#ifdef ENABLE_CLOUD_FUSION_INFERENCE
|
||||
#include "src/extendrt/kernel/ascend/model/acl_env_guard.h"
|
||||
#else
|
||||
#include "src/runtime/kernel/ascend/src/acl_env_guard.h"
|
||||
#endif
|
||||
#include "src/common/log_util.h"
|
||||
#include "acl/acl.h"
|
||||
#include "acl/acl_rt.h"
|
||||
|
|
|
@ -52,6 +52,8 @@ constexpr auto kInferShapePass = "InferShapePass";
|
|||
constexpr auto kConstFoldPass = "ConstFoldPass";
|
||||
constexpr auto kRemoveRedundantOpPass = "RemoveRedundantOpPass";
|
||||
constexpr auto kDelRedundantTranspose = "DeleteRedundantTranspose";
|
||||
constexpr auto kFuncType = "func_type";
|
||||
constexpr auto kUniqueName = "uniq_name";
|
||||
constexpr size_t kDependInputNum = 3;
|
||||
constexpr size_t kDependFirstInputIdx = 1;
|
||||
constexpr size_t kTupleGetItemFirstInputIdx = 1;
|
||||
|
@ -155,6 +157,10 @@ STATUS AclPassImpl::PreProcGraph(const FuncGraphPtr &func_graph) {
|
|||
}
|
||||
|
||||
STATUS AclPassImpl::PostProcGraph(const FuncGraphPtr &func_graph) {
|
||||
if (lite::acl::DelRedundantParameter(func_graph) != RET_SUCCESS) {
|
||||
MS_LOG(ERROR) << "Delete redundant parameters failed.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
if (!user_options_cfg_.offline) {
|
||||
MS_LOG(DEBUG) << "Online model infer no need to change to nhwc format.";
|
||||
return lite::RET_OK;
|
||||
|
@ -548,6 +554,8 @@ void AclPassImpl::SetCustomAttrs(const std::shared_ptr<ops::Custom> &prim) {
|
|||
std::vector<uint8_t> output_dim_char(output_dim_str.begin(), output_dim_str.end());
|
||||
std::map<std::string, std::vector<uint8_t>> attrs = {{lite::acl::kOutputShapes, output_dim_char}};
|
||||
prim->set_attr(attrs);
|
||||
prim->AddAttr(kFuncType, api::MakeValue<std::string>("acl_build"));
|
||||
prim->AddAttr(kUniqueName, api::MakeValue<std::string>("CustomAscend"));
|
||||
}
|
||||
|
||||
CNodePtr AclPassImpl::CreateCustomNode(const FuncGraphPtr &func_graph) {
|
||||
|
|
|
@ -136,6 +136,9 @@ STATUS DeleteRedundantTranspose::TransTransFusion(const FuncGraphPtr &func_graph
|
|||
if (!manager_->Replace(cnode, pre_cnode->input(1))) {
|
||||
MS_LOG(ERROR) << "replace old node failed, please check.";
|
||||
return lite::RET_ERROR;
|
||||
} else {
|
||||
func_graph->DropNode(cnode->input(kInputIndexTwo));
|
||||
func_graph->DropNode(pre_cnode->input(kInputIndexTwo));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue